/*
 * nk-test / newScrapeAllMusic.js
 * Uploaded by asylwan using huggingface_hub (commit c51674d, 8.65 kB).
 * Hugging Face file-viewer residue kept for reference:
 * "raw", "history blame contribute delete", "No virus".
 */
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
const path = require('path');
/**
 * Plain data holder for one artist.
 *
 * Every instance starts with empty defaults; any own enumerable property of
 * `data` then overrides (or adds to) those defaults.
 */
class Artist {
  /**
   * @param {object} [data] - Optional initial values (e.g. `name`, `url`,
   *   `biography`, `discography`). Inherited properties are ignored.
   */
  constructor(data) {
    this.name = '';
    this.url = '';
    this.biography = '';
    this.discography = [];
    if (data) {
      // Object.keys lists own enumerable keys only and, unlike calling
      // data.hasOwnProperty(), also works for prototype-less objects
      // (Object.create(null)) or data that carries its own `hasOwnProperty` key.
      for (const prop of Object.keys(data)) {
        this[prop] = data[prop];
      }
    }
  }
}
/**
 * Plain data holder for one record (album).
 *
 * Every instance starts with empty defaults; any own enumerable property of
 * `data` then overrides (or adds to) those defaults.
 */
class Record {
  /**
   * @param {object} [data] - Optional initial values (e.g. `title`, `year`,
   *   `label`, `url`, `rating`, `genre`, `text`, `tracks`). Inherited
   *   properties are ignored.
   */
  constructor(data) {
    this.artist = '';
    this.title = '';
    this.label = '';
    this.url = '';
    this.rating = 0;
    this.year = '';
    this.genre = '';
    this.text = '';
    this.styles = [];
    this.tracks = [];
    if (data) {
      // Object.keys lists own enumerable keys only and, unlike calling
      // data.hasOwnProperty(), also works for prototype-less objects
      // (Object.create(null)) or data that carries its own `hasOwnProperty` key.
      for (const prop of Object.keys(data)) {
        this[prop] = data[prop];
      }
    }
  }
}
/**
 * Scrapes artist pages (biography, discography, individual records and
 * related artists) starting from a seed artist on allmusic.com, then writes
 * one text file per biography and per record review into a timestamped
 * directory next to this script.
 */
class MusicScraper {
  constructor() {
    /* Maximum number of artists to fetch before finishing. */
    this.artistsToGet = 5;
    /* Names of artists already fetched, used to avoid duplicates. */
    this.artistsFetched = [];
    this.relatedArtists = [];
    this.artists = [];
    /* Create a timestamped folder name */
    this.timestampedDirectoryName = this.getTimestampedDirectoryName();
    process.on('SIGINT', () => {
      // Persist whatever has been scraped so far before exiting.
      console.log('Exiting the script...');
      this.done();
      process.exit(0); // Exit gracefully
    });
  }

  /**
   * Builds the output directory name from the current time.
   * @returns {string} e.g. `2024-01-31T12-34-56_data` (ISO timestamp with
   *   colons replaced by dashes and the fractional seconds removed).
   */
  getTimestampedDirectoryName() {
    const timestamp = new Date().toISOString().replace(/:/g, '-').replace(/\..+/, '');
    return `${timestamp}_data`;
  }

  /**
   * Fetches a URL with axios.
   * @param {string} url - Address to fetch.
   * @returns {Promise<*|false>} The response body, or `false` when `url` is falsy.
   * @throws {Error} When the request fails; the original error is kept as `cause`.
   */
  async fetchData(url) {
    if (!url) {
      console.log(`Can't fetch URL since it's null`);
      return false;
    }
    try {
      const response = await axios.get(url);
      return response.data;
    } catch (error) {
      throw new Error(`Error fetching data: ${error.message}`, { cause: error });
    }
  }

  /**
   * Appends a sub-page name to a base URL.
   * @param {string} baseUrl
   * @param {string} type - Sub-page name, e.g. `biography`.
   * @returns {string} `${baseUrl}/${type}`
   */
  generateUrl(baseUrl, type) {
    return `${baseUrl}/${type}`;
  }

  /**
   * Normalizes scraped text.
   * @param {string} text
   * @returns {string} Text with whitespace runs collapsed to one space,
   *   trimmed, and with backslashes and double quotes removed.
   */
  cleanText(text) {
    // Replace consecutive spaces and newlines with a single space
    const collapsed = text.replace(/\s+/g, ' ').trim();
    return collapsed.replace(/[\\"]/g, '');
  }

  /**
   * Fetches and parses the "related" page of an artist.
   * @param {Artist} artist - Artist whose `url` is used as the base URL.
   * @returns {Promise<Artist[]|false>} Related artists, or `false` on failure.
   */
  async getRelatedArtists(artist) {
    try {
      const html = await this.fetchData(this.generateUrl(artist.url, 'related'));
      if (!html) {
        return false;
      }
      return this.scrapeRelatedArtistData(html);
    } catch (error) {
      console.error(error);
      // Same failure value as the sibling getters.
      return false;
    }
  }

  /**
   * Fetches the "biography" page of an artist and extracts its text.
   * @param {Artist} artist - Artist whose `url` is used as the base URL.
   * @returns {Promise<string|false>} Cleaned biography text, or `false` on failure.
   */
  async getArtistBiography(artist) {
    try {
      const html = await this.fetchData(this.generateUrl(artist.url, 'biography'));
      if (!html) {
        return false;
      }
      const $ = cheerio.load(html);
      return this.cleanText($('section.biography .text').text());
    } catch (error) {
      console.error(`Error scraping ${artist.url}: ${error.message}`);
      return false;
    }
  }

  /**
   * Fetches the "discography" page of an artist and builds one Record per
   * table row (year, title, label and record URL only).
   * @param {Artist} artist - Artist whose `url` is used as the base URL.
   * @returns {Promise<Record[]|false>} Listed records, or `false` on failure.
   */
  async getArtistDiscography(artist) {
    try {
      const html = await this.fetchData(this.generateUrl(artist.url, 'discography'));
      if (!html) {
        return false;
      }
      const $ = cheerio.load(html);
      const records = [];
      $('.discography table tbody tr').each((index, row) => {
        const $row = $(row);
        records.push(
          new Record({
            year: $row.find('.year').text().trim(),
            title: $row.find('.title').text().trim(),
            label: $row.find('.label').text().trim(),
            url: $row.find('.title a').attr('href'),
          })
        );
      });
      return records;
    } catch (error) {
      console.error(`Error scraping ${artist.url}: ${error.message}`);
      return false;
    }
  }

  /**
   * Fetches a record's own page and enriches the listing data with rating,
   * genre, review text and track titles.
   * @param {Record} record - Listing entry; `title`, `year`, `label` and `url` are read.
   * @returns {Promise<Record|false>} The full record, or `false` on failure.
   */
  async getSingleRecordData(record) {
    try {
      console.log(`Getting record ${record.title}`);
      const html = await this.fetchData(record.url);
      if (!html) {
        return false;
      }
      const $ = cheerio.load(html);
      const recordData = {
        title: record.title,
        year: record.year,
        label: record.label,
        url: record.url,
        rating: this.cleanText($('.allmusic-rating').text()),
        genre: this.cleanText($('.basic-info .genre div a').text()),
        text: this.cleanText($('section.review .text').text()),
        tracks: [],
      };
      $('.track-listing table tbody tr').each((index, row) => {
        recordData.tracks.push($(row).find('.title a').text().trim());
      });
      return new Record(recordData);
    } catch (error) {
      // Report the URL of the record being scraped (not a property of the Record class).
      console.error(`Error scraping ${record.url}: ${error.message}`);
      return false;
    }
  }

  /**
   * Extracts related artists from the HTML of a "related" page.
   * @param {string} html
   * @returns {Artist[]} Artists that have both a name and a URL.
   */
  scrapeRelatedArtistData(html) {
    const $ = cheerio.load(html);
    const relatedArtists = [];
    $('.related.similars ul li').each((index, element) => {
      const artist = new Artist({
        name: $(element).text().trim(),
        url: $(element).find('a').attr('href'),
      });
      if (artist.name && artist.url) {
        relatedArtists.push(artist);
      }
    });
    return relatedArtists;
  }

  /**
   * Entry point: scrapes the seed artist, then each of its related artists,
   * and finally writes everything to disk via `done()`.
   * @returns {Promise<void>}
   */
  async run() {
    console.log(' ');
    console.log(' ');
    console.log(' ');
    console.log(' ');
    console.log('------------------------');
    const initialArtist = new Artist({
      name: 'The Abyssinians',
      url: 'https://www.allmusic.com/artist/the-abyssinians-mn0000588943',
    });
    const initialData = await this.getArtistData(initialArtist);
    if (initialData) {
      this.artists.push(initialData);
    }
    const relatedArtists = await this.getRelatedArtists(initialArtist);
    if (relatedArtists) {
      this.relatedArtists = this.relatedArtists.concat(relatedArtists);
    }
    for (const artist of this.relatedArtists) {
      const data = await this.getArtistData(artist);
      if (!data) {
        // A duplicate or invalid artist is skipped instead of ending the
        // whole crawl; reaching the artist limit still ends it inside
        // getArtistData().
        continue;
      }
      this.artists.push(data);
    }
    this.done();
  }

  /**
   * Fetches biography and full discography for one artist and stores them on
   * the passed object.
   * @param {Artist} artist - Must have `name` and `url`; is mutated in place.
   * @returns {Promise<Artist|false>} The enriched artist, or `false` when the
   *   artist is invalid or was already fetched.
   */
  async getArtistData(artist) {
    if (!artist || !artist.name || !artist.url) {
      return false;
    }
    /* If the artist has already been fetched, return */
    if (this.artistsFetched.includes(artist.name)) {
      console.log(`Artist already fetched.`);
      return false;
    }
    /* If we've reached the maximum number of artists to fetch, finish */
    if (this.artistsFetched.length >= this.artistsToGet) {
      console.log(`Reached the limit of artists to fetch.`);
      this.done();
      // done() exits the process; this only matters if the exit is intercepted.
      return false;
    }
    this.artistsFetched.push(artist.name);
    const biography = await this.getArtistBiography(artist);
    const discography = await this.getArtistDiscography(artist);
    if (biography) {
      artist.biography = biography;
    }
    if (discography) {
      /* Loop over all discs to get the full discography */
      const records = [];
      for (const record of discography) {
        console.log(record);
        const fullRecordData = await this.getSingleRecordData(record);
        // Keep the basic listing entry when the record page could not be
        // scraped, so later consumers never see a `false` entry.
        records.push(fullRecordData || record);
      }
      artist.discography = records;
    }
    return artist;
  }

  /**
   * Logs the collected data, writes it to disk and exits the process
   * (exit code 1 when writing failed, 0 otherwise).
   */
  done() {
    console.log(this.artists);
    if (this.artists.length > 0) {
      console.log(this.artists[0].discography);
    }
    let exitCode = 0;
    try {
      this.writeToDisk();
    } catch (error) {
      console.error(`Error writing data to disk: ${error.message}`);
      exitCode = 1;
    }
    process.exit(exitCode);
  }

  /**
   * Strips every character except ASCII letters, digits, spaces and the en dash.
   * @param {string} string
   * @returns {string}
   */
  removeSpecialCharacters(string) {
    return string.replace(/[^a-zA-Z0-9– ]/g, '');
  }

  /**
   * Builds the content of an artist's biography file.
   * @param {Artist} artist
   * @returns {string}
   */
  formatBiographyText(artist) {
    return `${artist.name} biography\n${artist.biography}`;
  }

  /**
   * Builds the content of a record's review file.
   * @param {Artist} artist
   * @param {Record} record
   * @returns {string}
   */
  formatReviewText(artist, record) {
    const lines = [
      `Review of ${record.title} by ${artist.name}`,
      `Artist: ${artist.name}`,
      `Album title: ${record.title}`,
      `Release year: ${record.year}`,
      `Label: ${record.label}`,
      `Genre: ${record.genre}`,
    ];
    if (record.rating) {
      lines.push(`Rating: ${record.rating} out of 10`);
    }
    lines.push('', 'Track listing:', ...record.tracks);
    if (record.text) {
      lines.push('', `Review: ${record.text}`);
    }
    return `${lines.join('\n')}\n`;
  }

  /**
   * Writes one biography file per artist and one review file per record into
   * the timestamped directory next to this script.
   * @throws {Error} When a file cannot be written.
   */
  writeToDisk() {
    const outputDirectory = path.join(__dirname, this.timestampedDirectoryName);
    try {
      // Synchronous on purpose: the directory must exist before the
      // writeFileSync calls below, and it must be the same __dirname-based
      // path the files are written to.
      fs.mkdirSync(outputDirectory, { recursive: true });
      console.log('Folder created successfully');
    } catch (err) {
      console.error('Error creating folder:', err);
      return;
    }
    for (const artist of this.artists) {
      /* Generate artist bio file */
      // Sanitized like the review file names so names such as "AC/DC" do not
      // produce an invalid path.
      const biographyFileName = `${this.removeSpecialCharacters(artist.name)} biography.txt`;
      fs.writeFileSync(path.join(outputDirectory, biographyFileName), this.formatBiographyText(artist));
      console.log(`Data written to ${biographyFileName}`);
      /* Generate album reviews */
      for (const record of artist.discography) {
        if (!record) {
          continue;
        }
        const artistAndTitle = this.removeSpecialCharacters(`${artist.name} – ${record.title}`);
        const reviewFileName = `${artistAndTitle} review.txt`;
        fs.writeFileSync(path.join(outputDirectory, reviewFileName), this.formatReviewText(artist, record));
        console.log(`Data written to ${reviewFileName}`);
      }
    }
  }
}
// Script entry point: create the scraper and start the crawl.
const scraper = new MusicScraper();
// run() is async; report a rejection instead of leaving the promise floating.
scraper.run().catch((error) => {
  console.error(`Scraper failed: ${error.message}`);
  process.exitCode = 1;
});