const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
const path = require('path');

/**
 * Plain data holder for a scraped artist.
 */
class Artist {
  /**
   * @param {object} [data] - Own enumerable properties are copied onto the
   *   instance and override the defaults set below.
   */
  constructor(data) {
    this.name = '';
    this.url = '';
    this.biography = '';
    this.discography = [];
    if (data) {
      Object.assign(this, data);
    }
  }
}

/**
 * Plain data holder for a scraped record (album).
 */
class Record {
  /**
   * @param {object} [data] - Own enumerable properties are copied onto the
   *   instance and override the defaults set below.
   */
  constructor(data) {
    this.artist = '';
    this.title = '';
    this.label = '';
    this.url = '';
    this.rating = 0;
    this.year = '';
    this.genre = '';
    this.text = '';
    this.styles = [];
    this.tracks = [];
    if (data) {
      Object.assign(this, data);
    }
  }
}

/**
 * Scrapes a seed artist and its related artists (biography, discography and
 * per-record details) and writes the result as text files into a timestamped
 * directory next to this script.
 */
class MusicScraper {
  constructor() {
    /* Maximum number of artists to fetch in one run (seed artist included). */
    this.artistsToGet = 5;
    /* Names of artists whose scrape has been started; used for de-duplication. */
    this.artistsFetched = [];
    this.relatedArtists = [];
    /* Fully scraped Artist objects, in the order they were completed. */
    this.artists = [];

    /* Create a timestamped folder name */
    this.timestampedDirectoryName = this.getTimestampedDirectoryName();

    process.on('SIGINT', () => {
      // Flush whatever has been collected so far; done() terminates the process.
      console.log('Exiting the script...');
      this.done();
    });
  }

  /**
   * Builds the output directory name from the current time.
   *
   * @returns {string} The ISO timestamp with ':' replaced by '-' and everything
   *   from the first '.' onwards removed, followed by `_data`.
   */
  getTimestampedDirectoryName() {
    const timestamp = new Date()
      .toISOString()
      .replace(/:/g, '-')
      .replace(/\..+/, '');
    return `${timestamp}_data`;
  }

  /**
   * Downloads a URL with axios.
   *
   * @param {string} url - Address to fetch.
   * @returns {Promise<*>} The response body (`response.data`), or `false` when
   *   `url` is falsy.
   * @throws {Error} When the request fails; the original error is attached as `cause`.
   */
  async fetchData(url) {
    if (!url) {
      console.log(`Can't fetch URL since it's null`);
      return false;
    }

    try {
      const response = await axios.get(url);
      return response.data;
    } catch (error) {
      throw new Error(`Error fetching data: ${error.message}`, { cause: error });
    }
  }

  /**
   * Appends a sub-page name to a base URL.
   *
   * @param {string} baseUrl - URL without trailing slash.
   * @param {string} type - Sub-page segment, e.g. `biography`.
   * @returns {string} `${baseUrl}/${type}`.
   */
  generateUrl(baseUrl, type) {
    return `${baseUrl}/${type}`;
  }

  /**
   * Normalises scraped text.
   *
   * @param {string} text - Raw text.
   * @returns {string} Text with every whitespace run collapsed to one space,
   *   trimmed, and with backslashes and double quotes removed.
   */
  cleanText(text) {
    return text
      .replace(/\s+/g, ' ')
      .trim()
      .replace(/[\\"]/g, '');
  }

  /**
   * Fetches the `related` sub-page of an artist and extracts the listed artists.
   *
   * @param {Artist} artist - Artist whose `url` is used as the base URL.
   * @returns {Promise<Artist[]>} Related artists; an empty array when the page
   *   cannot be fetched or parsed.
   */
  async getRelatedArtists(artist) {
    try {
      const html = await this.fetchData(this.generateUrl(artist.url, 'related'));
      return this.scrapeRelatedArtistData(html);
    } catch (error) {
      console.error(error);
      return [];
    }
  }

  /**
   * Fetches the `biography` sub-page of an artist.
   *
   * @param {Artist} artist - Artist whose `url` is used as the base URL.
   * @returns {Promise<string|false>} Cleaned text of `section.biography .text`,
   *   or `false` when fetching or parsing fails.
   */
  async getArtistBiography(artist) {
    try {
      const html = await this.fetchData(this.generateUrl(artist.url, 'biography'));
      const $ = cheerio.load(html);
      return this.cleanText($('section.biography .text').text());
    } catch (error) {
      console.error(`Error scraping ${artist.url}: ${error.message}`);
      return false;
    }
  }

  /**
   * Fetches the `discography` sub-page of an artist and reads one record per
   * table row (year, title, label and the link found in the title cell).
   *
   * @param {Artist} artist - Artist whose `url` is used as the base URL.
   * @returns {Promise<Record[]|false>} Records with basic data only, or `false`
   *   when fetching or parsing fails.
   */
  async getArtistDiscography(artist) {
    try {
      const html = await this.fetchData(this.generateUrl(artist.url, 'discography'));
      const $ = cheerio.load(html);
      const records = [];

      $('.discography table tbody tr').each((index, row) => {
        const cells = $(row);
        records.push(
          new Record({
            year: cells.find('.year').text().trim(),
            title: cells.find('.title').text().trim(),
            label: cells.find('.label').text().trim(),
            url: cells.find('.title a').attr('href'),
          })
        );
      });

      return records;
    } catch (error) {
      console.error(`Error scraping ${artist.url}: ${error.message}`);
      return false;
    }
  }

  /**
   * Fetches a record's own page and returns a new Record enriched with rating,
   * genre, review text and track titles.
   *
   * @param {Record} record - Record carrying at least `title` and `url`.
   * @returns {Promise<Record|false>} The enriched record, or `false` when the
   *   record has no URL or the page cannot be fetched or parsed.
   */
  async getSingleRecordData(record) {
    try {
      console.log(`Getting record ${record.title}`);
      const html = await this.fetchData(record.url);
      if (!html) {
        // fetchData() already logged the missing URL; nothing to parse.
        return false;
      }

      const $ = cheerio.load(html);
      const tracks = [];
      $('.track-listing table tbody tr').each((index, row) => {
        tracks.push($(row).find('.title a').text().trim());
      });

      return new Record({
        title: record.title,
        year: record.year,
        label: record.label,
        url: record.url,
        rating: this.cleanText($('.allmusic-rating').text()),
        genre: this.cleanText($('.basic-info .genre div a').text()),
        text: this.cleanText($('section.review .text').text()),
        tracks,
      });
    } catch (error) {
      // Report the URL of the record being scraped, not a property of the class.
      console.error(`Error scraping ${record.url}: ${error.message}`);
      return false;
    }
  }

  /**
   * Extracts related artists from the HTML of a `related` page.
   *
   * @param {string} html - Page markup.
   * @returns {Artist[]} One Artist per `.related.similars ul li` element that
   *   has both a non-empty text and a link.
   */
  scrapeRelatedArtistData(html) {
    const $ = cheerio.load(html);
    const relatedArtists = [];

    $('.related.similars ul li').each((index, element) => {
      const item = $(element);
      const artist = new Artist({
        name: item.text().trim(),
        url: item.find('a').attr('href'),
      });
      if (artist.name && artist.url) {
        relatedArtists.push(artist);
      }
    });

    return relatedArtists;
  }

  /**
   * @returns {boolean} True once `artistsToGet` artists have been started.
   */
  hasReachedArtistLimit() {
    return this.artistsFetched.length >= this.artistsToGet;
  }

  /**
   * Entry point: scrapes the seed artist, then its related artists until the
   * limit is reached, and finally writes everything to disk via done().
   *
   * @returns {Promise<void>}
   */
  async run() {
    console.log(' ');
    console.log(' ');
    console.log(' ');
    console.log(' ');
    console.log('------------------------');

    const initialArtist = new Artist({
      name: 'The Abyssinians',
      url: 'https://www.allmusic.com/artist/the-abyssinians-mn0000588943',
    });

    const initialData = await this.getArtistData(initialArtist);
    if (initialData) {
      this.artists.push(initialData);
    }

    const relatedArtists = await this.getRelatedArtists(initialArtist);
    if (relatedArtists) {
      this.relatedArtists = this.relatedArtists.concat(relatedArtists);
    }

    for (const artist of this.relatedArtists) {
      if (this.hasReachedArtistLimit()) {
        console.log(`Reached the limit of artists to fetch.`);
        break;
      }

      const data = await this.getArtistData(artist);
      // A skipped or failed artist must not abort the whole run.
      if (data) {
        this.artists.push(data);
      }
    }

    this.done();
  }

  /**
   * Scrapes biography and full discography for one artist and stores them on
   * the passed-in object.
   *
   * @param {Artist} artist - Artist with `name` and `url` set; it is mutated.
   * @returns {Promise<Artist|false>} The same artist object, or `false` when
   *   the artist is invalid, was already fetched, or the limit is reached.
   */
  async getArtistData(artist) {
    if (!artist || !artist.name || !artist.url) {
      return false;
    }

    /* If the artist has already been fetched, return */
    if (this.artistsFetched.includes(artist.name)) {
      console.log(`Artist already fetched.`);
      return false;
    }

    /* If we've reached the maximum number of artists to fetch, return */
    if (this.hasReachedArtistLimit()) {
      console.log(`Reached the limit of artists to fetch.`);
      return false;
    }

    this.artistsFetched.push(artist.name);

    const biography = await this.getArtistBiography(artist);
    const discography = await this.getArtistDiscography(artist);

    if (biography) {
      artist.biography = biography;
    }

    if (discography) {
      /* Loop over all discs to get the full discography */
      const records = [];
      for (const record of discography) {
        console.log(record);
        const fullRecord = await this.getSingleRecordData(record);
        // Keep the basic listing data when the detail page could not be read,
        // so the discography never contains `false` entries.
        records.push(fullRecord || record);
      }
      artist.discography = records;
    }

    return artist;
  }

  /**
   * Logs the collected data, writes it to disk and terminates the process.
   */
  done() {
    console.log(this.artists);
    // The list is empty when the run is interrupted before the first artist finished.
    if (this.artists.length > 0) {
      console.log(this.artists[0].discography);
    }
    this.writeToDisk();
    process.exit(0); // Exit gracefully
  }

  /**
   * Strips characters that are not ASCII letters, digits, spaces or en dashes.
   *
   * @param {string} string - Input text.
   * @returns {string} The filtered text.
   */
  removeSpecialCharacters(string) {
    return string.replace(/[^a-zA-Z0-9– ]/g, '');
  }

  /**
   * Builds the contents of a review file for one record.
   *
   * @param {Artist} artist - Owner of the record.
   * @param {Record} record - Record to describe.
   * @returns {string} Header lines, optional rating, track listing and
   *   optional review text, terminated by a newline.
   */
  buildReviewText(artist, record) {
    const lines = [
      `Review of ${record.title} by ${artist.name}`,
      `Artist: ${artist.name}`,
      `Album title: ${record.title}`,
      `Release year: ${record.year}`,
      `Label: ${record.label}`,
      `Genre: ${record.genre}`,
    ];

    if (record.rating) {
      lines.push(`Rating: ${record.rating} out of 10`);
    }

    lines.push('', 'Track listing:');
    for (const track of record.tracks || []) {
      lines.push(track);
    }

    if (record.text) {
      lines.push('', `Review: ${record.text}`);
    }

    return `${lines.join('\n')}\n`;
  }

  /**
   * Writes one text file and logs the outcome; a failure is reported but does
   * not stop the remaining files from being written.
   *
   * @param {string} directory - Absolute target directory.
   * @param {string} fileName - File name inside `directory`.
   * @param {string} contents - Text to write.
   */
  writeTextFile(directory, fileName, contents) {
    try {
      fs.writeFileSync(path.join(directory, fileName), contents);
      console.log(`Data written to ${fileName}`);
    } catch (err) {
      console.error(`Error writing ${fileName}:`, err);
    }
  }

  /**
   * Writes one biography file per artist and one review file per record into
   * the timestamped directory located next to this script.
   */
  writeToDisk() {
    const outputDirectory = path.join(__dirname, this.timestampedDirectoryName);

    // Create the directory synchronously: the files below are written
    // synchronously and the process exits right after this method returns.
    try {
      fs.mkdirSync(outputDirectory, { recursive: true });
      console.log('Folder created successfully');
    } catch (err) {
      console.error('Error creating folder:', err);
      return;
    }

    for (const artist of this.artists) {
      /* Generate artist bio file */
      const biographyFileName = `${this.removeSpecialCharacters(artist.name)} biography.txt`;
      const bioText = `${artist.name} biography\n${artist.biography}`;
      this.writeTextFile(outputDirectory, biographyFileName, bioText);

      /* Generate album reviews */
      for (const record of artist.discography) {
        const artistAndTitle = this.removeSpecialCharacters(`${artist.name} – ${record.title}`);
        this.writeTextFile(
          outputDirectory,
          `${artistAndTitle} review.txt`,
          this.buildReviewText(artist, record)
        );
      }
    }
  }
}

const scraper = new MusicScraper();
scraper.run().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});