const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
const path = require('path');

/**
 * Crawls "related artist" pages starting from a seed artist, collects one
 * record per related artist, fetches each artist's biography page and writes
 * one JSON-formatted text file per artist into a timestamped folder next to
 * this script.
 *
 * Each record in `this.data` has the shape:
 *   { link, text, relatedLink, biographyLink, discographyLink,
 *     discographies: [], biography?: string }
 */
class MusicScraper {
  constructor() {
    // Number of related pages requested so far by scrapeUrls().
    this.relatedFetched = 0;
    // Upper bound on how many related pages scrapeUrls() will request.
    this.relatedNumberToGet = 50;
    this.urls = [];
    this.data = [];
    this.fetchedUrls = [];

    // Computed once so every file of this run lands in the same folder.
    this.timestampedDirectoryName = this.getTimestampedDirectoryName();

    // On Ctrl+C, persist whatever has been collected so far. The message is
    // logged first because done() terminates the process and never returns.
    process.on('SIGINT', () => {
      console.log('Exiting the script...');
      this.done();
    });
  }

  /**
   * Downloads a URL and returns the response body.
   *
   * @param {string} url - Address to request.
   * @returns {Promise<*>} The `data` property of the axios response.
   * @throws {Error} Wraps any request failure; the original error is kept as
   *   `cause` on runtimes that support it.
   */
  async fetchData(url) {
    console.log(`Fetching ${url}`);
    try {
      const response = await axios.get(url);
      return response.data;
    } catch (error) {
      throw new Error(`Error fetching data: ${error.message}`, { cause: error });
    }
  }

  /**
   * Extracts every related artist listed in a "related" page and appends one
   * record per artist to `this.data`. List items without a link or without
   * text are skipped.
   *
   * @param {string} html - Markup of a "related" page.
   */
  scrapeRelatedData(html) {
    const $ = cheerio.load(html);

    $('.related.similars ul li').each((index, element) => {
      const entry = $(element);
      const link = entry.find('a').attr('href');
      const text = entry.text().trim();

      if (!link || !text) {
        return;
      }

      // NOTE(review): the href is used as-is to build the sub-page URLs, so
      // this assumes the page emits absolute links - confirm.
      this.data.push({
        link,
        text,
        relatedLink: `${link}/related`,
        biographyLink: `${link}/biography`,
        discographyLink: `${link}/discography`,
        discographies: [],
      });
    });
  }

  /**
   * Fetches the "related" page of every URL in `this.urls`, up to
   * `relatedNumberToGet` requests in total, and harvests the artists found.
   * A failing URL is logged and still counts towards the limit.
   */
  async scrapeUrls() {
    for (const url of this.urls) {
      // Check before counting so exactly `relatedNumberToGet` pages are
      // requested, and stop iterating once the budget is used up.
      if (this.relatedFetched >= this.relatedNumberToGet) {
        break;
      }
      this.relatedFetched++;

      try {
        const html = await this.fetchData(url);
        this.scrapeRelatedData(html);
      } catch (error) {
        console.error(`Error scraping ${url}: ${error.message}`);
      }
    }
  }

  /**
   * Normalizes scraped text: collapses every run of whitespace (including
   * newlines) into a single space, trims the ends, and removes backslashes
   * and double quotes.
   *
   * @param {string} text - Raw text.
   * @returns {string} The cleaned text.
   */
  cleanText(text) {
    const collapsed = text.replace(/\s+/g, ' ').trim();
    return collapsed.replace(/[\\"]/g, '');
  }

  /**
   * Fetches the biography page of every collected artist and stores the
   * cleaned text on the record as `biography`. Failures are logged and the
   * loop continues with the next artist.
   */
  async scrapeBiographies() {
    for (const item of this.data) {
      console.log('Scraping biography for ' + item.text);
      try {
        const html = await this.fetchData(item.biographyLink);
        const $ = cheerio.load(html);
        item.biography = this.cleanText($('section.biography .text').text());
      } catch (error) {
        console.error(`Error scraping ${item.biographyLink}: ${error.message}`);
      }
    }
  }

  /**
   * Fetches the discography page of every collected artist and appends one
   * `{ year }` entry per table row to the record's `discographies` array.
   * Failures are logged and the loop continues with the next artist.
   */
  async scrapeDiscographies() {
    for (const item of this.data) {
      console.log('Scraping discographies for ' + item.text);
      try {
        const html = await this.fetchData(item.discographyLink);
        const $ = cheerio.load(html);

        // The row element must come from the callback parameters: an arrow
        // function has no `this` binding of its own for cheerio to set.
        $('.discography')
          .find('tr')
          .each((rowIndex, element) => {
            // .text() yields '' when the row has no `.year` cell.
            item.discographies.push({
              year: $(element).find('.year').text(),
            });
          });
      } catch (error) {
        console.error(`Error scraping ${item.discographyLink}: ${error.message}`);
      }
    }
  }

  /**
   * Entry point: seeds the crawl, follows the related pages, fetches the
   * biographies and finally writes the results and exits via done().
   * Any error that escapes the individual steps is logged.
   */
  async run() {
    try {
      const initialUrl = 'https://www.allmusic.com/artist/johnny-osbourne-mn0000248916/related';
      const initialHtml = await this.fetchData(initialUrl);
      this.scrapeRelatedData(initialHtml);

      // Snapshot the related-page URLs of the seed's artists; records added
      // while crawling are not themselves crawled.
      this.urls = this.data.map((item) => item.relatedLink);

      await this.scrapeUrls();
      await this.scrapeBiographies();
      // scrapeDiscographies() is intentionally not part of the run.

      this.done();
    } catch (error) {
      console.error(error);
    }
  }

  /**
   * Builds a file-system friendly timestamp from the current time: the ISO
   * string with ':' replaced by '-' and the fractional seconds and zone
   * suffix removed.
   *
   * @returns {string} The timestamp, e.g. `2024-01-31T12-30-45`.
   */
  getTimestamp() {
    return new Date().toISOString().replace(/:/g, '-').replace(/\..+/, '');
  }

  /**
   * @returns {string} Folder name of the form `<timestamp>_data`.
   */
  getTimestampedDirectoryName() {
    return `${this.getTimestamp()}_data`;
  }

  /**
   * @returns {string} File name of the form `<timestamp>_data.txt`.
   */
  getTimestampedFileName() {
    return `${this.getTimestamp()}_data.txt`;
  }

  /**
   * Turns arbitrary text into a name that is safe to use as a single path
   * segment by replacing path separators, characters reserved on common
   * file systems and control characters with '_'.
   *
   * @param {string} name - Text to convert.
   * @returns {string} The sanitized name, or `unnamed` when nothing is left.
   */
  toSafeFileName(name) {
    // eslint-disable-next-line no-control-regex
    const safe = String(name).replace(/[\\/:*?"<>|\u0000-\u001f]/g, '_').trim();
    return safe === '' ? 'unnamed' : safe;
  }

  /**
   * Writes the whole data set as pretty-printed JSON into a single
   * timestamped file next to this script.
   */
  writeDataToDisk() {
    const jsonFileName = this.getTimestampedFileName();
    const jsonFilePath = path.join(__dirname, jsonFileName);
    const jsonData = JSON.stringify(this.data, null, 2); // Pretty-print JSON

    fs.writeFileSync(jsonFilePath, jsonData);
    console.log(`Data written to ${jsonFileName}`);
  }

  /**
   * Writes one pretty-printed JSON file per collected artist, named after
   * the artist, into the run's timestamped folder next to this script.
   * Artists that share a (sanitized) name overwrite each other's file.
   *
   * @throws {Error} If the folder or one of the files cannot be written.
   */
  writeBiographyDataToDisk() {
    const directoryPath = path.join(__dirname, this.timestampedDirectoryName);

    // Synchronous and recursive: the folder is guaranteed to exist before
    // the first write, and an already existing folder is not an error.
    fs.mkdirSync(directoryPath, { recursive: true });

    for (const item of this.data) {
      const jsonFileName = `${this.toSafeFileName(item.text)}.txt`;
      const jsonFilePath = path.join(directoryPath, jsonFileName);

      fs.writeFileSync(jsonFilePath, JSON.stringify(item, null, 2));
      console.log(`Data written to ${jsonFileName}`);
    }
  }

  /**
   * Dumps the collected data to the console, writes the per-artist files
   * and terminates the process. A write failure is logged and reported
   * through a non-zero exit code. This method never returns.
   */
  done() {
    console.log(this.data);
    try {
      this.writeBiographyDataToDisk();
    } catch (error) {
      console.error(`Error writing data: ${error.message}`);
      process.exitCode = 1;
    }
    process.exit();
  }
}

const scraper = new MusicScraper();
scraper.run();