text-analysis / scrape.py
Daryl Fung
initial commit
9b9ea2f
raw
history blame
No virus
2.85 kB
# Import necessary libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import time
"""
This code uses Selenium to scrape data from a webpage.
It initializes a Chrome webdriver and loads the webpage defined by the url variable.
It then clicks a modal pop-up that appears when the webpage is loaded.
The code then enters a loop to scrape data from each page of the webpage.
It locates the data tables and extracts the table rows.
It loops through each row of the table and extracts the name, region, and description from the row.
It appends these values to the dai_values list.
The code then finds the 'Next' button and checks if it is disabled.
If the button is disabled, the loop is broken.
If the button is not disabled, the code scrolls it into view, waits for 2 seconds, and then clicks it to move on to the next page.
Finally, the code quits the webdriver.
"""
# Scrape every page of the inventory table at `url` into `dai_values`,
# a list of {'name', 'region', 'description'} dicts (one per table row).

# Define the URL of the webpage to be scraped
url = 'https://www.hdrn.ca/en/inventory/'
# Create an empty list to store the scraped data
dai_values = []
# Initialize a Chrome webdriver
driver = webdriver.Chrome()
# Everything after the browser starts runs inside try/finally so the Chrome
# process is shut down even when a lookup or click raises.
try:
    driver.get(url)
    # Define a wait time for the driver to locate web elements
    wait = WebDriverWait(driver, 2)
    # Click the modal pop-up that appears when the webpage is loaded
    driver.find_element(By.ID, 'myModal').click()
    # Loop through the webpage to scrape data from each page
    while True:
        # Locate the data tables and extract the table rows; when several
        # scroll bodies match, the last one is the one that is read.
        data_tables_scroll = driver.find_elements(By.CLASS_NAME, 'dataTables_scrollBody')[-1]
        table = data_tables_scroll.find_elements(By.TAG_NAME, 'tr')
        # Loop through each row of the table
        for row in table:
            # Extract the values from each cell of the row
            row_values = row.find_elements(By.TAG_NAME, 'td')
            # Three cells are unpacked below, so rows with fewer than three
            # <td> cells (e.g. header or placeholder rows) are skipped.
            if len(row_values) < 3:
                continue
            # Only the first three cells are used; slicing keeps the
            # unpacking from raising if a row carries extra cells.
            name, region, description = row_values[:3]
            dai_values.append({
                'name': name.text,
                'region': region.text,
                'description': description.text
            })
        # Find the 'Next' button and check if it is disabled
        next_buttons = driver.find_elements(By.ID, 'thelist_next')
        if not next_buttons:
            # No pagination control on the page: nothing more to visit.
            break
        next_button = next_buttons[0]
        # get_attribute may return None when the attribute is absent.
        if 'disabled' in (next_button.get_attribute('class') or ''):
            # If the button is disabled, this was the last page
            break
        # Otherwise scroll the button into view, wait 2 seconds, then click
        # it to load the next page. (click() exists on the element, not on
        # the driver itself.)
        driver.execute_script("arguments[0].scrollIntoView();", next_button)
        time.sleep(2)
        next_button.click()
finally:
    # Quit the webdriver
    driver.quit()