# Repository page header (not part of the script):
# text-analysis / DAI scraper / scrap_assessment.py
# Daryl Fung - "added top 10" - 2a000a7 - 3.91 kB
import csv
import getpass
import os

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
# Launch the Chrome browser session that every page load below goes through.
# Selenium needs a ChromeDriver matching the installed Chrome to be available.
driver = webdriver.Chrome()
# Inventory landing page on hdrn.ca for each jurisdiction, keyed by short name.
# Insertion order is significant: the scraping loop selects entries by position.
jurisdictions = {
    'mchp': "https://www.hdrn.ca/en/inventory/label/42/4826",
    'bc': 'https://www.hdrn.ca/en/inventory/label/46/4672/',
    'ab': "https://www.hdrn.ca/en/inventory/label/44/4684/",
    'sk': "https://www.hdrn.ca/en/inventory/label/51/4378/",
    'ices': "https://www.hdrn.ca/en/inventory/label/43/4436/",
    'nb': "https://www.hdrn.ca/en/inventory/label/47/4611/",
    'hdns': "https://www.hdrn.ca/en/inventory/label/49/4411/",
    'nlchi': "https://www.hdrn.ca/en/inventory/label/50/4350/",
    'cihi': "https://www.hdrn.ca/en/inventory/label/45/4744/",
}

# The same URLs exposed under their individual module-level names.
mchp, bc, ab, sk, ices, nb, hdns, nlchi, cihi = jurisdictions.values()
# Column order of every CSV written by this script.
CSV_HEADER = ['dataset', 'rationale', 'discussion']

# NOTE(review): the original loop started at index 2, i.e. it skipped the first
# two entries of `jurisdictions` -- presumably because they had already been
# scraped in an earlier run. Kept as the default; set to 0 to scrape everything.
SKIP_FIRST_JURISDICTIONS = 2

# Seconds to wait for the assessment table before assuming a login is required.
TABLE_WAIT_SECONDS = 2

# Consecutive login attempts allowed before giving up instead of looping forever.
MAX_LOGIN_ATTEMPTS = 3


def get_credentials() -> tuple:
    """Return the ``(username, password)`` pair used to log in.

    Values are read from the ``HDRN_USERNAME`` and ``HDRN_PASSWORD``
    environment variables; whichever is missing or empty is prompted for
    interactively. Credentials are deliberately not stored in the source --
    the password that used to be committed here should be rotated.
    """
    username = os.environ.get("HDRN_USERNAME") or input("HDRN username: ")
    password = os.environ.get("HDRN_PASSWORD") or getpass.getpass("HDRN password: ")
    return username, password


def split_assessment_labels(label_texts) -> tuple:
    """Pick the rationale and discussion out of the first row's label texts.

    The row is expected to hold 6 labels (rationale only) or 8 labels
    (rationale plus discussion). Any other count yields empty strings.

    Args:
        label_texts: texts of the ``<label>`` elements, in document order.

    Returns:
        A ``(rationale, discussion)`` tuple of strings.
    """
    count = len(label_texts)
    rationale = label_texts[3] if count in (6, 8) else ""
    discussion = label_texts[5] if count == 8 else ""
    return rationale, discussion


def write_assessments_csv(csv_file, rows) -> None:
    """Write ``rows`` (dicts keyed by ``CSV_HEADER``) to ``csv_file``.

    The header is fixed rather than taken from the first row, so an empty
    ``rows`` produces a header-only file instead of raising ``IndexError``.
    The file is written as UTF-8 so non-ASCII text does not depend on the
    platform's default encoding.
    """
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=CSV_HEADER)
        writer.writeheader()
        writer.writerows(rows)


def login(driver, username, password) -> None:
    """Fill in and submit the login form shown on the current page.

    Raises whatever Selenium raises (``NoSuchElementException``) when the
    page has no ``username`` / ``password`` inputs.
    """
    username_input = driver.find_element(By.NAME, 'username')
    password_input = driver.find_element(By.NAME, 'password')
    # Clear first so a retry does not append to text left over in the fields.
    username_input.clear()
    username_input.send_keys(username)
    password_input.clear()
    password_input.send_keys(password)
    # Submit the form by pressing Enter in the password field.
    password_input.send_keys(Keys.RETURN)


def scrape_current_page(driver) -> dict:
    """Read the selected dataset and its assessment from the current page.

    Returns:
        A dict with the keys ``dataset``, ``rationale`` and ``discussion``.
    """
    dataset = Select(driver.find_element(By.ID, "selected_dataset")).first_selected_option.text
    # Only the first row of the first "table" element's body is inspected.
    table = driver.find_element(By.CLASS_NAME, "table")
    tbody = table.find_element(By.TAG_NAME, "tbody")
    first_tr = tbody.find_element(By.TAG_NAME, "tr")
    labels = first_tr.find_elements(By.TAG_NAME, "label")
    rationale, discussion = split_assessment_labels([label.text for label in labels])
    return {'dataset': dataset, 'rationale': rationale, 'discussion': discussion}


def scrape_jurisdiction(driver, url) -> list:
    """Walk one jurisdiction's pages via the 'Next' link and collect a row per page.

    If the assessment table does not appear within ``TABLE_WAIT_SECONDS`` the
    page is assumed to be the login form and a login is attempted.

    Raises:
        RuntimeError: the table still has not appeared after
            ``MAX_LOGIN_ATTEMPTS`` consecutive login attempts.
    """
    driver.get(url)
    rows = []
    login_attempts = 0
    while True:
        try:
            WebDriverWait(driver, TABLE_WAIT_SECONDS).until(
                EC.presence_of_element_located((By.CLASS_NAME, "table"))
            )
        except TimeoutException as err:
            # Only a missing table triggers a login; other errors propagate.
            if login_attempts >= MAX_LOGIN_ATTEMPTS:
                raise RuntimeError(
                    f"assessment table did not appear at {url} "
                    f"after {MAX_LOGIN_ATTEMPTS} login attempts"
                ) from err
            login_attempts += 1
            login(driver, *get_credentials())
            continue
        login_attempts = 0

        rows.append(scrape_current_page(driver))

        next_button = driver.find_elements(By.XPATH, "//*[contains(text(), 'Next')]")
        if not next_button:
            # No 'Next' element: this was the last page.
            break
        next_button[0].click()
    return rows


def main(driver, jurisdictions, skip=SKIP_FIRST_JURISDICTIONS) -> None:
    """Scrape each selected jurisdiction and write ``<name>_assessment.csv``.

    Rows are collected per jurisdiction, so a file contains only that
    jurisdiction's datasets. The browser is closed when the run ends, whether
    it succeeded or failed.

    Args:
        driver: the Selenium WebDriver to drive.
        jurisdictions: mapping of jurisdiction name to start URL.
        skip: number of leading entries of ``jurisdictions`` to skip.
    """
    try:
        for jurisdiction_name, jurisdiction in list(jurisdictions.items())[skip:]:
            dataset_assessments = scrape_jurisdiction(driver, jurisdiction)
            write_assessments_csv(f'{jurisdiction_name}_assessment.csv', dataset_assessments)
    finally:
        driver.quit()


if __name__ == "__main__":
    main(driver, jurisdictions)