Spaces:

darylfunggg
/

text-analysis

Running

App Files Files Community

text-analysis / DAI scraper /scrap_assessment.py

Daryl Fung

added top 10

2a000a7 over 1 year ago

raw

history blame contribute delete

No virus

3.91 kB

	import csv
	import os

	from selenium import webdriver
	from selenium.common.exceptions import NoSuchElementException
	from selenium.common.exceptions import TimeoutException
	from selenium.webdriver.common.keys import Keys
	from selenium.webdriver.common.by import By
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.webdriver.support.ui import Select

	# Start a Chrome session through Selenium; a compatible Chrome webdriver
	# has to be installed for this call to succeed.
	driver = webdriver.Chrome()

	# Inventory "label" pages on hdrn.ca, one URL per jurisdiction short name.
	mchp = "https://www.hdrn.ca/en/inventory/label/42/4826"
	bc = "https://www.hdrn.ca/en/inventory/label/46/4672/"
	ab = "https://www.hdrn.ca/en/inventory/label/44/4684/"
	sk = "https://www.hdrn.ca/en/inventory/label/51/4378/"
	ices = "https://www.hdrn.ca/en/inventory/label/43/4436/"
	nb = "https://www.hdrn.ca/en/inventory/label/47/4611/"
	hdns = "https://www.hdrn.ca/en/inventory/label/49/4411/"
	nlchi = "https://www.hdrn.ca/en/inventory/label/50/4350/"
	cihi = "https://www.hdrn.ca/en/inventory/label/45/4744/"

	# Short name -> start URL. The scraping loop further down slices this
	# mapping by position, so the insertion order here is significant.
	jurisdictions = dict(
	    mchp=mchp,
	    bc=bc,
	    ab=ab,
	    sk=sk,
	    ices=ices,
	    nb=nb,
	    hdns=hdns,
	    nlchi=nlchi,
	    cihi=cihi,
	)

	# Column order shared by every CSV file this script writes.
	CSV_FIELDS = ['dataset', 'rationale', 'discussion']

	# Seconds to wait for the assessment table before treating the page as a login page.
	TABLE_WAIT_SECONDS = 2

	# Upper bound on login retries per jurisdiction, so rejected credentials
	# cannot keep the script looping forever.
	MAX_LOGIN_ATTEMPTS = 3


	def _log_in(browser, username, password):
	    """Fill in and submit the login form on the page currently displayed.

	    Raises NoSuchElementException if the page has no ``username`` or
	    ``password`` input.
	    """
	    username_input = browser.find_element(By.NAME, 'username')
	    password_input = browser.find_element(By.NAME, 'password')
	    username_input.send_keys(username)
	    password_input.send_keys(password)
	    # Pressing Enter inside the password field submits the form.
	    password_input.send_keys(Keys.RETURN)


	def _read_assessment(browser):
	    """Return a dict with the dataset name, rationale and discussion of the current page."""
	    dataset = Select(browser.find_element(By.ID, "selected_dataset")).first_selected_option.text

	    # Only the first row of the first element with class "table" is inspected.
	    table = browser.find_element(By.CLASS_NAME, "table")
	    tbody = table.find_element(By.TAG_NAME, "tbody")
	    first_tr = tbody.find_element(By.TAG_NAME, "tr")
	    labels = first_tr.find_elements(By.TAG_NAME, "label")

	    # 6 labels: rationale only. 8 labels: rationale plus discussion.
	    # Any other label count leaves both fields empty.
	    rationale = labels[3].text if len(labels) in (6, 8) else ""
	    discussion = labels[5].text if len(labels) == 8 else ""
	    return {'dataset': dataset, 'rationale': rationale, 'discussion': discussion}


	def _write_assessments(csv_path, rows):
	    """Write ``rows`` to ``csv_path`` with a header line.

	    An empty ``rows`` list produces a header-only file instead of failing.
	    """
	    # Explicit UTF-8 so non-ASCII text in the scraped fields does not depend
	    # on the platform's default encoding.
	    with open(csv_path, mode='w', newline='', encoding='utf-8') as file:
	        writer = csv.DictWriter(file, fieldnames=CSV_FIELDS)
	        writer.writeheader()
	        writer.writerows(rows)


	dataset_assessments = []

	try:
	    # Credentials are taken from the environment so that no secret is kept
	    # in the source file.
	    hdrn_username = os.environ.get('HDRN_USERNAME')
	    hdrn_password = os.environ.get('HDRN_PASSWORD')
	    if not hdrn_username or not hdrn_password:
	        raise SystemExit('Set HDRN_USERNAME and HDRN_PASSWORD before running this scraper.')

	    # The [2:] slice skips the first two jurisdictions of the mapping.
	    # NOTE(review): presumably those were handled in an earlier run - confirm
	    # before changing the slice.
	    for jurisdiction_name, jurisdiction in list(jurisdictions.items())[2:]:
	        # NOTE(review): the file name below is per jurisdiction, so rows are
	        # collected and written per jurisdiction too. Confirm that a single
	        # combined file was not the intent.
	        dataset_assessments = []
	        login_attempts = 0

	        driver.get(jurisdiction)
	        while True:
	            try:
	                WebDriverWait(driver, TABLE_WAIT_SECONDS).until(
	                    EC.presence_of_element_located((By.CLASS_NAME, "table"))
	                )
	                dataset_assessments.append(_read_assessment(driver))
	            except (TimeoutException, NoSuchElementException):
	                # The expected elements are missing, so the page is treated
	                # as the login form; after submitting it the loop retries.
	                login_attempts += 1
	                if login_attempts > MAX_LOGIN_ATTEMPTS:
	                    raise RuntimeError(
	                        f'Assessment table for {jurisdiction_name!r} did not appear '
	                        f'after {MAX_LOGIN_ATTEMPTS} login attempts.'
	                    ) from None
	                _log_in(driver, hdrn_username, hdrn_password)
	                continue

	            # Page through the datasets until no element containing 'Next' is left.
	            next_button = driver.find_elements(By.XPATH, "//*[contains(text(), 'Next')]")
	            if not next_button:
	                break
	            next_button[0].click()

	        _write_assessments(f'{jurisdiction_name}_assessment.csv', dataset_assessments)
	finally:
	    # Always shut the browser down, even when scraping fails part-way.
	    driver.quit()