import base64
import json
import re
from typing import Optional

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.expected_conditions import staleness_of
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


def html2pdf(
    source: str,
    timeout: int = 2,
    install_driver: bool = True,
    print_options: Optional[dict] = None,
) -> bytes:
    """Render *source* in headless Chrome and return the page as PDF bytes.

    Args:
        source: Location handed to ``driver.get`` (a URL the browser can open).
        timeout: Seconds to wait after loading the page before printing.
        install_driver: When true, obtain chromedriver through
            ``ChromeDriverManager().install()``; otherwise let Selenium
            locate a driver on its own.
        print_options: Extra ``Page.printToPDF`` parameters; they override
            the defaults set in ``__get_pdf_from_html``.

    Returns:
        The decoded PDF document.
    """
    return __get_pdf_from_html(source, timeout, install_driver, print_options)


def __send_devtools(driver, cmd, params=None):
    """Send a DevTools command through chromedriver and return its ``value``.

    NOTE: this relies on private attributes of ``driver.command_executor``
    (``_url`` and ``_request``).

    Args:
        driver: An active Chrome webdriver session.
        cmd: DevTools command name, e.g. ``"Page.printToPDF"``.
        params: Command parameters; an empty dict is sent when omitted.

    Raises:
        RuntimeError: If the command executor returns an empty response.
    """
    resource = f"/session/{driver.session_id}/chromium/send_command_and_get_result"
    url = driver.command_executor._url + resource
    body = json.dumps({"cmd": cmd, "params": {} if params is None else params})
    response = driver.command_executor._request("POST", url, body)
    if not response:
        # The original called response.get() here, which itself fails on a
        # None response; raise a clear error instead.
        raise RuntimeError(f"DevTools command {cmd!r} returned no response")
    return response.get("value")


def __get_pdf_from_html(
    path: str,
    timeout: int,
    install_driver: bool,
    print_options: Optional[dict],
) -> bytes:
    """Open *path* in headless Chrome, print it to PDF and return the bytes.

    The browser is always shut down, including when loading or printing
    raises.

    Args:
        path: Location handed to ``driver.get``.
        timeout: Seconds to wait for the initial ``<html>`` element to go
            stale before printing anyway.
        install_driver: Whether to fetch chromedriver via webdriver_manager.
        print_options: Overrides for the default ``Page.printToPDF``
            parameters; ``None`` or an empty dict keeps the defaults.

    Returns:
        The decoded PDF document.
    """
    webdriver_options = Options()
    for argument in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        webdriver_options.add_argument(argument)
    webdriver_options.experimental_options["prefs"] = {
        "profile.default_content_settings": {"images": 2},
    }

    if install_driver:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=webdriver_options)
    else:
        driver = webdriver.Chrome(options=webdriver_options)

    try:
        driver.get(path)
        try:
            WebDriverWait(driver, timeout).until(
                staleness_of(driver.find_element(by=By.TAG_NAME, value="html"))
            )
        except TimeoutException:
            # Expected path: the document stayed in place for the whole
            # timeout, so the wait simply acted as a settle delay.
            pass

        # Print whether or not the wait timed out; previously a page that
        # reloaded within the timeout made this function return None.
        calculated_print_options = {
            "landscape": False,
            "displayHeaderFooter": False,
            "printBackground": True,
            "preferCSSPageSize": True,
        }
        if print_options:
            calculated_print_options.update(print_options)

        result = __send_devtools(
            driver, "Page.printToPDF", calculated_print_options
        )
    finally:
        # Never leave a Chrome process behind, even on failure.
        driver.quit()

    return base64.b64decode(result["data"])


def is_valid_url(url: str) -> bool:
    """Return True when *url* starts with an http(s)/ftp/file URL.

    ``re.match`` only anchors at the start of the string, so trailing
    characters after a matching prefix do not cause rejection.
    """
    return bool(
        re.match(
            r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]",
            url,
        )
    )