Du kan inte välja fler än 25 ämnen. Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

web_utils.py 2.6KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. import re
  2. import json
  3. import base64
  4. from selenium import webdriver
  5. from selenium.webdriver.chrome.options import Options
  6. from selenium.webdriver.chrome.service import Service
  7. from selenium.common.exceptions import TimeoutException
  8. from selenium.webdriver.support.ui import WebDriverWait
  9. from selenium.webdriver.support.expected_conditions import staleness_of
  10. from webdriver_manager.chrome import ChromeDriverManager
  11. from selenium.webdriver.common.by import By
  12. def html2pdf(
  13. source: str,
  14. timeout: int = 2,
  15. install_driver: bool = True,
  16. print_options: dict = {},
  17. ):
  18. result = __get_pdf_from_html(source, timeout, install_driver, print_options)
  19. return result
  20. def __send_devtools(driver, cmd, params={}):
  21. resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
  22. url = driver.command_executor._url + resource
  23. body = json.dumps({"cmd": cmd, "params": params})
  24. response = driver.command_executor._request("POST", url, body)
  25. if not response:
  26. raise Exception(response.get("value"))
  27. return response.get("value")
  28. def __get_pdf_from_html(
  29. path: str,
  30. timeout: int,
  31. install_driver: bool,
  32. print_options: dict
  33. ):
  34. webdriver_options = Options()
  35. webdriver_prefs = {}
  36. webdriver_options.add_argument("--headless")
  37. webdriver_options.add_argument("--disable-gpu")
  38. webdriver_options.add_argument("--no-sandbox")
  39. webdriver_options.add_argument("--disable-dev-shm-usage")
  40. webdriver_options.experimental_options["prefs"] = webdriver_prefs
  41. webdriver_prefs["profile.default_content_settings"] = {"images": 2}
  42. if install_driver:
  43. service = Service(ChromeDriverManager().install())
  44. driver = webdriver.Chrome(service=service, options=webdriver_options)
  45. else:
  46. driver = webdriver.Chrome(options=webdriver_options)
  47. driver.get(path)
  48. try:
  49. WebDriverWait(driver, timeout).until(
  50. staleness_of(driver.find_element(by=By.TAG_NAME, value="html"))
  51. )
  52. except TimeoutException:
  53. calculated_print_options = {
  54. "landscape": False,
  55. "displayHeaderFooter": False,
  56. "printBackground": True,
  57. "preferCSSPageSize": True,
  58. }
  59. calculated_print_options.update(print_options)
  60. result = __send_devtools(
  61. driver, "Page.printToPDF", calculated_print_options)
  62. driver.quit()
  63. return base64.b64decode(result["data"])
  64. def is_valid_url(url: str) -> bool:
  65. return bool(re.match(r"(https?)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url))