選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

web_utils.py 5.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. #
  2. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import base64
  17. import ipaddress
  18. import json
  19. import re
  20. import socket
  21. from urllib.parse import urlparse
  22. from selenium import webdriver
  23. from selenium.common.exceptions import TimeoutException
  24. from selenium.webdriver.chrome.options import Options
  25. from selenium.webdriver.chrome.service import Service
  26. from selenium.webdriver.common.by import By
  27. from selenium.webdriver.support.expected_conditions import staleness_of
  28. from selenium.webdriver.support.ui import WebDriverWait
  29. from webdriver_manager.chrome import ChromeDriverManager
  30. CONTENT_TYPE_MAP = {
  31. # Office
  32. "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  33. "doc": "application/msword",
  34. "pdf": "application/pdf",
  35. "csv": "text/csv",
  36. "xls": "application/vnd.ms-excel",
  37. "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  38. # Text/code
  39. "txt": "text/plain",
  40. "py": "text/plain",
  41. "js": "text/plain",
  42. "java": "text/plain",
  43. "c": "text/plain",
  44. "cpp": "text/plain",
  45. "h": "text/plain",
  46. "php": "text/plain",
  47. "go": "text/plain",
  48. "ts": "text/plain",
  49. "sh": "text/plain",
  50. "cs": "text/plain",
  51. "kt": "text/plain",
  52. "sql": "text/plain",
  53. # Web
  54. "md": "text/markdown",
  55. "markdown": "text/markdown",
  56. "htm": "text/html",
  57. "html": "text/html",
  58. "json": "application/json",
  59. # Image formats
  60. "png": "image/png",
  61. "jpg": "image/jpeg",
  62. "jpeg": "image/jpeg",
  63. "gif": "image/gif",
  64. "bmp": "image/bmp",
  65. "tiff": "image/tiff",
  66. "tif": "image/tiff",
  67. "webp": "image/webp",
  68. "svg": "image/svg+xml",
  69. "ico": "image/x-icon",
  70. "avif": "image/avif",
  71. "heic": "image/heic",
  72. }
  73. def html2pdf(
  74. source: str,
  75. timeout: int = 2,
  76. install_driver: bool = True,
  77. print_options: dict = {},
  78. ):
  79. result = __get_pdf_from_html(source, timeout, install_driver, print_options)
  80. return result
  81. def __send_devtools(driver, cmd, params={}):
  82. resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
  83. url = driver.command_executor._url + resource
  84. body = json.dumps({"cmd": cmd, "params": params})
  85. response = driver.command_executor._request("POST", url, body)
  86. if not response:
  87. raise Exception(response.get("value"))
  88. return response.get("value")
  89. def __get_pdf_from_html(path: str, timeout: int, install_driver: bool, print_options: dict):
  90. webdriver_options = Options()
  91. webdriver_prefs = {}
  92. webdriver_options.add_argument("--headless")
  93. webdriver_options.add_argument("--disable-gpu")
  94. webdriver_options.add_argument("--no-sandbox")
  95. webdriver_options.add_argument("--disable-dev-shm-usage")
  96. webdriver_options.experimental_options["prefs"] = webdriver_prefs
  97. webdriver_prefs["profile.default_content_settings"] = {"images": 2}
  98. if install_driver:
  99. service = Service(ChromeDriverManager().install())
  100. driver = webdriver.Chrome(service=service, options=webdriver_options)
  101. else:
  102. driver = webdriver.Chrome(options=webdriver_options)
  103. driver.get(path)
  104. try:
  105. WebDriverWait(driver, timeout).until(staleness_of(driver.find_element(by=By.TAG_NAME, value="html")))
  106. except TimeoutException:
  107. calculated_print_options = {
  108. "landscape": False,
  109. "displayHeaderFooter": False,
  110. "printBackground": True,
  111. "preferCSSPageSize": True,
  112. }
  113. calculated_print_options.update(print_options)
  114. result = __send_devtools(driver, "Page.printToPDF", calculated_print_options)
  115. driver.quit()
  116. return base64.b64decode(result["data"])
  117. def is_private_ip(ip: str) -> bool:
  118. try:
  119. ip_obj = ipaddress.ip_address(ip)
  120. return ip_obj.is_private
  121. except ValueError:
  122. return False
  123. def is_valid_url(url: str) -> bool:
  124. if not re.match(r"(https?)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", url):
  125. return False
  126. parsed_url = urlparse(url)
  127. hostname = parsed_url.hostname
  128. if not hostname:
  129. return False
  130. try:
  131. ip = socket.gethostbyname(hostname)
  132. if is_private_ip(ip):
  133. return False
  134. except socket.gaierror:
  135. return False
  136. return True
  137. def safe_json_parse(data: str | dict) -> dict:
  138. if isinstance(data, dict):
  139. return data
  140. try:
  141. return json.loads(data) if data else {}
  142. except (json.JSONDecodeError, TypeError):
  143. return {}
  144. def get_float(req: dict, key: str, default: float | int = 10.0) -> float:
  145. try:
  146. parsed = float(req.get(key, default))
  147. return parsed if parsed > 0 else default
  148. except (TypeError, ValueError):
  149. return default