瀏覽代碼

WebscraperTool: bypass Cloudflare-protected sites using cloudscraper (#6337)

tags/0.6.15
Weaxs 1 年之前
父節點
當前提交
4e2fba404d
沒有連結到貢獻者的電子郵件帳戶。
共有 3 個檔案被更改,包括 57 行新增和 20 行刪除
  1. 25
    19
      api/core/tools/utils/web_reader_tool.py
  2. 31
    1
      api/poetry.lock
  3. 1
    0
      api/pyproject.toml

+ 25
- 19
api/core/tools/utils/web_reader_tool.py 查看文件

from contextlib import contextmanager from contextlib import contextmanager
from urllib.parse import unquote from urllib.parse import unquote


import cloudscraper
import requests import requests
from bs4 import BeautifulSoup, CData, Comment, NavigableString from bs4 import BeautifulSoup, CData, Comment, NavigableString
from newspaper import Article from newspaper import Article
supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"] supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10)) response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))


if response.status_code != 200:
return "URL returned status code {}.".format(response.status_code)
if response.status_code == 200:
# check content-type
content_type = response.headers.get('Content-Type')
if content_type:
main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
else:
content_disposition = response.headers.get('Content-Disposition', '')
filename_match = re.search(r'filename="([^"]+)"', content_disposition)
if filename_match:
filename = unquote(filename_match.group(1))
extension = re.search(r'\.(\w+)$', filename)
if extension:
main_content_type = mimetypes.guess_type(filename)[0]


# check content-type
content_type = response.headers.get('Content-Type')
if content_type:
main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
else:
content_disposition = response.headers.get('Content-Disposition', '')
filename_match = re.search(r'filename="([^"]+)"', content_disposition)
if filename_match:
filename = unquote(filename_match.group(1))
extension = re.search(r'\.(\w+)$', filename)
if extension:
main_content_type = mimetypes.guess_type(filename)[0]
if main_content_type not in supported_content_types:
return "Unsupported content-type [{}] of URL.".format(main_content_type)


if main_content_type not in supported_content_types:
return "Unsupported content-type [{}] of URL.".format(main_content_type)
if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
return ExtractProcessor.load_from_url(url, return_text=True)


if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
return ExtractProcessor.load_from_url(url, return_text=True)
response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
elif response.status_code == 403:
scraper = cloudscraper.create_scraper()
response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))

if response.status_code != 200:
return "URL returned status code {}.".format(response.status_code)


response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
a = extract_using_readabilipy(response.text) a = extract_using_readabilipy(response.text)


if not a['plain_text'] or not a['plain_text'].strip(): if not a['plain_text'] or not a['plain_text'].strip():

+ 31
- 1
api/poetry.lock 查看文件

numpy = ["numpy (>=1.12.0)", "pandas (>=0.24.0)"] numpy = ["numpy (>=1.12.0)", "pandas (>=0.24.0)"]
zstd = ["clickhouse-cityhash (>=1.0.2.1)", "zstd"] zstd = ["clickhouse-cityhash (>=1.0.2.1)", "zstd"]


[[package]]
name = "cloudscraper"
version = "1.2.71"
description = "A Python module to bypass Cloudflare's anti-bot page."
optional = false
python-versions = "*"
files = [
{file = "cloudscraper-1.2.71-py2.py3-none-any.whl", hash = "sha256:76f50ca529ed2279e220837befdec892626f9511708e200d48d5bb76ded679b0"},
{file = "cloudscraper-1.2.71.tar.gz", hash = "sha256:429c6e8aa6916d5bad5c8a5eac50f3ea53c9ac22616f6cb21b18dcc71517d0d3"},
]

[package.dependencies]
pyparsing = ">=2.4.7"
requests = ">=2.9.2"
requests-toolbelt = ">=0.9.1"

[[package]] [[package]]
name = "cohere" name = "cohere"
version = "5.2.6" version = "5.2.6"
[package.extras] [package.extras]
rsa = ["oauthlib[signedtoken] (>=3.0.0)"] rsa = ["oauthlib[signedtoken] (>=3.0.0)"]


[[package]]
name = "requests-toolbelt"
version = "1.0.0"
description = "A utility belt for advanced users of python-requests"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
{file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"},
{file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"},
]

[package.dependencies]
requests = ">=2.0.1,<3.0.0"

[[package]] [[package]]
name = "resend" name = "resend"
version = "0.7.2" version = "0.7.2"
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "8d2a12543340f6f4fa6dcb27f93d8b3f5380e7a3e7eb5e399e76e6b8588b4611"
content-hash = "9b1821b6e5d6d44947cc011c2d635a366557582b4540b99e0ff53a3078a989e5"

+ 1
- 0
api/pyproject.toml 查看文件

vanna = { version = "0.5.5", extras = ["postgres", "mysql", "clickhouse", "duckdb"] } vanna = { version = "0.5.5", extras = ["postgres", "mysql", "clickhouse", "duckdb"] }
wikipedia = "1.4.0" wikipedia = "1.4.0"
yfinance = "~0.2.40" yfinance = "~0.2.40"
cloudscraper = "1.2.71"


############################################################ ############################################################
# VDB dependencies required by vector store clients # VDB dependencies required by vector store clients

Loading…
取消
儲存