浏览代码

fix: tool webscraper - too many redirects in case target url does not… (#3831)

Co-authored-by: miendinh <miendinh@users.noreply.github.com>
tags/0.6.6
miendinh 1年前
父节点
当前提交
2a8881d0e8
没有帐户链接到提交者的电子邮件
共有 1 个文件被更改，包括 4 次插入和 5 次删除
  1. 4
    5
      api/core/tools/utils/web_reader_tool.py

+ 4
- 5
api/core/tools/utils/web_reader_tool.py 查看文件

@@ -42,20 +42,19 @@ def get_url(url: str, user_agent: str = None) -> str:
supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]

- head_response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
+ response = requests.get(url, headers=headers, allow_redirects=True, timeout=(5, 10))

- if head_response.status_code != 200:
- return "URL returned status code {}.".format(head_response.status_code)
+ if response.status_code != 200:
+ return "URL returned status code {}.".format(response.status_code)

  # check content-type
- main_content_type = head_response.headers.get('Content-Type').split(';')[0].strip()
+ main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
  if main_content_type not in supported_content_types:
  return "Unsupported content-type [{}] of URL.".format(main_content_type)

  if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
  return ExtractProcessor.load_from_url(url, return_text=True)

- response = requests.get(url, headers=headers, allow_redirects=True, timeout=(5, 30))
a = extract_using_readabilipy(response.text)

if not a['plain_text'] or not a['plain_text'].strip():

正在加载...
取消
保存