
security/SSRF vulns (#6682)

tags/0.6.16
Yeuoly 1 year ago
parent
commit
79cb23e8ac

api/core/helper/ssrf_proxy.py (+5, -2)

@@ -17,12 +17,15 @@ proxies = {
     'https://': SSRF_PROXY_HTTPS_URL
 } if SSRF_PROXY_HTTP_URL and SSRF_PROXY_HTTPS_URL else None
 
 
 BACKOFF_FACTOR = 0.5
 STATUS_FORCELIST = [429, 500, 502, 503, 504]
 
 
 def make_request(method, url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs):
+    if "allow_redirects" in kwargs:
+        allow_redirects = kwargs.pop("allow_redirects")
+        if "follow_redirects" not in kwargs:
+            kwargs["follow_redirects"] = allow_redirects
+
     retries = 0
     while retries <= max_retries:
         try:
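The block added to make_request is a compatibility shim: call sites migrated off requests still pass allow_redirects, while the helper dispatches through httpx, which only understands follow_redirects. A minimal sketch of the idea, assuming an httpx-backed helper; the proxy routing and error handling below are simplified illustrations, not the exact code in this file:

import httpx

def make_request(method, url, max_retries=3, **kwargs):
    # Translate the requests-style kwarg into its httpx equivalent so that
    # callers written against either API keep working.
    if "allow_redirects" in kwargs:
        allow_redirects = kwargs.pop("allow_redirects")
        if "follow_redirects" not in kwargs:
            kwargs["follow_redirects"] = allow_redirects

    retries = 0
    while retries <= max_retries:
        try:
            # The real helper would route this through the configured SSRF
            # proxy (SSRF_PROXY_HTTP_URL / SSRF_PROXY_HTTPS_URL); a plain
            # httpx.request keeps the sketch self-contained.
            return httpx.request(method, url, **kwargs)
        except httpx.RequestError:
            retries += 1
    raise Exception(f"Reached maximum retries ({max_retries}) for URL {url}")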

api/core/rag/extractor/extract_processor.py (+2, -3)

@@ -4,9 +4,8 @@ from pathlib import Path
 from typing import Union
 from urllib.parse import unquote
 
-import requests
-
 from configs import dify_config
+from core.helper import ssrf_proxy
 from core.rag.extractor.csv_extractor import CSVExtractor
 from core.rag.extractor.entity.datasource_type import DatasourceType
 from core.rag.extractor.entity.extract_setting import ExtractSetting
@@ -51,7 +50,7 @@ class ExtractProcessor:

     @classmethod
     def load_from_url(cls, url: str, return_text: bool = False) -> Union[list[Document], str]:
-        response = requests.get(url, headers={
+        response = ssrf_proxy.get(url, headers={
             "User-Agent": USER_AGENT
         })
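The URL fetched by load_from_url is user-controlled, so issuing it with requests.get straight from the API process would let a crafted URL reach internal services such as cloud metadata endpoints or localhost-only admin ports; routing it through ssrf_proxy.get closes that path. Purely to illustrate the threat being mitigated (this is not the helper's implementation, which forwards through the configured proxy), a direct fetcher would need a resolution check along these lines:

import ipaddress
import socket
from urllib.parse import urlparse

def resolves_to_internal_address(url: str) -> bool:
    # Illustrative guard only: resolve the host and reject loopback,
    # link-local, reserved and RFC 1918 ranges before fetching.
    host = urlparse(url).hostname
    if host is None:
        return True
    try:
        addresses = {info[4][0] for info in socket.getaddrinfo(host, None)}
    except socket.gaierror:
        return True
    return any(
        ipaddress.ip_address(addr).is_private
        or ipaddress.ip_address(addr).is_loopback
        or ipaddress.ip_address(addr).is_link_local
        or ipaddress.ip_address(addr).is_reserved
        for addr in addresses
    )

# The AWS metadata endpoint resolves to a link-local address and is rejected.
print(resolves_to_internal_address("http://169.254.169.254/latest/meta-data/"))  # True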


api/core/tools/utils/web_reader_tool.py (+6, -23)

@@ -11,11 +11,10 @@ from contextlib import contextmanager
 from urllib.parse import unquote
 
 import cloudscraper
-import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
-from newspaper import Article
 from regex import regex
 
+from core.helper import ssrf_proxy
 from core.rag.extractor import extract_processor
 from core.rag.extractor.extract_processor import ExtractProcessor

@@ -45,7 +44,7 @@ def get_url(url: str, user_agent: str = None) -> str:

     main_content_type = None
     supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
-    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
+    response = ssrf_proxy.head(url, headers=headers, follow_redirects=True, timeout=(5, 10))
 
     if response.status_code == 200:
         # check content-type
@@ -67,10 +66,11 @@ def get_url(url: str, user_agent: str = None) -> str:
         if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
             return ExtractProcessor.load_from_url(url, return_text=True)
 
-        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+        response = ssrf_proxy.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
     elif response.status_code == 403:
         scraper = cloudscraper.create_scraper()
-        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+        scraper.perform_request = ssrf_proxy.make_request
+        response = scraper.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
 
     if response.status_code != 200:
         return "URL returned status code {}.".format(response.status_code)
@@ -78,7 +78,7 @@ def get_url(url: str, user_agent: str = None) -> str:
     a = extract_using_readabilipy(response.text)
 
     if not a['plain_text'] or not a['plain_text'].strip():
-        return get_url_from_newspaper3k(url)
+        return ''
 
     res = FULL_TEMPLATE.format(
         title=a['title'],
@@ -91,23 +91,6 @@ def get_url(url: str, user_agent: str = None) -> str:
     return res
 
 
-def get_url_from_newspaper3k(url: str) -> str:
-
-    a = Article(url)
-    a.download()
-    a.parse()
-
-    res = FULL_TEMPLATE.format(
-        title=a.title,
-        authors=a.authors,
-        publish_date=a.publish_date,
-        top_image=a.top_image,
-        text=a.text,
-    )
-
-    return res
-
-
 def extract_using_readabilipy(html):
     with tempfile.NamedTemporaryFile(delete=False, mode='w+') as f_html:
         f_html.write(html)
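Besides moving the plain requests calls onto the helper, the 403 fallback keeps cloudscraper but points its perform_request hook at ssrf_proxy.make_request, so even the Cloudflare-challenge path is issued through the SSRF-safe helper rather than straight from the process. The newspaper3k fallback is removed entirely, presumably because Article.download() performs its own direct HTTP fetch that would bypass the helper. A condensed sketch of the resulting fetch flow (the URL and User-Agent values are placeholders):

import cloudscraper
from core.helper import ssrf_proxy

headers = {"User-Agent": "Mozilla/5.0"}   # placeholder UA
url = "https://example.com/some-article"  # placeholder URL

# Probe the URL through the SSRF-safe helper first.
response = ssrf_proxy.head(url, headers=headers, follow_redirects=True, timeout=(5, 10))

if response.status_code == 200:
    response = ssrf_proxy.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
elif response.status_code == 403:
    scraper = cloudscraper.create_scraper()
    # Route cloudscraper's underlying request through the same helper,
    # so the challenge-solving path cannot reach internal hosts either.
    scraper.perform_request = ssrf_proxy.make_request
    response = scraper.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))

Note that requests.Session.get (which cloudscraper subclasses) injects allow_redirects=True by default, and that is exactly the kwarg the new translation in ssrf_proxy.make_request absorbs.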
