@@ -11,11 +11,10 @@ from contextlib import contextmanager
 from urllib.parse import unquote
 
 import cloudscraper
-import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
-from newspaper import Article
 from regex import regex
 
+from core.helper import ssrf_proxy
 from core.rag.extractor import extract_processor
 from core.rag.extractor.extract_processor import ExtractProcessor
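This hunk swaps the direct `requests` dependency for `core.helper.ssrf_proxy` and drops the `newspaper` import whose only consumer is deleted further down. The point of the helper is to give outbound HTTP a single choke point that can force traffic through an egress proxy, so a user-supplied URL cannot reach internal addresses. Its internals are not part of this diff; the sketch below is only a plausible shape, assuming an httpx backend (hinted at by the `follow_redirects` keyword used at the call sites) and a hypothetical `SSRF_PROXY_HTTP_URL` environment variable:

```python
# Hypothetical sketch -- the real core.helper.ssrf_proxy is not shown in
# this diff. Every outbound request funnels through make_request(), which
# can force an egress proxy so attacker-controlled URLs cannot reach
# internal hosts (the SSRF being guarded against).
import os

import httpx

SSRF_PROXY_HTTP_URL = os.environ.get("SSRF_PROXY_HTTP_URL")  # assumed config


def make_request(method: str, url: str, **kwargs) -> httpx.Response:
    # httpx spells redirect handling `follow_redirects`, which is why the
    # call sites in this diff move away from requests' `allow_redirects`.
    with httpx.Client(proxy=SSRF_PROXY_HTTP_URL) as client:
        return client.request(method, url, **kwargs)


def head(url: str, **kwargs) -> httpx.Response:
    return make_request("HEAD", url, **kwargs)


def get(url: str, **kwargs) -> httpx.Response:
    return make_request("GET", url, **kwargs)
```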
@@ -45,7 +44,7 @@ def get_url(url: str, user_agent: str = None) -> str:
 
     main_content_type = None
     supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
-    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
+    response = ssrf_proxy.head(url, headers=headers, follow_redirects=True, timeout=(5, 10))
 
     if response.status_code == 200:
         # check content-type
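A one-line substitution: the cheap HEAD probe still runs first so the tool can branch on Content-Type before committing to a full download. The `# check content-type` logic is elided context here, but its essential move is reducing a header like `text/html; charset=utf-8` to its main type before comparing against `supported_content_types`; a self-contained sketch of that reduction:

```python
# Sketch of the elided content-type check: strip parameters such as
# "; charset=utf-8" so only the main type is compared.
def main_type(content_type_header: str) -> str:
    return content_type_header.split(";")[0].strip()


assert main_type("text/html; charset=utf-8") == "text/html"
assert main_type("application/pdf") == "application/pdf"
```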
@@ -67,10 +66,11 @@ def get_url(url: str, user_agent: str = None) -> str:
         if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
             return ExtractProcessor.load_from_url(url, return_text=True)
 
-        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+        response = ssrf_proxy.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
     elif response.status_code == 403:
         scraper = cloudscraper.create_scraper()
-        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+        scraper.perform_request = ssrf_proxy.make_request
+        response = scraper.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
 
     if response.status_code != 200:
         return "URL returned status code {}.".format(response.status_code)
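The notable line is `scraper.perform_request = ssrf_proxy.make_request`. cloudscraper routes its requests through a `perform_request` method, so assigning a plain function to that attribute on the instance shadows the bound method: the call site stops passing `self`, and the 403/Cloudflare retry now goes through the same SSRF-safe transport as everything else. A minimal, self-contained demonstration of the shadowing pattern (illustrative names, not cloudscraper's internals):

```python
# Instance-attribute monkey-patching: an attribute set on the instance
# shadows the method defined on the class, and because the replacement is
# a plain function (not a bound method), no `self` is passed when called.
class Transport:
    def perform_request(self, method: str, url: str, **kwargs) -> str:
        return f"direct {method} {url}"


def proxied_request(method: str, url: str, **kwargs) -> str:
    return f"proxied {method} {url}"


t = Transport()
print(t.perform_request("GET", "http://example.com"))  # direct GET ...

t.perform_request = proxied_request  # shadow the class-level method
print(t.perform_request("GET", "http://example.com"))  # proxied GET ...
```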
@@ -78,7 +78,7 @@ def get_url(url: str, user_agent: str = None) -> str:
     a = extract_using_readabilipy(response.text)
 
     if not a['plain_text'] or not a['plain_text'].strip():
-        return get_url_from_newspaper3k(url)
+        return ''
 
     res = FULL_TEMPLATE.format(
         title=a['title'],
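Behavioral change, not just plumbing: when readabilipy yields no plain text, the tool now returns an empty string instead of retrying with newspaper3k. Callers should read `''` as "no extractable article text" rather than a transport failure (those still surface as the "URL returned status code ..." message). A hypothetical call site:

```python
# Hypothetical caller, not part of this diff: distinguish "no article
# text" (empty string) from an HTTP-level failure message.
content = get_url("https://example.com/some-article")
if not content:
    print("readability found no article text")
elif content.startswith("URL returned status code"):
    print(f"fetch failed: {content}")
else:
    print(content)
```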
@@ -91,23 +91,6 @@ def get_url(url: str, user_agent: str = None) -> str:
     return res
 
 
-def get_url_from_newspaper3k(url: str) -> str:
-
-    a = Article(url)
-    a.download()
-    a.parse()
-
-    res = FULL_TEMPLATE.format(
-        title=a.title,
-        authors=a.authors,
-        publish_date=a.publish_date,
-        top_image=a.top_image,
-        text=a.text,
-    )
-
-    return res
-
-
 def extract_using_readabilipy(html):
     with tempfile.NamedTemporaryFile(delete=False, mode='w+') as f_html:
         f_html.write(html)
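One remark on the trailing context: `NamedTemporaryFile(delete=False, mode='w+')` keeps the file on disk after the `with` block closes it, which is what lets readabilipy's Node.js-based Readability step open the HTML by path; the trade-off is that cleanup becomes manual. A small sketch of that pattern, under the assumption the extractor runs as an external process:

```python
import os
import tempfile

# delete=False: the file survives the context manager so another process
# can open it by name; it must be unlinked manually once finished.
with tempfile.NamedTemporaryFile(delete=False, mode='w+', suffix='.html') as f_html:
    f_html.write("<html><body>example</body></html>")
    html_path = f_html.name

try:
    pass  # hand html_path to the external extractor here
finally:
    os.unlink(html_path)
```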