@@ -10,6 +10,7 @@ import unicodedata
 from contextlib import contextmanager
 from urllib.parse import unquote

+import cloudscraper
 import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
 from newspaper import Article
@@ -46,29 +47,34 @@ def get_url(url: str, user_agent: str = None) -> str:
     supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]

     response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))

-    if response.status_code != 200:
-        return "URL returned status code {}.".format(response.status_code)
+    if response.status_code == 200:
+        # check content-type
+        content_type = response.headers.get('Content-Type')
+        if content_type:
+            main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+        else:
+            content_disposition = response.headers.get('Content-Disposition', '')
+            filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+            if filename_match:
+                filename = unquote(filename_match.group(1))
+                extension = re.search(r'\.(\w+)$', filename)
+                if extension:
+                    main_content_type = mimetypes.guess_type(filename)[0]

-    # check content-type
-    content_type = response.headers.get('Content-Type')
-    if content_type:
-        main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
-    else:
-        content_disposition = response.headers.get('Content-Disposition', '')
-        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
-        if filename_match:
-            filename = unquote(filename_match.group(1))
-            extension = re.search(r'\.(\w+)$', filename)
-            if extension:
-                main_content_type = mimetypes.guess_type(filename)[0]
+        if main_content_type not in supported_content_types:
+            return "Unsupported content-type [{}] of URL.".format(main_content_type)

-    if main_content_type not in supported_content_types:
-        return "Unsupported content-type [{}] of URL.".format(main_content_type)
+        if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
+            return ExtractProcessor.load_from_url(url, return_text=True)

-    if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
-        return ExtractProcessor.load_from_url(url, return_text=True)
+        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+    elif response.status_code == 403:
+        scraper = cloudscraper.create_scraper()
+        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))

+    if response.status_code != 200:
+        return "URL returned status code {}.".format(response.status_code)

-    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
     a = extract_using_readabilipy(response.text)

     if not a['plain_text'] or not a['plain_text'].strip():
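
Note: the hunk above inverts the status handling in get_url. A 200 on the HEAD probe proceeds to content-type sniffing and the full GET; a 403, typically a bot-protection challenge such as Cloudflare's, is retried through a cloudscraper session; any remaining non-200 is reported back as text. Below is a minimal, self-contained sketch of that 403 fallback pattern under stated assumptions: fetch_html is a hypothetical helper name and the timeouts are placeholders, while cloudscraper.create_scraper() and the requests/scraper .get() calls are the ones the diff itself uses.

import cloudscraper
import requests


def fetch_html(url: str, headers: dict = None) -> requests.Response:
    # Hypothetical helper illustrating the fallback; not part of the diff.
    headers = headers or {"User-Agent": "Mozilla/5.0"}
    # First attempt: a plain requests call, mirroring the diff's requests.get().
    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(5, 30))
    if response.status_code == 403:
        # Likely a JS challenge page. cloudscraper returns a requests.Session
        # subclass that solves the challenge before handing back the response.
        scraper = cloudscraper.create_scraper()
        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(5, 30))
    return response


if __name__ == "__main__":
    print(fetch_html("https://example.com").status_code)

The diff applies the same idea one step earlier, keying the fallback off the cheap HEAD request's status so that unsupported or inaccessible URLs are rejected before the expensive GET and extraction run.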