|
|
|
@@ -10,6 +10,7 @@ import unicodedata |
|
|
|
from contextlib import contextmanager |
|
|
|
from urllib.parse import unquote |
|
|
|
|
|
|
|
import chardet |
|
|
|
import cloudscraper |
|
|
|
from bs4 import BeautifulSoup, CData, Comment, NavigableString |
|
|
|
from regex import regex |
|
|
|
@@ -75,7 +76,18 @@ def get_url(url: str, user_agent: str = None) -> str: |
|
|
|
if response.status_code != 200: |
|
|
|
return "URL returned status code {}.".format(response.status_code) |
|
|
|
|
|
|
|
a = extract_using_readabilipy(response.text) |
|
|
|
# Detect encoding using chardet |
|
|
|
detected_encoding = chardet.detect(response.content) |
|
|
|
encoding = detected_encoding['encoding'] |
|
|
|
if encoding: |
|
|
|
try: |
|
|
|
content = response.content.decode(encoding) |
|
|
|
except (UnicodeDecodeError, TypeError): |
|
|
|
content = response.text |
|
|
|
else: |
|
|
|
content = response.text |
|
|
|
|
|
|
|
a = extract_using_readabilipy(content) |
|
|
|
|
|
|
|
if not a['plain_text'] or not a['plain_text'].strip(): |
|
|
|
return '' |