| """Abstract interface for document loader implementations.""" | """Abstract interface for document loader implementations.""" | ||||
| import datetime | import datetime | ||||
| import logging | |||||
| import mimetypes | import mimetypes | ||||
| import os | import os | ||||
| import re | |||||
| import tempfile | import tempfile | ||||
| import uuid | import uuid | ||||
| import xml.etree.ElementTree as ET | |||||
| from urllib.parse import urlparse | from urllib.parse import urlparse | ||||
| import requests | import requests | ||||
| from extensions.ext_storage import storage | from extensions.ext_storage import storage | ||||
| from models.model import UploadFile | from models.model import UploadFile | ||||
| logger = logging.getLogger(__name__) | |||||


class WordExtractor(BaseExtractor):
    """Load docx files.

        image_map = self._extract_images_from_docx(doc, image_folder)
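
        # Rewrite HYPERLINK field codes as markdown-style links. In a docx field,
        # the run carrying w:instrText (which embeds the target URL) appears before
        # the run carrying the visible link text, so a URL found in one run is
        # remembered and applied to the next run that has text, e.g.
        # "Example site" becomes " [Example site](https://example.com) ".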
        hyperlinks_url = None
        # Match both http and https URLs inside the field instruction text.
        url_pattern = re.compile(r'https?://[^\s]+')
        for para in doc.paragraphs:
            for run in para.runs:
                if run.text and hyperlinks_url:
                    result = f' [{run.text}]({hyperlinks_url}) '
                    run.text = result
                    hyperlinks_url = None
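                # Field instructions are exposed through the run's raw XML; the
                # w:instrText element holds text such as 'HYPERLINK "https://example.com"',
                # from which the URL is extracted.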
                if 'HYPERLINK' in run.element.xml:
                    try:
                        xml = ET.XML(run.element.xml)
                        x_child = [c for c in xml.iter() if c is not None]
                        for x in x_child:
                            if x.tag.endswith('instrText') and x.text:
                                for i in url_pattern.findall(x.text):
                                    hyperlinks_url = str(i)
                    except Exception as e:
                        logger.error(e)

        def parse_paragraph(paragraph):
            paragraph_content = []
            for run in paragraph.runs: