소스 검색

feat: Add hyperlink parsing to the DOCX document. (#7017)

tags/0.7.0
chenxu9741 1 년 전
부모
커밋
72c75b75cf
No account linked to committer's email address
1개의 변경된 파일, 28개의 추가 그리고 0개의 삭제
  1. 28
    0
      api/core/rag/extractor/word_extractor.py

+ 28
- 0
api/core/rag/extractor/word_extractor.py 파일 보기

"""Abstract interface for document loader implementations.""" """Abstract interface for document loader implementations."""
import datetime import datetime
import logging
import mimetypes import mimetypes
import os import os
import re
import tempfile import tempfile
import uuid import uuid
import xml.etree.ElementTree as ET
from urllib.parse import urlparse from urllib.parse import urlparse


import requests import requests
from extensions.ext_storage import storage from extensions.ext_storage import storage
from models.model import UploadFile from models.model import UploadFile


logger = logging.getLogger(__name__)


class WordExtractor(BaseExtractor): class WordExtractor(BaseExtractor):
"""Load docx files. """Load docx files.


image_map = self._extract_images_from_docx(doc, image_folder) image_map = self._extract_images_from_docx(doc, image_folder)


# Rewrite field-code hyperlinks as Markdown links, in place on the document.
#
# Runs are scanned in order. When a run's XML contains a HYPERLINK field
# instruction, the URL found in its ``instrText`` element is remembered;
# the next run that carries text is rewritten to `` [text](url) `` and the
# remembered URL is cleared.
hyperlinks_url = None
# Match http and https URLs alike. The match stops at whitespace, '+', or a
# double quote -- presumably the instruction text has the form
# ``HYPERLINK "url"``, so the closing quote must not become part of the URL.
url_pattern = re.compile(r'https?://[^\s+"]+')
for para in doc.paragraphs:
    for run in para.runs:
        # Apply a pending URL before inspecting this run's own XML, so the
        # run that carries the field instruction is never the one rewritten
        # with the URL it has just supplied.
        if run.text and hyperlinks_url:
            run.text = f' [{run.text}]({hyperlinks_url}) '
            hyperlinks_url = None
        if 'HYPERLINK' not in run.element.xml:
            continue
        try:
            for node in ET.XML(run.element.xml).iter():
                # Skip non-instruction elements and instruction elements
                # that have no text (``text`` is None for empty elements).
                if not node.tag.endswith('instrText') or not node.text:
                    continue
                matches = url_pattern.findall(node.text)
                if matches:
                    # When several URLs are present, the last one wins.
                    hyperlinks_url = matches[-1]
        except Exception:
            # Best effort: a run whose XML cannot be parsed must not abort
            # extraction of the rest of the document. Log with traceback.
            logger.exception('Failed to parse hyperlink field in DOCX run')




def parse_paragraph(paragraph): def parse_paragraph(paragraph):
paragraph_content = [] paragraph_content = []
for run in paragraph.runs: for run in paragraph.runs:

Loading…
취소
저장