Bläddra i källkod

clean rag word_extractor. (#19397)

Signed-off-by: zhanluxianshen <zhanluxianshen@163.com>
tags/1.4.0
湛露先生 5 månader sedan
förälder
incheckning
1119790b02
Inget konto är kopplat till bidragsgivarens mejladress
1 ändrade filer med 3 tillägg och 4 borttagningar
  1. api/core/rag/extractor/word_extractor.py (3 tillägg, 4 borttagningar)

+ 3
- 4
api/core/rag/extractor/word_extractor.py Visa fil

@@ -76,8 +76,7 @@ class WordExtractor(BaseExtractor):
parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme)

def _extract_images_from_docx(self, doc, image_folder):
os.makedirs(image_folder, exist_ok=True)
def _extract_images_from_docx(self, doc):
image_count = 0
image_map = {}

@@ -210,7 +209,7 @@ class WordExtractor(BaseExtractor):

content = []

image_map = self._extract_images_from_docx(doc, image_folder)
image_map = self._extract_images_from_docx(doc)

hyperlinks_url = None
url_pattern = re.compile(r"http://[^\s+]+//|https://[^\s+]+")
@@ -225,7 +224,7 @@ class WordExtractor(BaseExtractor):
xml = ElementTree.XML(run.element.xml)
x_child = [c for c in xml.iter() if c is not None]
for x in x_child:
if x_child is None:
if x is None:
continue
if x.tag.endswith("instrText"):
if x.text is None:

Laddar…
Avbryt
Spara