### What problem does this PR solve?

Support displaying images in chunks of docx files when using the general parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

The diff below touches three modules: the general (naive) DOCX parser, the Q&A parser, and the shared helpers in `rag.nlp`.
```diff
@@ -16,16 +16,28 @@ from docx import Document
 from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
-from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
+from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
+from PIL import Image
+from functools import reduce


 class Docx(DocxParser):
     def __init__(self):
         pass

+    def get_picture(self, document, paragraph):
+        img = paragraph._element.xpath('.//pic:pic')
+        if not img:
+            return None
+        img = img[0]
+        embed = img.xpath('.//a:blip/@r:embed')[0]
+        related_part = document.part.related_parts[embed]
+        image = related_part.image
+        image = Image.open(BytesIO(image.blob)).convert('RGB')
+        return image
+
     def __clean(self, line):
         line = re.sub(r"\u3000", " ", line).strip()
         return line
```
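The xpath calls above work because python-docx exposes each paragraph's raw OOXML element with the standard namespaces (`pic:`, `a:`, `r:`) pre-registered, and only the first `pic:pic` element per paragraph is used. A minimal standalone sketch of the same extraction, assuming python-docx and Pillow are installed and `sample.docx` is a hypothetical test file:

```python
from io import BytesIO

from docx import Document
from PIL import Image

doc = Document("sample.docx")  # hypothetical input
for p in doc.paragraphs:
    pics = p._element.xpath('.//pic:pic')
    if not pics:
        continue
    # r:embed names the relationship that holds the image part
    embed = pics[0].xpath('.//a:blip/@r:embed')[0]
    part = doc.part.related_parts[embed]
    img = Image.open(BytesIO(part.image.blob)).convert('RGB')
    print(repr(p.text[:40]), img.size)
```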
```diff
@@ -35,17 +47,41 @@ class Docx(DocxParser):
             filename) if not binary else Document(BytesIO(binary))
         pn = 0
         lines = []
+        last_image = None
         for p in self.doc.paragraphs:
             if pn > to_page:
                 break
-            if from_page <= pn < to_page and p.text.strip():
-                lines.append(self.__clean(p.text))
+            if from_page <= pn < to_page:
+                current_image = None
+                if p.text.strip():
+                    if p.style.name == 'Caption':
+                        # a caption claims the nearest preceding image
+                        former_image = None
+                        if lines and lines[-1][1] and lines[-1][2] != 'Caption':
+                            former_image = lines[-1][1].pop()
+                        elif last_image:
+                            former_image = last_image
+                            last_image = None
+                        lines.append((self.__clean(p.text), [former_image], p.style.name))
+                    else:
+                        current_image = self.get_picture(self.doc, p)
+                        image_list = [current_image]
+                        if last_image:
+                            image_list.insert(0, last_image)
+                            last_image = None
+                        lines.append((self.__clean(p.text), image_list, p.style.name))
+                else:
+                    # image-only paragraph: attach it to the previous line,
+                    # or hold it until some text shows up
+                    if current_image := self.get_picture(self.doc, p):
+                        if lines:
+                            lines[-1][1].append(current_image)
+                        else:
+                            last_image = current_image
             for run in p.runs:
                 if 'lastRenderedPageBreak' in run._element.xml:
                     pn += 1
                     continue
                 if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                     pn += 1
+        new_line = [(line[0], reduce(concat_img, line[1])) for line in lines]

         tbls = []
         for tb in self.doc.tables:
             html = "<table>"
@@ -64,7 +100,7 @@ class Docx(DocxParser):
                 html += "</tr>"
             html += "</table>"
             tbls.append(((None, html), ""))
-        return [(l, "") for l in lines if l], tbls
+        return new_line, tbls


 class Pdf(PdfParser):
```
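Each entry of `lines` is now a `(text, image_list, style_name)` triple, and the final comprehension folds every image list into a single image, so `__call__` returns `(text, image_or_None)` pairs instead of bare strings. A sketch of that fold with illustrative names (`fig_img` stands for any PIL image):

```python
from functools import reduce

# line[1] may be [None], [img], or [img_a, img_b, ...];
# concat_img treats None as an identity value, so a text-only
# paragraph simply folds to (text, None)
lines = [("Intro", [None], "Normal"),
         ("Figure 1: pipeline", [fig_img], "Caption")]
new_line = [(text, reduce(concat_img, imgs)) for text, imgs, _ in lines]
# -> [("Intro", None), ("Figure 1: pipeline", fig_img)]
```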
```diff
@@ -123,8 +159,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections, tbls = Docx()(filename, binary)
-        res = tokenize_table(tbls, doc, eng)
+        res = tokenize_table(tbls, doc, eng)  # just for table
         callback(0.8, "Finish parsing.")
+        st = timer()
+        chunks, images = naive_merge_docx(
+            sections, int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。;!?"))
+
+        res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
+        cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
+        return res

     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf(
```
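With that, the docx branch is self-contained: sections arrive as `(text, image)` pairs, `naive_merge_docx` merges them into token-bounded chunks with index-aligned images, and `tokenize_chunks_docx` turns each pair into an ES document. A hypothetical invocation; the `parser_config` keys mirror the ones read above, and `content_with_weight` is assumed to be the text field that `tokenize` fills:

```python
def progress(prog=None, msg=""):
    print(prog, msg)

docs = chunk("manual.docx", callback=progress,
             parser_config={"chunk_token_num": 128, "delimiter": "\n!?。;!?"})
for d in docs:
    # table chunks carry no image; docx text chunks may carry a stitched PIL image
    print(d.get("image"), d.get("content_with_weight", "")[:60])
```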
```diff
@@ -17,7 +17,7 @@ from timeit import default_timer as timer
 from nltk import word_tokenize
 from openpyxl import load_workbook
 from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
-from rag.nlp import rag_tokenizer, tokenize_table
+from rag.nlp import rag_tokenizer, tokenize_table, concat_img
 from rag.settings import cron_logger
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 from docx import Document
@@ -174,26 +174,8 @@ class Docx(DocxParser):
         embed = img.xpath('.//a:blip/@r:embed')[0]
         related_part = document.part.related_parts[embed]
         image = related_part.image
-        image = Image.open(BytesIO(image.blob))
+        image = Image.open(BytesIO(image.blob)).convert('RGB')
         return image
-
-    def concat_img(self, img1, img2):
-        if img1 and not img2:
-            return img1
-        if not img1 and img2:
-            return img2
-        if not img1 and not img2:
-            return None
-        width1, height1 = img1.size
-        width2, height2 = img2.size
-        new_width = max(width1, width2)
-        new_height = height1 + height2
-        new_image = Image.new('RGB', (new_width, new_height))
-        new_image.paste(img1, (0, 0))
-        new_image.paste(img2, (0, height1))
-        return new_image

     def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
         self.doc = Document(
@@ -211,7 +193,7 @@ class Docx(DocxParser):
             if not question_level or question_level > 6:  # not a question
                 last_answer = f'{last_answer}\n{p_text}'
                 current_image = self.get_picture(self.doc, p)
-                last_image = self.concat_img(last_image, current_image)
+                last_image = concat_img(last_image, current_image)
             else:  # is a question
                 if last_answer or last_image:
                     sum_question = '\n'.join(question_stack)
```
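The Q&A parser loses its private `concat_img` method in favor of the shared helper from `rag.nlp` (added further down) and otherwise behaves as before: every answer paragraph's picture is folded into `last_image`. Because `concat_img` treats `None` as an identity element, the accumulation is a plain left fold, sketched here with illustrative image names:

```python
last_image = None
for para_image in (None, img_a, None, img_b):  # img_a, img_b: PIL images
    last_image = concat_img(last_image, para_image)
# last_image is now img_a stacked directly above img_b
```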
```diff
@@ -24,6 +24,7 @@ import copy
 import roman_numbers as r
 from word2number import w2n
 from cn2an import cn2an
+from PIL import Image

 all_codecs = [
     'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
@@ -246,6 +247,19 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser):
     return res


+def tokenize_chunks_docx(chunks, doc, eng, images):
+    res = []
+    # wrap up as es documents
+    for ck, image in zip(chunks, images):
+        if not ck.strip():
+            continue
+        d = copy.deepcopy(doc)
+        d["image"] = image
+        tokenize(d, ck, eng)
+        res.append(d)
+    return res
+
+
 def tokenize_table(tbls, doc, eng, batch_size=10):
     res = []
     # add tables
```
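`tokenize_chunks_docx` differs from `tokenize_chunks` only in that it zips each chunk with its image and stores the image on the resulting ES document. The two lists must stay index-aligned, which `naive_merge_docx` below guarantees. A hedged usage sketch; a real `doc` dict carries more metadata than shown:

```python
doc = {"docnm_kwd": "demo.docx"}   # minimal stand-in metadata
chunks = ["First chunk.", "   ", "Second chunk."]
images = [img_a, None, img_b]      # parallel to chunks
res = tokenize_chunks_docx(chunks, doc, True, images)
# the whitespace-only chunk is skipped, so len(res) == 2 and each
# res[i]["image"] holds the PIL image aligned with its chunk
```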
```diff
@@ -504,4 +518,54 @@ def docx_question_level(p):
     if p.style.name.startswith('Heading'):
         return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
     else:
-        return 0, re.sub(r"\u3000", " ", p.text).strip()
+        return 0, re.sub(r"\u3000", " ", p.text).strip()
+
+
+def concat_img(img1, img2):
+    if img1 and not img2:
+        return img1
+    if not img1 and img2:
+        return img2
+    if not img1 and not img2:
+        return None
+    width1, height1 = img1.size
+    width2, height2 = img2.size
+    new_width = max(width1, width2)
+    new_height = height1 + height2
+    new_image = Image.new('RGB', (new_width, new_height))
+    new_image.paste(img1, (0, 0))
+    new_image.paste(img2, (0, height1))
+    return new_image
```
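The stacking math is easy to verify: the canvas is as wide as the wider input and as tall as both inputs combined, with `img2` pasted directly below `img1`. A quick worked example:

```python
from PIL import Image

img1 = Image.new('RGB', (200, 100), 'white')
img2 = Image.new('RGB', (120, 80), 'gray')
out = concat_img(img1, img2)
assert out.size == (200, 180)  # max(200, 120) wide, 100 + 80 tall
# img2 sits at (0, 100); the 80x80 strip to its right stays black,
# the default fill of Image.new
```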
```diff
+
+
+def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
+    if not sections:
+        return [], []  # keep the (chunks, images) return contract even when empty
+
+    cks = [""]
+    images = [None]
+    tk_nums = [0]
+
+    def add_chunk(t, image, pos=""):
+        nonlocal cks, tk_nums, delimiter
+        tnum = num_tokens_from_string(t)
+        if tnum < 8:
+            pos = ""
+        if tk_nums[-1] > chunk_token_num:
+            if t.find(pos) < 0:
+                t += pos
+            cks.append(t)
+            images.append(image)
+            tk_nums.append(tnum)
+        else:
+            if cks[-1].find(pos) < 0:
+                t += pos
+            cks[-1] += t
+            images[-1] = concat_img(images[-1], image)
+            tk_nums[-1] += tnum
+
+    for sec, image in sections:
+        add_chunk(sec, image, '')
+
+    return cks, images
```
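`naive_merge_docx` mirrors `naive_merge`, but keeps an `images` list parallel to `cks`: a section either opens a new chunk, taking its image with it, or is appended to the current chunk, in which case its image is stitched onto the chunk's running image via `concat_img`. A minimal sketch, with `fig_img` standing for any PIL image:

```python
sections = [("Intro paragraph.", None),
            ("Figure discussion.", fig_img),
            ("Closing remarks.", None)]
cks, images = naive_merge_docx(sections, chunk_token_num=16)
assert len(cks) == len(images)  # one image slot per chunk
# sections merge into the running chunk until its token count exceeds
# chunk_token_num; only then does the next section start a new chunk
```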