| 
                        123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402 | 
                        - #  Licensed under the Apache License, Version 2.0 (the "License");
 - #  you may not use this file except in compliance with the License.
 - #  You may obtain a copy of the License at
 - #
 - #      http://www.apache.org/licenses/LICENSE-2.0
 - #
 - #  Unless required by applicable law or agreed to in writing, software
 - #  distributed under the License is distributed on an "AS IS" BASIS,
 - #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 - #  See the License for the specific language governing permissions and
 - #  limitations under the License.
 - #
 - import re
 - from copy import deepcopy
 - from io import BytesIO
 - from timeit import default_timer as timer
 - from nltk import word_tokenize
 - from openpyxl import load_workbook
 - from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
 - from rag.nlp import rag_tokenizer, tokenize_table
 - from rag.settings import cron_logger
 - from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 - from docx import Document
 - from PIL import Image
 - class Excel(ExcelParser):
 -     def __call__(self, fnm, binary=None, callback=None):
 -         if not binary:
 -             wb = load_workbook(fnm)
 -         else:
 -             wb = load_workbook(BytesIO(binary))
 -         total = 0
 -         for sheetname in wb.sheetnames:
 -             total += len(list(wb[sheetname].rows))
 - 
 -         res, fails = [], []
 -         for sheetname in wb.sheetnames:
 -             ws = wb[sheetname]
 -             rows = list(ws.rows)
 -             for i, r in enumerate(rows):
 -                 q, a = "", ""
 -                 for cell in r:
 -                     if not cell.value:
 -                         continue
 -                     if not q:
 -                         q = str(cell.value)
 -                     elif not a:
 -                         a = str(cell.value)
 -                     else:
 -                         break
 -                 if q and a:
 -                     res.append((q, a))
 -                 else:
 -                     fails.append(str(i + 1))
 -                 if len(res) % 999 == 0:
 -                     callback(len(res) *
 -                              0.6 /
 -                              total, ("Extract Q&A: {}".format(len(res)) +
 -                                      (f"{len(fails)} failure, line: %s..." %
 -                                       (",".join(fails[:3])) if fails else "")))
 - 
 -         callback(0.6, ("Extract Q&A: {}. ".format(len(res)) + (
 -             f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
 -         self.is_english = is_english(
 -             [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
 -         return res
 - 
class Pdf(PdfParser):
    """Parse a PDF into Q&A pairs via OCR and layout analysis.

    Questions are recognized by "bullet" patterns (numbering such as
    "1.", "Q1:", ...) detected by qbullets_category(); all text up to
    the next bullet becomes the answer.
    """

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        """Return (qai_list, tbls).

        qai_list holds (question, answer, *crop_result) tuples, where
        crop_result comes from self.crop(..., need_position=True);
        tbls holds tables/figures from _extract_table_figure().
        Raises ValueError when no Q&A bullet structure is recognized.
        """
        start = timer()
        callback(msg="OCR is running...")
        # Render pages and run OCR (inherited from PdfParser).
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback
        )
        callback(msg="OCR finished")
        cron_logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
        start = timer()
        self._layouts_rec(zoomin, drop=False)
        callback(0.63, "Layout analysis finished.")
        self._table_transformer_job(zoomin)
        callback(0.65, "Table analysis finished.")
        self._text_merge()
        callback(0.67, "Text merging finished")
        tbls = self._extract_table_figure(True, zoomin, True, True)
        #self._naive_vertical_merge()
        # self._concat_downward()
        #self._filter_forpages()
        cron_logger.info("layouts: {}".format(timer() - start))
        sections = [b["text"] for b in self.boxes]
        bull_x0_list = []
        # Detect which bullet style marks questions; -1 means none found.
        q_bull, reg = qbullets_category(sections)
        if q_bull == -1:
            raise ValueError("Unable to recognize Q&A structure.")
        qai_list = []
        last_q, last_a, last_tag = '', '', ''
        last_index = -1
        last_box = {'text':''}
        last_bull = None
        for box in self.boxes:
            section, line_tag = box['text'], self._line_tag(box, zoomin)
            # has_bull is truthy (a regex match object) when this box
            # starts a new question.
            has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
            last_box, last_index, last_bull = box, index, has_bull
            if not has_bull:  # No question bullet
                if not last_q:
                    # Text appearing before the first question is discarded.
                    continue
                else:
                    # Continuation of the current answer.
                    last_a = f'{last_a}{section}'
                    last_tag = f'{last_tag}{line_tag}'
            else:
                # Flush the previous Q&A pair before starting a new one.
                if last_q:
                    qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
                    last_q, last_a, last_tag = '', '', ''
                last_q = has_bull.group()
                _, end = has_bull.span()
                # Text after the bullet on the same line starts the answer.
                last_a = section[end:]
                last_tag = line_tag
        # Flush the trailing pair.
        if last_q:
            qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
        return qai_list, tbls
 - class Docx(DocxParser):
 -     def __init__(self):
 -         pass
 -     def get_picture(self, document, paragraph):
 -         img = paragraph._element.xpath('.//pic:pic')
 -         if not img:
 -             return None
 -         img = img[0]
 -         embed = img.xpath('.//a:blip/@r:embed')[0]
 -         related_part = document.part.related_parts[embed]
 -         image = related_part.image
 -         image = Image.open(BytesIO(image.blob))
 -         return image
 -     def concat_img(self, img1, img2):
 -         if img1 and not img2:
 -             return img1
 -         if not img1 and img2:
 -             return img2
 -         if not img1 and not img2:
 -             return None
 -         width1, height1 = img1.size
 -         width2, height2 = img2.size
 - 
 -         new_width = max(width1, width2)
 -         new_height = height1 + height2
 -         new_image = Image.new('RGB', (new_width, new_height))
 - 
 -         new_image.paste(img1, (0, 0))
 -         new_image.paste(img2, (0, height1))
 - 
 -         return new_image
 - 
 -     def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
 -         self.doc = Document(
 -             filename) if not binary else Document(BytesIO(binary))
 -         pn = 0
 -         last_answer, last_image = "", None
 -         question_stack, level_stack = [], []
 -         qai_list = []
 -         for p in self.doc.paragraphs:
 -             if pn > to_page:
 -                 break
 -             question_level, p_text = 0, ''
 -             if from_page <= pn < to_page and p.text.strip():
 -                 question_level, p_text = docx_question_level(p)
 -             if not question_level or question_level > 6: # not a question
 -                 last_answer = f'{last_answer}\n{p_text}'
 -                 current_image = self.get_picture(self.doc, p)
 -                 last_image = self.concat_img(last_image, current_image)
 -             else:   # is a question
 -                 if last_answer or last_image:
 -                     sum_question = '\n'.join(question_stack)
 -                     if sum_question:
 -                         qai_list.append((sum_question, last_answer, last_image))
 -                     last_answer, last_image = '', None
 - 
 -                 i = question_level
 -                 while question_stack and i <= level_stack[-1]:
 -                     question_stack.pop()
 -                     level_stack.pop()
 -                 question_stack.append(p_text)
 -                 level_stack.append(question_level)
 -             for run in p.runs:
 -                 if 'lastRenderedPageBreak' in run._element.xml:
 -                     pn += 1
 -                     continue
 -                 if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
 -                     pn += 1
 -         if last_answer:
 -             sum_question = '\n'.join(question_stack)
 -             if sum_question:
 -                 qai_list.append((sum_question, last_answer, last_image))
 -                 
 -         tbls = []
 -         for tb in self.doc.tables:
 -             html= "<table>"
 -             for r in tb.rows:
 -                 html += "<tr>"
 -                 i = 0
 -                 while i < len(r.cells):
 -                     span = 1
 -                     c = r.cells[i]
 -                     for j in range(i+1, len(r.cells)):
 -                         if c.text == r.cells[j].text:
 -                             span += 1
 -                             i = j
 -                     i += 1
 -                     html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
 -                 html += "</tr>"
 -             html += "</table>"
 -             tbls.append(((None, html), ""))
 -         return qai_list, tbls
 - 
 - def rmPrefix(txt):
 -     return re.sub(
 -         r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)
 - 
 - 
 - def beAdocPdf(d, q, a, eng, image, poss):
 -     qprefix = "Question: " if eng else "问题:"
 -     aprefix = "Answer: " if eng else "回答:"
 -     d["content_with_weight"] = "\t".join(
 -         [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
 -     d["content_ltks"] = rag_tokenizer.tokenize(q)
 -     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
 -     d["image"] = image
 -     add_positions(d, poss)
 -     return d
 - 
 - def beAdocDocx(d, q, a, eng, image):
 -     qprefix = "Question: " if eng else "问题:"
 -     aprefix = "Answer: " if eng else "回答:"
 -     d["content_with_weight"] = "\t".join(
 -         [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
 -     d["content_ltks"] = rag_tokenizer.tokenize(q)
 -     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
 -     d["image"] = image
 -     return d
 - 
 - def beAdoc(d, q, a, eng):
 -     qprefix = "Question: " if eng else "问题:"
 -     aprefix = "Answer: " if eng else "回答:"
 -     d["content_with_weight"] = "\t".join(
 -         [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
 -     d["content_ltks"] = rag_tokenizer.tokenize(q)
 -     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
 -     return d
 - 
 - 
 - def mdQuestionLevel(s):
 -     match = re.match(r'#*', s)
 -     return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
 - 
 - def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
 -     """
 -         Excel and csv(txt) format files are supported.
 -         If the file is in excel format, there should be 2 column question and answer without header.
 -         And question column is ahead of answer column.
 -         And it's O.K if it has multiple sheets as long as the columns are rightly composed.
 - 
 -         If it's in csv format, it should be UTF-8 encoded. Use TAB as delimiter to separate question and answer.
 - 
 -         All the deformed lines will be ignored.
 -         Every pair of Q&A will be treated as a chunk.
 -     """
 -     eng = lang.lower() == "english"
 -     res = []
 -     doc = {
 -         "docnm_kwd": filename,
 -         "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
 -     }
 -     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
 -         callback(0.1, "Start to parse.")
 -         excel_parser = Excel()
 -         for q, a in excel_parser(filename, binary, callback):
 -             res.append(beAdoc(deepcopy(doc), q, a, eng))
 -         return res
 -     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
 -         callback(0.1, "Start to parse.")
 -         txt = ""
 -         if binary:
 -             encoding = find_codec(binary)
 -             txt = binary.decode(encoding, errors="ignore")
 -         else:
 -             with open(filename, "r") as f:
 -                 while True:
 -                     l = f.readline()
 -                     if not l:
 -                         break
 -                     txt += l
 -         lines = txt.split("\n")
 -         comma, tab = 0, 0
 -         for l in lines:
 -             if len(l.split(",")) == 2: comma += 1
 -             if len(l.split("\t")) == 2: tab += 1
 -         delimiter = "\t" if tab >= comma else ","
 - 
 -         fails = []
 -         question, answer = "", ""
 -         i = 0
 -         while i < len(lines):
 -             arr = lines[i].split(delimiter)
 -             if len(arr) != 2:
 -                 if question: answer += "\n" + lines[i]
 -                 else:
 -                     fails.append(str(i+1))
 -             elif len(arr) == 2:
 -                 if question and answer: res.append(beAdoc(deepcopy(doc), question, answer, eng))
 -                 question, answer = arr
 -             i += 1
 -             if len(res) % 999 == 0:
 -                 callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
 -                     f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
 - 
 -         if question: res.append(beAdoc(deepcopy(doc), question, answer, eng))
 - 
 -         callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
 -             f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
 - 
 -         return res
 -     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
 -         callback(0.1, "Start to parse.")
 -         pdf_parser = Pdf()
 -         count = 0
 -         qai_list, tbls = pdf_parser(filename if not binary else binary,
 -                                     from_page=0, to_page=10000, callback=callback)
 -         
 -         res = tokenize_table(tbls, doc, eng)
 - 
 -         for q, a, image, poss in qai_list:
 -             count += 1
 -             res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
 -         return res
 -     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
 -         callback(0.1, "Start to parse.")
 -         txt = ""
 -         if binary:
 -             encoding = find_codec(binary)
 -             txt = binary.decode(encoding, errors="ignore")
 -         else:
 -             with open(filename, "r") as f:
 -                 while True:
 -                     l = f.readline()
 -                     if not l:
 -                         break
 -                     txt += l
 -         lines = txt.split("\n")
 -         last_question, last_answer = "", ""
 -         question_stack, level_stack = [], []
 -         code_block = False
 -         level_index = [-1] * 7
 -         for index, l in enumerate(lines):
 -             if not l.strip():
 -                 continue
 -             if l.strip().startswith('```'):
 -                 code_block = not code_block
 -             question_level, question = 0, ''
 -             if not code_block:
 -                 question_level, question = mdQuestionLevel(l)
 - 
 -             if not question_level or question_level > 6: # not a question
 -                 last_answer = f'{last_answer}\n{l}'
 -             else:   # is a question
 -                 if last_answer:
 -                     sum_question = '\n'.join(question_stack)
 -                     if sum_question:
 -                         res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
 -                     last_answer = ''
 - 
 -                 i = question_level
 -                 while question_stack and i <= level_stack[-1]:
 -                     question_stack.pop()
 -                     level_stack.pop()
 -                 question_stack.append(question)
 -                 level_stack.append(question_level)
 -         if last_answer:
 -             sum_question = '\n'.join(question_stack)
 -             if sum_question:
 -                 res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
 -         return res
 -     elif re.search(r"\.docx$", filename, re.IGNORECASE):
 -         docx_parser = Docx()
 -         qai_list, tbls = docx_parser(filename, binary,
 -                                     from_page=0, to_page=10000, callback=callback)
 -         res = tokenize_table(tbls, doc, eng)
 -         for q, a, image in qai_list:
 -             res.append(beAdocDocx(deepcopy(doc), q, a, eng, image))
 -         return res
 - 
 -     raise NotImplementedError(
 -         "Excel, csv(txt), pdf, markdown and docx format files are supported.")
 - 
 - 
 - if __name__ == "__main__":
 -     import sys
 - 
 -     def dummy(prog=None, msg=""):
 -         pass
 -     chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
 
 
  |