* add use layout or not option
* trivial
tags/v0.1.0
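The new option is read from each document's parser_config: when layout_recognize is false, the chunkers below fall back to a plain text parser instead of the layout-analysis pipeline. A minimal sketch of how the flag is consulted (the config dict is a hypothetical example, not taken from this change):

# Hypothetical parser_config carried by a document; only layout_recognize is new here.
parser_config = {"layout_recognize": False, "chunk_token_num": 128}

# Pattern used throughout the chunkers in this change:
#   pdf_parser = Pdf() if kwargs.get("parser_config", {}).get("layout_recognize", True) else PlainParser()
use_layout = parser_config.get("layout_recognize", True)
print("layout pipeline" if use_layout else "plain text fallback")  # -> plain text fallback
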
for _ in range(len(questions) // 2):
    questions.append(questions[-1])
if "knowledge" not in [p["key"] for p in prompt_config["parameters"]]:
    kbinfos = {"total": 0, "chunks": [], "doc_aggs": []}
else:
    kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
                                    dialog.similarity_threshold,
                                    dialog.vector_similarity_weight, top=1024, aggs=False)
knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]

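The effect of this hunk is that retrieval is skipped entirely when the dialog's prompt template declares no "knowledge" parameter. A standalone sketch of that decision, with a hypothetical prompt_config shape (only the "parameters" key is taken from the snippet above):

# Hypothetical prompt configuration for a dialog.
prompt_config = {
    "system": "You are a helpful assistant. {knowledge}",
    "parameters": [{"key": "knowledge", "optional": False}],
}

def needs_retrieval(prompt_config) -> bool:
    # Retrieval is only useful if the template can actually consume it.
    return "knowledge" in [p["key"] for p in prompt_config["parameters"]]

empty_kbinfos = {"total": 0, "chunks": [], "doc_aggs": []}
print(needs_retrieval(prompt_config))  # True -> call retrievaler.retrieval(...); False -> use empty_kbinfos
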
    if not e:
        return get_data_error_result(retmsg="Document not found!")
    if doc.parser_id.lower() == req["parser_id"].lower():
        if "parser_config" in req:
            if req["parser_config"] == doc.parser_config:
                return get_json_result(data=True)
        else:
            return get_json_result(data=True)

    if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
        return get_data_error_result(retmsg="Not supported yet!")

        {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0"})
    if not e:
        return get_data_error_result(retmsg="Document not found!")
    if "parser_config" in req:
        DocumentService.update_parser_config(doc.id, req["parser_config"])
    if doc.token_num > 0:
        e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
                                                doc.process_duation * -1)

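For context, a request to this change-parser endpoint can now carry an optional parser_config alongside parser_id, which is merged into the stored config via update_parser_config. A hypothetical payload (field names follow the handler above; the id and config values are made up):

# Hypothetical request body for the change-parser endpoint sketched above.
req = {
    "doc_id": "b1f0c2d4e5",          # made-up document id
    "parser_id": "naive",            # target chunking template
    "parser_config": {               # optional; merged, not replaced
        "layout_recognize": False,   # the new "use layout or not" switch
        "chunk_token_num": 128,
    },
}
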
drop table llm_factories;
update tenant_llm set llm_factory='Tongyi-Qianwen' where llm_factory='通义千问';
update tenant_llm set llm_factory='ZHIPU-AI' where llm_factory='智谱AI';
update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
alter table knowledgebase modify avatar longtext;
alter table user modify avatar longtext;
alter table dialog modify icon longtext;

if __name__ == '__main__':
    init_web_db()
    add_tenant_llm()
    init_web_data()

        if not docs:
            return
        return docs[0]["tenant_id"]

    @classmethod
    @DB.connection_context()
    def get_thumbnails(cls, docids):
        fields = [cls.model.id, cls.model.thumbnail]
        return list(cls.model.select(*fields).where(cls.model.id.in_(docids)).dicts())

    @classmethod
    @DB.connection_context()
    def update_parser_config(cls, id, config):
        e, d = cls.get_by_id(id)
        if not e:
            raise LookupError(f"Document({id}) not found.")

        def dfs_update(old, new):
            for k, v in new.items():
                if k not in old:
                    old[k] = v
                    continue
                if isinstance(v, dict):
                    assert isinstance(old[k], dict)
                    dfs_update(old[k], v)
                else:
                    old[k] = v

        dfs_update(d.parser_config, config)
        cls.update_by_id(id, {"parser_config": d.parser_config})

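dfs_update merges the incoming config into the stored one recursively, so nested keys that the request does not mention survive. A self-contained illustration of that behaviour (the dictionaries, including the nested "pdf" key, are made up for the example):

# Stored parser_config and an incoming partial update (values are hypothetical).
old = {"chunk_token_num": 128, "pdf": {"layout_recognize": True, "task_page_size": 12}}
new = {"pdf": {"layout_recognize": False}}

def dfs_update(old, new):
    # Same recursive merge as DocumentService.update_parser_config above.
    for k, v in new.items():
        if k not in old:
            old[k] = v
            continue
        if isinstance(v, dict):
            assert isinstance(old[k], dict)
            dfs_update(old[k], v)
        else:
            old[k] = v

dfs_update(old, new)
print(old)  # {'chunk_token_num': 128, 'pdf': {'layout_recognize': False, 'task_page_size': 12}}
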
IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]

API_KEY = LLM.get("api_key", "")
PARSERS = LLM.get("parsers", "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One")

# distribution
DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)

from .pdf_parser import HuParser as PdfParser, PlainParser
from .docx_parser import HuDocxParser as DocxParser
from .excel_parser import HuExcelParser as ExcelParser
from .ppt_parser import HuPptParser as PptParser

        return poss


class PlainParser(object):
    def __call__(self, filename, **kwargs):
        self.outlines = []
        lines = []
        try:
            self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename))
            outlines = self.pdf.outline
            for page in self.pdf.pages:
                lines.extend([t for t in page.extract_text().split("\n")])

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning(f"Miss outlines")
        return [(l, "") for l in lines], []

    def crop(self, ck, need_position):
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        raise NotImplementedError


if __name__ == "__main__":
    pass

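PlainParser is the fallback used when layout recognition is switched off: it only pulls raw text and the outline out of the PDF with PyPDF2 and returns (line, "") pairs plus an empty table list. A minimal usage sketch, assuming the package is installed and the file path (made up here) exists:

from deepdoc.parser import PlainParser  # exported via the __init__ change above

parser = PlainParser()
sections, tables = parser("sample.pdf")   # hypothetical local file
# sections -> [("first text line", ""), ...]; tables is always []
# crop() and remove_tag() raise NotImplementedError, so callers must not rely on positions.
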
#
import copy
import re
from io import BytesIO
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, tokenize_chunks
from rag.nlp import huqie
from deepdoc.parser import PdfParser, DocxParser, PlainParser


class Pdf(PdfParser):

        sections, tbls = doc_parser(binary if binary else filename, from_page=from_page, to_page=to_page)
        remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
        callback(0.8, "Finish parsing.")
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf() if kwargs.get("parser_config", {}).get("layout_recognize", True) else PlainParser()
        sections, tbls = pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback)
    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""

        sections = [(l, "") for l in sections if l]
        remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
        callback(0.8, "Finish parsing.")
    else:
        raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

    make_colon_as_title(sections)
    bull = bullets_category([t for t in random_choices([t for t, _ in sections], k=100)])
    if bull >= 0:
        chunks = ["\n".join(ck) for ck in hierarchical_merge(bull, sections, 3)]
    else:
        sections = [s.split("@") for s, _ in sections]
        sections = [(pr[0], "@" + pr[1]) for pr in sections if len(pr) == 2]
        chunks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;!?"))

    # is it English
    eng = lang.lower() == "english"  # is_english(random_choices([t for t, _ in sections], k=218))

    res = tokenize_table(tbls, doc, eng)
    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
    return res

from io import BytesIO
from docx import Document
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
    make_colon_as_title, add_positions, tokenize_chunks
from rag.nlp import huqie
from deepdoc.parser import PdfParser, DocxParser, PlainParser
from rag.settings import cron_logger

        callback(0.8, "Text extraction finished")
        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]


def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):

        for txt in Docx()(filename, binary):
            sections.append(txt)
        callback(0.8, "Finish parsing.")
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf() if kwargs.get("parser_config", {}).get("layout_recognize", True) else PlainParser()
        for txt, poss in pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback):
            sections.append(txt + poss)
    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""

    make_colon_as_title(sections)
    bull = bullets_category(sections)
    chunks = hierarchical_merge(bull, sections, 3)
    if not chunks:
        callback(0.99, "No chunk parsed out.")

    return tokenize_chunks(["\n".join(ck) for ck in chunks], doc, eng, pdf_parser)


if __name__ == "__main__":

import re
from api.db import ParserType
from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from deepdoc.parser import PdfParser, PlainParser
from rag.utils import num_tokens_from_string

            # print(b)
        print("OCR:", timer() - start)

        self._layouts_rec(zoomin)
        callback(0.65, "Layout analysis finished.")

        for b in self.boxes:
            b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())

        return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]

        # set pivot using the most frequent type of title,
        # then merge between 2 pivot
        if len(self.boxes) > 0 and len(self.outlines) / len(self.boxes) > 0.1:

    pdf_parser = None
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf() if kwargs.get("parser_config", {}).get("layout_recognize", True) else PlainParser()
        sections, tbls = pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback)
        if sections and len(sections[0]) < 3:
            cks = [(t, l, [0] * 5) for t, l in sections]
    else:
        raise NotImplementedError("file type not supported yet(pdf supported)")

    doc = {
        "docnm_kwd": filename

    # is it English
    eng = lang.lower() == "english"  # pdf_parser.is_english

    # set pivot using the most frequent type of title,
    # then merge between 2 pivot
    if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
        max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
        most_level = max(0, max_lvl - 1)
        levels = []
        for txt, _, _ in sections:
            for t, lvl in pdf_parser.outlines:
                tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
                tks_ = set([txt[i] + txt[i + 1] for i in range(min(len(t), len(txt) - 1))])
                if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
                    levels.append(lvl)
                    break
            else:
                levels.append(max_lvl + 1)
    else:
        bull = bullets_category([txt for txt, _, _ in sections])
        most_level, levels = title_frequency(bull, [(txt, l) for txt, l, poss in sections])

    assert len(sections) == len(levels)
    sec_ids = []
    sid = 0
    for i, lvl in enumerate(levels):
        if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
            sid += 1
        sec_ids.append(sid)
        # print(lvl, self.boxes[i]["text"], most_level, sid)

    sections = [(txt, sec_ids[i], poss) for i, (txt, _, poss) in enumerate(sections)]
    for (img, rows), poss in tbls:
        sections.append((rows if isinstance(rows, str) else rows[0], -1,
                         [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))

    def tag(pn, left, right, top, bottom):
        if pn + left + right + top + bottom == 0:
            return ""
        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format(pn, left, right, top, bottom)

    chunks = []
    last_sid = -2
    tk_cnt = 0
    for txt, sec_id, poss in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1])):
        poss = "\t".join([tag(*pos) for pos in poss])
        if tk_cnt < 2048 and (sec_id == last_sid or sec_id == -1):
            if chunks:
                chunks[-1] += "\n" + txt + poss
                tk_cnt += num_tokens_from_string(txt)
                continue
        chunks.append(txt + poss)
        tk_cnt = num_tokens_from_string(txt)
        if sec_id > -1:
            last_sid = sec_id

    res = tokenize_table(tbls, doc, eng)
    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
    return res

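The outline matching above compares character bigrams of each section line against each outline title to decide its level. A standalone illustration of that similarity test (the strings are made up; the 0.8 threshold comes from the code above):

def bigram_overlap(t: str, txt: str) -> float:
    # Same bigram-set comparison used when assigning outline levels above.
    tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
    tks_ = set([txt[i] + txt[i + 1] for i in range(min(len(t), len(txt) - 1))])
    return len(tks & tks_) / max([len(tks), len(tks_), 1])

print(bigram_overlap("2.1 Installation Guide", "2.1 Installation Guide "))  # high overlap, above 0.8 -> same title
print(bigram_overlap("2.1 Installation Guide", "Troubleshooting"))          # near zero -> no match
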
#
import copy
import re
from deepdoc.parser.pdf_parser import PlainParser
from rag.app import laws
from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks
from deepdoc.parser import PdfParser, ExcelParser
from rag.settings import cron_logger

    """
    eng = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))

        for txt in laws.Docx()(filename, binary):
            sections.append((txt, ""))
        callback(0.8, "Finish parsing.")
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf() if parser_config["layout_recognize"] else PlainParser()
        sections, tbls = pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback)
        res = tokenize_table(tbls, doc, eng)
    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = ExcelParser()
        sections = [(excel_parser.html(binary), "")]
    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""

        sections = txt.split("\n")
        sections = [(l, "") for l in sections if l]
        callback(0.8, "Finish parsing.")
    else:
        raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

    chunks = naive_merge(sections, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?。;!?"))
    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
    return res

import re
from rag.app import laws
from rag.nlp import huqie, tokenize
from deepdoc.parser import PdfParser, ExcelParser, PlainParser


class Pdf(PdfParser):

        for (img, rows), poss in tbls:
            sections.append((rows if isinstance(rows, str) else rows[0],
                             [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
        return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))]


def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
    sections = []
    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = [txt for txt in laws.Docx()(filename, binary) if txt]
        callback(0.8, "Finish parsing.")
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf() if kwargs.get("parser_config", {}).get("layout_recognize", True) else PlainParser()
        sections = pdf_parser(filename if not binary else binary, to_page=to_page, callback=callback)
        sections = [s for s, _ in sections if s]
    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = ExcelParser()
        sections = [excel_parser.html(binary)]
    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""

            if not l:
                break
            txt += l
        sections = txt.split("\n")
        sections = [s for s in sections if s]
        callback(0.8, "Finish parsing.")
    else:
        raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

from collections import Counter
from api.db import ParserType
from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from deepdoc.parser import PdfParser, PlainParser
import numpy as np
from rag.utils import num_tokens_from_string

        self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
        for b in self.boxes:
            b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
        # freq = Counter([b["text"] for b in self.boxes])
        # garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
        # i = 0
        # while i < len(self.boxes):
        #     if self.boxes[i]["text"] in garbage \
        #             or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
        #             or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
        #         self.boxes.pop(i)
        #     elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", '1'):
        #         # merge within same layouts
        #         self.boxes[i + 1]["top"] = self.boxes[i]["top"]
        #         self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
        #         self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
        #         self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
        #         self.boxes.pop(i)
        #     else:
        #         i += 1


def _begin(txt):
    return re.match(

    """
    pdf_parser = None
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        if not kwargs.get("parser_config", {}).get("layout_recognize", True):
            pdf_parser = PlainParser()
            paper = {
                "title": filename,
                "authors": " ",
                "abstract": "",
                "sections": pdf_parser(filename if not binary else binary),
                "tables": []
            }
        else:
            pdf_parser = Pdf()
            paper = pdf_parser(filename if not binary else binary,
                               from_page=from_page, to_page=to_page, callback=callback)
    else:
        raise NotImplementedError("file type not supported yet(pdf supported)")

    doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),

            continue
        chunks.append(txt)
        last_sid = sec_id

    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
    return res

"""
    readed = [0] * len(paper["lines"])
    # find colon firstly
    i = 0

        print(d)
        # d["image"].save(f"./logs/{i}.jpg")
    return res
"""

if __name__ == "__main__":
    import sys

from rag.nlp import tokenize, is_english
from rag.nlp import huqie
from deepdoc.parser import PdfParser, PptParser, PlainParser
from PyPDF2 import PdfReader as pdf2_read


class Ppt(PptParser):

        callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)))
        assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
        res = []
        #################### More precisely ###################
        # self._layouts_rec(zoomin)
        # self._text_merge()
        # pages = {}
        # for b in self.boxes:
        #     if self.__garbage(b["text"]): continue
        #     if b["page_number"] not in pages: pages[b["page_number"]] = []
        #     pages[b["page_number"]].append(b["text"])
        # for i, lines in pages.items():
        #     res.append(("\n".join(lines), self.page_images[i-1]))
        # return res
        ########################################
        for i in range(len(self.boxes)):
            lines = "\n".join([b["text"] for b in self.boxes[i] if not self.__garbage(b["text"])])
            res.append((lines, self.page_images[i]))
        return res


class PlainPdf(PlainParser):
    def __call__(self, filename, binary=None, callback=None, **kwargs):
        self.pdf = pdf2_read(filename if not binary else BytesIO(filename))
        page_txt = []
        for page in self.pdf.pages:
            page_txt.append(page.extract_text())
        callback(0.9, "Parsing finished")
        return [(txt, None) for txt in page_txt]


def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
    """
    The supported file formats are pdf, pptx.

            res.append(d)
        return res
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf() if kwargs.get("parser_config", {}).get("layout_recognize", True) else PlainPdf()
        for pn, (txt, img) in enumerate(pdf_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)):
            d = copy.deepcopy(doc)
            pn += from_page
            if img:
                d["image"] = img
            d["page_num_int"] = [pn + 1]
            d["top_int"] = [0]
            d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
            tokenize(d, txt, eng)
            res.append(d)
        return res

| d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) | d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) | ||||
| def tokenize_chunks(chunks, doc, eng, pdf_parser): | |||||
| res = [] | |||||
| # wrap up as es documents | |||||
| for ck in chunks: | |||||
| if len(ck.strip()) == 0:continue | |||||
| print("--", ck) | |||||
| d = copy.deepcopy(doc) | |||||
| if pdf_parser: | |||||
| try: | |||||
| d["image"], poss = pdf_parser.crop(ck, need_position=True) | |||||
| add_positions(d, poss) | |||||
| ck = pdf_parser.remove_tag(ck) | |||||
| except NotImplementedError as e: | |||||
| pass | |||||
| tokenize(d, ck, eng) | |||||
| res.append(d) | |||||
| return res | |||||
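tokenize_chunks centralizes the per-chunk wrap-up that the individual apps used to repeat; when the parser is a PlainParser, crop()/remove_tag() raise NotImplementedError and the chunk is indexed without an image or position. A rough usage sketch, assuming the RAGFlow package is importable (the doc fields and chunk text are made up):

from rag.nlp import tokenize_chunks

# Hypothetical inputs; in the apps above, `doc` also carries title_tks and similar fields.
doc = {"docnm_kwd": "sample.pdf"}
chunks = ["Chapter 1\nSome body text ...", "Chapter 2\nMore text ..."]

# With pdf_parser=None (or a PlainParser), no image/position fields are added.
docs = tokenize_chunks(chunks, doc, True, None)
print(len(docs))  # 2 ES-ready documents, each with content_ltks/content_sm_ltks set
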
def tokenize_table(tbls, doc, eng, batch_size=10):
    res = []
    # add tables

    def qieqie(self, tks):
        tks = tks.split(" ")
        zh_num = len([1 for c in tks if c and is_chinese(c[0])])
        if zh_num < len(tks) * 0.2:
            res = []
            for tk in tks:
                res.extend(tk.split("/"))
            return " ".join(res)

        res = []
        for tk in tks:

        s = Search()
        pg = int(req.get("page", 1)) - 1
        ps = int(req.get("size", 1000))
        topk = int(req.get("topk", 1024))
        src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id",
                                 "image_id", "doc_id", "q_512_vec", "q_768_vec", "position_int",
                                 "q_1024_vec", "q_1536_vec", "available_int", "content_with_weight"])

            assert emb_mdl, "No embedding model selected"
            s["knn"] = self._vector(
                qst, emb_mdl, req.get(
                    "similarity", 0.1), topk)
            s["knn"]["filter"] = bqry.to_dict()
            if "highlight" in s:
                del s["highlight"]

        ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
        if not question:
            return ranks
        req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": page_size,
               "question": question, "vector": True, "topk": top,
               "similarity": similarity_threshold}
        sres = self.search(req, index_name(tenant_id), embd_mdl)

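After this change the search request separates the page size (how many chunks come back) from the KNN candidate pool (topk). A hypothetical request mirroring the snippet above (ids, question, and values are made up):

# Hypothetical retrieval request: return one page of 6 chunks, while the vector
# search considers up to 1024 nearest candidates before scores are blended.
req = {
    "kb_ids": ["kb-123"],    # made-up knowledge base id
    "doc_ids": [],
    "size": 6,               # page_size -> chunks returned per page
    "topk": 1024,            # top -> KNN candidate pool (defaults to 1024)
    "question": "How do I disable layout recognition?",
    "vector": True,
    "similarity": 0.2,
}
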
    tsks = []
    if r["type"] == FileType.PDF.value:
        if not r["parser_config"].get("layout_recognize", True):
            tsks.append(new_task())
            continue
        pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
        page_size = r["parser_config"].get("task_page_size", 12)
        if r["parser_id"] == "paper":
            page_size = r["parser_config"].get("task_page_size", 22)
        if r["parser_id"] == "one":
            page_size = 1000000000
        for s, e in r["parser_config"].get("pages", [(1, 100000)]):
            s -= 1
            e = min(e, pages)
            for p in range(s, e, page_size):
                task = new_task()

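With these defaults a PDF is split into page-range tasks of task_page_size pages, and turning layout recognition off collapses the whole document into a single task. A small sketch of the range arithmetic (the page count is made up, and each printed tuple is a hypothetical (from_page, to_page) pair rather than the exact task fields):

# Illustration of the splitting above: a made-up 30-page PDF with the default
# pages setting (1, 100000), which is 1-based and converted with s -= 1.
pages, page_size = 30, 12
for s, e in [(1, 100000)]:
    s -= 1
    e = min(e, pages)
    print([(p, min(e, p + page_size)) for p in range(s, e, page_size)])
# -> [(0, 12), (12, 24), (24, 30)]: three tasks of at most 12 pages each.
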