| @@ -5,6 +5,27 @@ from io import BytesIO | |||
| class HuExcelParser: | |||
| def html(self, fnm): | |||
| if isinstance(fnm, str): | |||
| wb = load_workbook(fnm) | |||
| else: | |||
| wb = load_workbook(BytesIO(fnm)) | |||
| tb = "" | |||
| for sheetname in wb.sheetnames: | |||
| ws = wb[sheetname] | |||
| rows = list(ws.rows) | |||
| tb += f"<table><caption>{sheetname}</caption><tr>" | |||
| for t in list(rows[0]): tb += f"<th>{t.value}</th>" | |||
| tb += "</tr>" | |||
| for r in list(rows[1:]): | |||
| tb += "<tr>" | |||
| for i,c in enumerate(r): | |||
| if c.value is None: tb += "<td></td>" | |||
| else: tb += f"<td>{c.value}</td>" | |||
| tb += "</tr>" | |||
| tb += "</table>\n" | |||
| return tb | |||
| def __call__(self, fnm): | |||
| if isinstance(fnm, str): | |||
| wb = load_workbook(fnm) | |||
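The new html() method renders every worksheet as one HTML table, with the sheet name as the caption, the first row as header cells and the remaining rows as data cells. A minimal usage sketch (the file name is made up; it assumes the class is exported from deepdoc.parser as ExcelParser, which is how the naive chunker later in this patch imports it):

from deepdoc.parser import ExcelParser   # HuExcelParser re-exported, as used elsewhere in this patch

parser = ExcelParser()

# From a path on disk...
html_tables = parser.html("sales.xlsx")

# ...or from raw bytes (e.g. a MinIO object or an upload buffer), which the
# isinstance(fnm, str) check routes through BytesIO.
with open("sales.xlsx", "rb") as f:
    html_tables = parser.html(f.read())

# html_tables is a string such as:
# <table><caption>Sheet1</caption><tr><th>name</th><th>price</th></tr><tr><td>...</td></tr></table>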
| @@ -17,7 +17,6 @@ from rag.nlp import huqie | |||
| from copy import deepcopy | |||
| from huggingface_hub import hf_hub_download | |||
| logging.getLogger("pdfminer").setLevel(logging.WARNING) | |||
| @@ -25,7 +24,7 @@ class HuParser: | |||
| def __init__(self): | |||
| self.ocr = OCR() | |||
| if hasattr(self, "model_speciess"): | |||
| self.layouter = LayoutRecognizer("layout."+self.model_speciess) | |||
| self.layouter = LayoutRecognizer("layout." + self.model_speciess) | |||
| else: | |||
| self.layouter = LayoutRecognizer("layout") | |||
| self.tbl_det = TableStructureRecognizer() | |||
| @@ -141,7 +140,7 @@ class HuParser: | |||
| for j in range(i, -1, -1): | |||
| # restore the order using th | |||
| if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \ | |||
| and arr[j + 1]["top"] < arr[j]["top"]\ | |||
| and arr[j + 1]["top"] < arr[j]["top"] \ | |||
| and arr[j + 1]["page_number"] == arr[j]["page_number"]: | |||
| tmp = arr[j] | |||
| arr[j] = arr[j + 1] | |||
| @@ -278,8 +277,10 @@ class HuParser: | |||
| for b in bxs: | |||
| if not b["text"]: | |||
| left, right, top, bott = b["x0"]*ZM, b["x1"]*ZM, b["top"]*ZM, b["bottom"]*ZM | |||
| b["text"] = self.ocr.recognize(np.array(img), np.array([[left, top], [right, top], [right, bott], [left, bott]], dtype=np.float32)) | |||
| left, right, top, bott = b["x0"] * ZM, b["x1"] * ZM, b["top"] * ZM, b["bottom"] * ZM | |||
| b["text"] = self.ocr.recognize(np.array(img), | |||
| np.array([[left, top], [right, top], [right, bott], [left, bott]], | |||
| dtype=np.float32)) | |||
| del b["txt"] | |||
| bxs = [b for b in bxs if b["text"]] | |||
| if self.mean_height[-1] == 0: | |||
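For boxes that pdfplumber left without text, the quad handed to the OCR recognizer is simply the box rectangle scaled by the zoom factor. A small illustration with made-up coordinates:

import numpy as np

ZM = 3                                                      # zoom used when the pages were rasterised
b = {"x0": 10.0, "x1": 80.0, "top": 20.0, "bottom": 32.0}   # box in PDF units
left, right, top, bott = b["x0"] * ZM, b["x1"] * ZM, b["top"] * ZM, b["bottom"] * ZM
quad = np.array([[left, top], [right, top], [right, bott], [left, bott]], dtype=np.float32)
# the four corners in image pixels, clockwise from top-left: (30, 60), (240, 60), (240, 96), (30, 96)
# which is what self.ocr.recognize() receives alongside the page image.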
| @@ -315,7 +316,8 @@ class HuParser: | |||
| while i < len(bxs) - 1: | |||
| b = bxs[i] | |||
| b_ = bxs[i + 1] | |||
| if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]: | |||
| if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", | |||
| "equation"]: | |||
| i += 1 | |||
| continue | |||
| if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3: | |||
| @@ -376,9 +378,13 @@ class HuParser: | |||
| b["page_number"] == b_["page_number"] and b_["top"] - \ | |||
| b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5, | |||
| b["page_number"] < b_["page_number"] and abs( | |||
| b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4 | |||
| b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4, | |||
| ] | |||
| if any(feats) and not any(concatting_feats): | |||
| # split features | |||
| detach_feats = [b["x1"] < b_["x0"], | |||
| b["x0"] > b_["x1"]] | |||
| if (any(feats) and not any(concatting_feats)) or any(detach_feats): | |||
| print(b["text"], b_["text"], any(feats), any(concatting_feats), any(detach_feats)) | |||
| i += 1 | |||
| continue | |||
| # merge up and down | |||
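The newly added detach_feats guard stops the downward merge from gluing together boxes that do not overlap horizontally at all (for example, lines from two different columns). A tiny sketch with hypothetical boxes:

b  = {"x0": 50,  "x1": 120, "text": "left column line"}    # made-up coordinates
b_ = {"x0": 300, "x1": 420, "text": "right column line"}
detach_feats = [b["x1"] < b_["x0"],    # b ends before b_ starts
                b["x0"] > b_["x1"]]    # b starts after b_ ends
assert any(detach_feats)               # -> the two boxes are left unmerged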
| @@ -503,18 +509,21 @@ class HuParser: | |||
| findit = False | |||
| i = 0 | |||
| while i < len(self.boxes): | |||
| if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())): | |||
| if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", | |||
| re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())): | |||
| i += 1 | |||
| continue | |||
| findit = True | |||
| eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip()) | |||
| self.boxes.pop(i) | |||
| if i >= len(self.boxes): break | |||
| prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2]) | |||
| prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join( | |||
| self.boxes[i]["text"].strip().split(" ")[:2]) | |||
| while not prefix: | |||
| self.boxes.pop(i) | |||
| if i >= len(self.boxes): break | |||
| prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2]) | |||
| prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join( | |||
| self.boxes[i]["text"].strip().split(" ")[:2]) | |||
| self.boxes.pop(i) | |||
| if i >= len(self.boxes) or not prefix: break | |||
| for j in range(i, min(i + 128, len(self.boxes))): | |||
| @@ -522,13 +531,13 @@ class HuParser: | |||
| continue | |||
| for k in range(i, j): self.boxes.pop(i) | |||
| break | |||
| if findit:return | |||
| if findit: return | |||
| page_dirty = [0] * len(self.page_images) | |||
| for b in self.boxes: | |||
| if re.search(r"(··|··|··)", b["text"]): | |||
| page_dirty[b["page_number"]-1] += 1 | |||
| page_dirty = set([i+1 for i, t in enumerate(page_dirty) if t > 3]) | |||
| page_dirty[b["page_number"] - 1] += 1 | |||
| page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3]) | |||
| if not page_dirty: return | |||
| i = 0 | |||
| while i < len(self.boxes): | |||
| @@ -546,7 +555,7 @@ class HuParser: | |||
| self.boxes.pop(i) | |||
| continue | |||
| if not b_["text"].strip(): | |||
| self.boxes.pop(i+1) | |||
| self.boxes.pop(i + 1) | |||
| continue | |||
| if b["text"].strip()[0] != b_["text"].strip()[0] \ | |||
| @@ -574,8 +583,10 @@ class HuParser: | |||
| continue | |||
| lout_no = str(self.boxes[i]["page_number"]) + \ | |||
| "-" + str(self.boxes[i]["layoutno"]) | |||
| if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title", | |||
| "figure caption", "reference"]: | |||
| if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", | |||
| "title", | |||
| "figure caption", | |||
| "reference"]: | |||
| nomerge_lout_no.append(lst_lout_no) | |||
| if self.boxes[i]["layout_type"] == "table": | |||
| if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]): | |||
| @@ -654,7 +665,7 @@ class HuParser: | |||
| tk, tv = nearest(tables) | |||
| fk, fv = nearest(figures) | |||
| #if min(tv, fv) > 2000: | |||
| # if min(tv, fv) > 2000: | |||
| # i += 1 | |||
| # continue | |||
| if tv < fv and tk: | |||
| @@ -699,7 +710,7 @@ class HuParser: | |||
| "layoutno", ""))) | |||
| left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"] | |||
| poss.append((pn+self.page_from, left, right, top, bott)) | |||
| poss.append((pn + self.page_from, left, right, top, bott)) | |||
| return self.page_images[pn] \ | |||
| .crop((left * ZM, top * ZM, | |||
| right * ZM, bott * ZM)) | |||
| @@ -738,7 +749,7 @@ class HuParser: | |||
| for k, bxs in tables.items(): | |||
| if not bxs: | |||
| continue | |||
| bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"]-b["top"])/2 for b in bxs])) | |||
| bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"] - b["top"]) / 2 for b in bxs])) | |||
| poss = [] | |||
| res.append((cropout(bxs, "table", poss), | |||
| self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english))) | |||
| @@ -879,7 +890,8 @@ class HuParser: | |||
| self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm)) | |||
| self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in | |||
| enumerate(self.pdf.pages[page_from:page_to])] | |||
| self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]] | |||
| self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in | |||
| self.pdf.pages[page_from:page_to]] | |||
| self.total_page = len(self.pdf.pages) | |||
| except Exception as e: | |||
| self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf") | |||
| @@ -888,8 +900,8 @@ class HuParser: | |||
| mat = fitz.Matrix(zoomin, zoomin) | |||
| self.total_page = len(self.pdf) | |||
| for i, page in enumerate(self.pdf): | |||
| if i < page_from:continue | |||
| if i >= page_to:break | |||
| if i < page_from: continue | |||
| if i >= page_to: break | |||
| pix = page.get_pixmap(matrix=mat) | |||
| img = Image.frombytes("RGB", [pix.width, pix.height], | |||
| pix.samples) | |||
| @@ -897,7 +909,9 @@ class HuParser: | |||
| self.page_chars.append([]) | |||
| logging.info("Images converted.") | |||
| self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in range(len(self.page_chars))] | |||
| self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join( | |||
| random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in | |||
| range(len(self.page_chars))] | |||
| if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2: | |||
| self.is_english = True | |||
| else: | |||
| @@ -927,11 +941,12 @@ class HuParser: | |||
| # self.page_cum_height.append( | |||
| # np.max([c["bottom"] for c in chars])) | |||
| self.__ocr(i + 1, img, chars, zoomin) | |||
| if callback: callback(prog=(i+1)*0.6/len(self.page_images), msg="") | |||
| if callback: callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="") | |||
| if not self.is_english and not any([c for c in self.page_chars]) and self.boxes: | |||
| bxes = [b for bxs in self.boxes for b in bxs] | |||
| self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))])) | |||
| self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", | |||
| "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))])) | |||
| logging.info("Is it English:", self.is_english) | |||
| @@ -964,12 +979,13 @@ class HuParser: | |||
| if need_position: return None, None | |||
| return | |||
| max_width = np.max([right-left for (_, left, right, _, _) in poss]) | |||
| max_width = np.max([right - left for (_, left, right, _, _) in poss]) | |||
| GAP = 6 | |||
| pos = poss[0] | |||
| poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3]-120), max(pos[3]-GAP, 0))) | |||
| poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0))) | |||
| pos = poss[-1] | |||
| poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1]/ZM, pos[4]+GAP), min(self.page_images[pos[0][-1]].size[1]/ZM, pos[4]+120))) | |||
| poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP), | |||
| min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120))) | |||
| positions = [] | |||
| for ii, (pns, left, right, top, bottom) in enumerate(poss): | |||
| @@ -984,9 +1000,9 @@ class HuParser: | |||
| bottom, self.page_images[pns[0]].size[1]) | |||
| )) | |||
| ) | |||
| if 0 < ii < len(poss)-1: | |||
| positions.append((pns[0]+self.page_from, left, right, top, min( | |||
| bottom, self.page_images[pns[0]].size[1])/ZM)) | |||
| if 0 < ii < len(poss) - 1: | |||
| positions.append((pns[0] + self.page_from, left, right, top, min( | |||
| bottom, self.page_images[pns[0]].size[1]) / ZM)) | |||
| bottom -= self.page_images[pns[0]].size[1] | |||
| for pn in pns[1:]: | |||
| imgs.append( | |||
| @@ -997,7 +1013,7 @@ class HuParser: | |||
| )) | |||
| ) | |||
| if 0 < ii < len(poss) - 1: | |||
| positions.append((pn+self.page_from, left, right, 0, min( | |||
| positions.append((pn + self.page_from, left, right, 0, min( | |||
| bottom, self.page_images[pn].size[1]) / ZM)) | |||
| bottom -= self.page_images[pn].size[1] | |||
| @@ -1026,6 +1042,19 @@ class HuParser: | |||
| return pic, positions | |||
| return pic | |||
| def get_position(self, bx, ZM): | |||
| poss = [] | |||
| pn = bx["page_number"] | |||
| top = bx["top"] - self.page_cum_height[pn - 1] | |||
| bott = bx["bottom"] - self.page_cum_height[pn - 1] | |||
| poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM))) | |||
| while bott * ZM > self.page_images[pn - 1].size[1]: | |||
| bott -= self.page_images[pn - 1].size[1] / ZM | |||
| top = 0 | |||
| pn += 1 | |||
| poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM))) | |||
| return poss | |||
| if __name__ == "__main__": | |||
| pass | |||
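The get_position() method (promoted here from a Pdf subclass, whose copy is removed below) maps a box whose vertical coordinates live in the cumulative, all-pages space back to per-page coordinates, emitting one (page, x0, x1, top, bottom) tuple for every page the box crosses. A simplified, standalone restatement with made-up numbers:

# page_cum_height and box coordinates are in PDF units, page heights in image pixels, ZM is the zoom.
def split_box_by_page(bx, page_cum_height, page_heights_px, ZM):
    poss = []
    pn = bx["page_number"]
    top = bx["top"] - page_cum_height[pn - 1]
    bott = bx["bottom"] - page_cum_height[pn - 1]
    poss.append((pn, bx["x0"], bx["x1"], top, min(bott, page_heights_px[pn - 1] / ZM)))
    while bott * ZM > page_heights_px[pn - 1]:
        bott -= page_heights_px[pn - 1] / ZM
        top = 0
        pn += 1
        poss.append((pn, bx["x0"], bx["x1"], top, min(bott, page_heights_px[pn - 1] / ZM)))
    return poss

# A box starting near the bottom of page 1 and spilling onto page 2 (hypothetical numbers):
print(split_box_by_page({"page_number": 1, "x0": 40, "x1": 500, "top": 780, "bottom": 830},
                        page_cum_height=[0, 792], page_heights_px=[2376, 2376], ZM=3))
# -> [(1, 40, 500, 780, 792.0), (2, 40, 500, 0, 38.0)]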
| @@ -30,19 +30,6 @@ class Pdf(PdfParser): | |||
| # print(b) | |||
| print("OCR:", timer()-start) | |||
| def get_position(bx): | |||
| poss = [] | |||
| pn = bx["page_number"] | |||
| top = bx["top"] - self.page_cum_height[pn - 1] | |||
| bott = bx["bottom"] - self.page_cum_height[pn - 1] | |||
| poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn-1].size[1]/zoomin))) | |||
| while bott * zoomin > self.page_images[pn - 1].size[1]: | |||
| bott -= self.page_images[pn- 1].size[1] / zoomin | |||
| top = 0 | |||
| pn += 1 | |||
| poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / zoomin))) | |||
| return poss | |||
| def tag(pn, left, right, top, bottom): | |||
| return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \ | |||
| .format(pn, left, right, top, bottom) | |||
| @@ -54,7 +41,7 @@ class Pdf(PdfParser): | |||
| callback(0.67, "Table analysis finished.") | |||
| self._text_merge() | |||
| tbls = self._extract_table_figure(True, zoomin, True, True) | |||
| self._naive_vertical_merge() | |||
| self._concat_downward() | |||
| self._filter_forpages() | |||
| callback(0.68, "Text merging finished") | |||
| @@ -74,7 +61,7 @@ class Pdf(PdfParser): | |||
| sec_ids.append(sid) | |||
| #print(lvl, self.boxes[i]["text"], most_level) | |||
| sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)] | |||
| sections = [(b["text"], sec_ids[i], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)] | |||
| for (img, rows), poss in tbls: | |||
| sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0]+1-from_page, p[1], p[2], p[3], p[4]) for p in poss])) | |||
| @@ -14,7 +14,7 @@ import copy | |||
| import re | |||
| from rag.app import laws | |||
| from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions | |||
| from deepdoc.parser import PdfParser | |||
| from deepdoc.parser import PdfParser, ExcelParser | |||
| from rag.settings import cron_logger | |||
| @@ -74,6 +74,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca | |||
| sections, tbls = pdf_parser(filename if not binary else binary, | |||
| from_page=from_page, to_page=to_page, callback=callback) | |||
| res = tokenize_table(tbls, doc, eng) | |||
| elif re.search(r"\.xlsx?$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| excel_parser = ExcelParser() | |||
| sections = [(excel_parser.html(binary), "")] | |||
| elif re.search(r"\.txt$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| txt = "" | |||
| @@ -15,7 +15,7 @@ import re | |||
| from collections import Counter | |||
| from api.db import ParserType | |||
| from rag.nlp import huqie, tokenize, tokenize_table, add_positions | |||
| from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency | |||
| from deepdoc.parser import PdfParser | |||
| import numpy as np | |||
| from rag.utils import num_tokens_from_string | |||
| @@ -46,11 +46,11 @@ class Pdf(PdfParser): | |||
| self._table_transformer_job(zoomin) | |||
| callback(0.68, "Table analysis finished") | |||
| self._text_merge() | |||
| tbls = self._extract_table_figure(True, zoomin, True, True) | |||
| column_width = np.median([b["x1"] - b["x0"] for b in self.boxes]) | |||
| self._concat_downward(concat_between_pages=False) | |||
| self._concat_downward() | |||
| self._filter_forpages() | |||
| callback(0.75, "Text merging finished.") | |||
| tbls = self._extract_table_figure(True, zoomin, True, True) | |||
| # clean mess | |||
| if column_width < self.page_images[0].size[0] / zoomin / 2: | |||
| @@ -59,24 +59,24 @@ class Pdf(PdfParser): | |||
| self.boxes = self.sort_X_by_page(self.boxes, column_width / 2) | |||
| for b in self.boxes: | |||
| b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip()) | |||
| freq = Counter([b["text"] for b in self.boxes]) | |||
| garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6]) | |||
| i = 0 | |||
| while i < len(self.boxes): | |||
| if self.boxes[i]["text"] in garbage \ | |||
| or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \ | |||
| or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]): | |||
| self.boxes.pop(i) | |||
| elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", | |||
| '1'): | |||
| # merge within same layouts | |||
| self.boxes[i + 1]["top"] = self.boxes[i]["top"] | |||
| self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"]) | |||
| self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"]) | |||
| self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"] | |||
| self.boxes.pop(i) | |||
| else: | |||
| i += 1 | |||
| # freq = Counter([b["text"] for b in self.boxes]) | |||
| # garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6]) | |||
| # i = 0 | |||
| # while i < len(self.boxes): | |||
| # if self.boxes[i]["text"] in garbage \ | |||
| # or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \ | |||
| # or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]): | |||
| # self.boxes.pop(i) | |||
| # elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", | |||
| # '1'): | |||
| # # merge within same layouts | |||
| # self.boxes[i + 1]["top"] = self.boxes[i]["top"] | |||
| # self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"]) | |||
| # self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"]) | |||
| # self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"] | |||
| # self.boxes.pop(i) | |||
| # else: | |||
| # i += 1 | |||
| def _begin(txt): | |||
| return re.match( | |||
| @@ -88,7 +88,7 @@ class Pdf(PdfParser): | |||
| "title":"", | |||
| "authors": "", | |||
| "abstract": "", | |||
| "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if | |||
| "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if | |||
| re.match(r"(text|title)", b.get("layoutno", "text"))], | |||
| "tables": tbls | |||
| } | |||
| @@ -119,11 +119,10 @@ class Pdf(PdfParser): | |||
| if re.match("(abstract|摘要)", txt): | |||
| if len(txt.split(" ")) > 32 or len(txt) > 64: | |||
| abstr = txt + self._line_tag(b, zoomin) | |||
| i += 1 | |||
| break | |||
| txt = self.boxes[i + 1]["text"].lower().strip() | |||
| txt = self.boxes[i]["text"].lower().strip() | |||
| if len(txt.split(" ")) > 32 or len(txt) > 64: | |||
| abstr = txt + self._line_tag(self.boxes[i + 1], zoomin) | |||
| abstr = txt + self._line_tag(self.boxes[i], zoomin) | |||
| i += 1 | |||
| break | |||
| if not abstr: i = 0 | |||
| @@ -136,7 +135,7 @@ class Pdf(PdfParser): | |||
| "title": title if title else filename, | |||
| "authors": " ".join(authors), | |||
| "abstract": abstr, | |||
| "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if | |||
| "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if | |||
| re.match(r"(text|title)", b.get("layoutno", "text"))], | |||
| "tables": tbls | |||
| } | |||
| @@ -153,7 +152,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca | |||
| paper = pdf_parser(filename if not binary else binary, | |||
| from_page=from_page, to_page=to_page, callback=callback) | |||
| else: raise NotImplementedError("file type not supported yet(pdf supported)") | |||
| doc = {"docnm_kwd": filename, "authors_tks": paper["authors"], | |||
| doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]), | |||
| "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)} | |||
| doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) | |||
| doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"]) | |||
| @@ -173,6 +173,38 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca | |||
| tokenize(d, txt, eng) | |||
| res.append(d) | |||
| sorted_sections = paper["sections"] | |||
| # set pivot using the most frequent type of title, | |||
| # then merge between 2 pivot | |||
| bull = bullets_category([txt for txt, _ in sorted_sections]) | |||
| most_level, levels = title_frequency(bull, sorted_sections) | |||
| assert len(sorted_sections) == len(levels) | |||
| sec_ids = [] | |||
| sid = 0 | |||
| for i, lvl in enumerate(levels): | |||
| if lvl <= most_level and i > 0 and lvl != levels[i-1]: sid += 1 | |||
| sec_ids.append(sid) | |||
| print(lvl, sorted_sections[i][0], most_level, sid) | |||
| chunks = [] | |||
| last_sid = -2 | |||
| for (txt, _), sec_id in zip(sorted_sections, sec_ids): | |||
| if sec_id == last_sid: | |||
| if chunks: | |||
| chunks[-1] += "\n" + txt | |||
| continue | |||
| chunks.append(txt) | |||
| last_sid = sec_id | |||
| for txt in chunks: | |||
| d = copy.deepcopy(doc) | |||
| d["image"], poss = pdf_parser.crop(txt, need_position=True) | |||
| add_positions(d, poss) | |||
| tokenize(d, pdf_parser.remove_tag(txt), eng) | |||
| res.append(d) | |||
| print("----------------------\n", pdf_parser.remove_tag(txt)) | |||
| return res | |||
| readed = [0] * len(paper["lines"]) | |||
| # find colon firstly | |||
| i = 0 | |||
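The new pivot-based grouping above assigns a running section id that only increments when a title at or above the most frequent title level is met and the level differs from the previous line; consecutive sections sharing an id are then concatenated into one chunk. A hypothetical trace (the levels are invented):

levels, most_level = [0, 1, 2, 2, 1, 2], 1   # hypothetical output of title_frequency()
sec_ids, sid = [], 0
for i, lvl in enumerate(levels):
    if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
        sid += 1
    sec_ids.append(sid)
print(sec_ids)   # [0, 1, 1, 1, 2, 2] -> sections 0 | 1-3 | 4-5 become three chunks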
| @@ -252,6 +284,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca | |||
| if __name__ == "__main__": | |||
| import sys | |||
| def dummy(a, b): | |||
| def dummy(prog=None, msg=""): | |||
| pass | |||
| chunk(sys.argv[1], callback=dummy) | |||
| @@ -16,7 +16,7 @@ from io import BytesIO | |||
| from nltk import word_tokenize | |||
| from openpyxl import load_workbook | |||
| from rag.nlp import is_english, random_choices | |||
| from rag.nlp import huqie, stemmer | |||
| from rag.nlp import huqie | |||
| from deepdoc.parser import ExcelParser | |||
| @@ -73,12 +73,8 @@ def beAdoc(d, q, a, eng): | |||
| aprefix = "Answer: " if eng else "回答:" | |||
| d["content_with_weight"] = "\t".join( | |||
| [qprefix + rmPrefix(q), aprefix + rmPrefix(a)]) | |||
| if eng: | |||
| d["content_ltks"] = " ".join([stemmer.stem(w) | |||
| for w in word_tokenize(q)]) | |||
| else: | |||
| d["content_ltks"] = huqie.qie(q) | |||
| d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) | |||
| d["content_ltks"] = huqie.qie(q) | |||
| d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) | |||
| return d | |||
| @@ -74,9 +74,9 @@ def trans_datatime(s): | |||
| def trans_bool(s): | |||
| if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE): | |||
| return ["yes", "是"] | |||
| return "yes" | |||
| if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE): | |||
| return ["no", "否"] | |||
| return "no" | |||
| def column_data_type(arr): | |||
| @@ -92,7 +92,7 @@ def column_data_type(arr): | |||
| counts["int"] += 1 | |||
| elif re.match(r"[+-]?[0-9.]+$", str(a).replace("%%", "")): | |||
| counts["float"] += 1 | |||
| elif re.match(r"(true|false|yes|no|是|否)$", str(a), flags=re.IGNORECASE): | |||
| elif re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√|false|no|否|⍻|×)$", str(a), flags=re.IGNORECASE): | |||
| counts["bool"] += 1 | |||
| elif trans_datatime(str(a)): | |||
| counts["datetime"] += 1 | |||
| @@ -3,14 +3,9 @@ from collections import Counter | |||
| from rag.utils import num_tokens_from_string | |||
| from . import huqie | |||
| from nltk import word_tokenize | |||
| import re | |||
| import copy | |||
| from nltk.stem import PorterStemmer | |||
| stemmer = PorterStemmer() | |||
| BULLET_PATTERN = [[ | |||
| r"第[零一二三四五六七八九十百0-9]+(分?编|部分)", | |||
| @@ -77,13 +72,8 @@ def is_english(texts): | |||
| def tokenize(d, t, eng): | |||
| d["content_with_weight"] = t | |||
| t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t) | |||
| if eng: | |||
| t = re.sub(r"([a-z])-([a-z])", r"\1\2", t) | |||
| d["content_ltks"] = " ".join([stemmer.stem(w) | |||
| for w in word_tokenize(t)]) | |||
| else: | |||
| d["content_ltks"] = huqie.qie(t) | |||
| d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) | |||
| d["content_ltks"] = huqie.qie(t) | |||
| d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) | |||
| def tokenize_table(tbls, doc, eng, batch_size=10): | |||
| @@ -94,8 +84,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10): | |||
| continue | |||
| if isinstance(rows, str): | |||
| d = copy.deepcopy(doc) | |||
| r = re.sub(r"<[^<>]{,12}>", "", rows) | |||
| tokenize(d, r, eng) | |||
| tokenize(d, rows, eng) | |||
| d["content_with_weight"] = rows | |||
| d["image"] = img | |||
| add_positions(d, poss) | |||
| @@ -8,7 +8,8 @@ import re | |||
| import string | |||
| import sys | |||
| from hanziconv import HanziConv | |||
| from nltk import word_tokenize | |||
| from nltk.stem import PorterStemmer, WordNetLemmatizer | |||
| from api.utils.file_utils import get_project_base_directory | |||
| @@ -45,6 +46,9 @@ class Huqie: | |||
| self.trie_ = datrie.Trie(string.printable) | |||
| self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie") | |||
| self.stemmer = PorterStemmer() | |||
| self.lemmatizer = WordNetLemmatizer() | |||
| self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)" | |||
| try: | |||
| self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie") | |||
| @@ -239,6 +243,10 @@ class Huqie: | |||
| def qie(self, line): | |||
| line = self._strQ2B(line).lower() | |||
| line = self._tradi2simp(line) | |||
| zh_num = len([1 for c in line if is_chinese(c)]) | |||
| if zh_num < len(line) * 0.2: | |||
| return " ".join([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in word_tokenize(line)]) | |||
| arr = re.split(self.SPLIT_CHAR, line) | |||
| res = [] | |||
| for L in arr: | |||
| @@ -290,8 +298,12 @@ class Huqie: | |||
| return self.merge_(res) | |||
| def qieqie(self, tks): | |||
| tks = tks.split(" ") | |||
| zh_num = len([1 for c in tks if c and is_chinese(c[0])]) | |||
| if zh_num < len(tks) * 0.2:return " ".join(tks) | |||
| res = [] | |||
| for tk in tks.split(" "): | |||
| for tk in tks: | |||
| if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk): | |||
| res.append(tk) | |||
| continue | |||
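qie() and qieqie() now short-circuit for mostly-non-Chinese input (less than 20% Chinese characters): instead of trie segmentation the text is word-tokenized, lemmatized and Porter-stemmed. A hedged usage sketch, assuming the tokenizer is importable as rag.nlp.huqie (as the rest of this patch uses it) and that the NLTK punkt/wordnet data is installed:

from rag.nlp import huqie

# Mostly English: goes through word_tokenize + WordNetLemmatizer + PorterStemmer.
print(huqie.qie("Retrieval augmented generation improves grounding"))

# Mostly Chinese: still uses the trie-based segmentation and the fine-grained pass.
tks = huqie.qie("检索增强生成")
print(huqie.qieqie(tks))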
| @@ -4,8 +4,8 @@ import json | |||
| import re | |||
| import logging | |||
| import copy | |||
| import math | |||
| from elasticsearch_dsl import Q, Search | |||
| from elasticsearch_dsl import Q | |||
| from rag.nlp import huqie, term_weight, synonym | |||
| @@ -33,12 +33,14 @@ class EsQueryer: | |||
| @staticmethod | |||
| def rmWWW(txt): | |||
| txt = re.sub( | |||
| r"是*(什么样的|哪家|那家|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀)是*", | |||
| "", | |||
| txt) | |||
| return re.sub( | |||
| r"(what|who|how|which|where|why|(is|are|were|was) there) (is|are|were|was|to)*", "", txt, re.IGNORECASE) | |||
| patts = [ | |||
| (r"是*(什么样的|哪家|那家|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀)是*", ""), | |||
| (r"(^| )(what|who|how|which|where|why)('re|'s)? ", " "), | |||
| (r"(^| )('s|'re|is|are|were|was|do|does|did|don't|doesn't|didn't|has|have|be|there|you|me|your|my|mine|just|please|may|i|should|would|wouldn't|will|won't|done|go|for|with|so|the|a|an|by|i'm|it's|he's|she's|they|they're|you're|as|by|on|in|at|up|out|down)", " ") | |||
| ] | |||
| for r, p in patts: | |||
| txt = re.sub(r, p, txt, flags=re.IGNORECASE) | |||
| return txt | |||
| def question(self, txt, tbl="qa", min_match="60%"): | |||
| txt = re.sub( | |||
| @@ -50,7 +52,7 @@ class EsQueryer: | |||
| txt = EsQueryer.rmWWW(txt) | |||
| if not self.isChinese(txt): | |||
| tks = [t for t in txt.split(" ") if t.strip()] | |||
| tks = huqie.qie(txt).split(" ") | |||
| q = tks | |||
| for i in range(1, len(tks)): | |||
| q.append("\"%s %s\"^2" % (tks[i - 1], tks[i])) | |||
| @@ -58,9 +60,9 @@ class EsQueryer: | |||
| q.append(txt) | |||
| return Q("bool", | |||
| must=Q("query_string", fields=self.flds, | |||
| type="best_fields", query=" OR ".join(q), | |||
| type="best_fields", query=" ".join(q), | |||
| boost=1, minimum_should_match=min_match) | |||
| ), txt.split(" ") | |||
| ), tks | |||
| def needQieqie(tk): | |||
| if len(tk) < 4: | |||
| @@ -160,8 +162,8 @@ class EsQueryer: | |||
| s += v# * dtwt[k] | |||
| q = 1e-9 | |||
| for k, v in qtwt.items(): | |||
| q += v * v | |||
| d = 1e-9 | |||
| for k, v in dtwt.items(): | |||
| d += v * v | |||
| return s / q#math.sqrt(q) / math.sqrt(d) | |||
| q += v #* v | |||
| #d = 1e-9 | |||
| #for k, v in dtwt.items(): | |||
| # d += v * v | |||
| return s / q #math.sqrt(q) / math.sqrt(d) | |||
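The normalisation changes from a squared-weight denominator to a plain sum, so the token score becomes the fraction of total query-term weight that also occurs in the document. A standalone restatement (the helper name is hypothetical, and it assumes the unshown surrounding loop only accumulates weights of query terms present in the document; the weights below are made up):

def token_similarity(qtwt, dtwt):
    s = 1e-9
    for k, v in qtwt.items():
        if k in dtwt:            # assumption: numerator counts only overlapping terms
            s += v               # previously: v * dtwt[k]
    q = 1e-9
    for _, v in qtwt.items():
        q += v                   # previously: v * v (a squared norm)
    return s / q

print(token_similarity({"retrieval": 0.6, "pipeline": 0.4},
                       {"retrieval": 0.9, "index": 0.1}))   # ~= 0.6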
| @@ -196,7 +196,24 @@ class Dealer: | |||
| def insert_citations(self, answer, chunks, chunk_v, | |||
| embd_mdl, tkweight=0.7, vtweight=0.3): | |||
| assert len(chunks) == len(chunk_v) | |||
| pieces = re.split(r"([;。?!!\n]|[a-z][.?;!][ \n])", answer) | |||
| pieces = re.split(r"(```)", answer) | |||
| if len(pieces) >= 3: | |||
| i = 0 | |||
| pieces_ = [] | |||
| while i < len(pieces): | |||
| if pieces[i] == "```": | |||
| st = i | |||
| i += 1 | |||
| while i<len(pieces) and pieces[i] != "```": | |||
| i += 1 | |||
| if i < len(pieces): i += 1 | |||
| pieces_.append("".join(pieces[st: i])+"\n") | |||
| else: | |||
| pieces_.extend(re.split(r"([^\|][;。?!!\n]|[a-z][.?;!][ \n])", pieces[i])) | |||
| i += 1 | |||
| pieces = pieces_ | |||
| else: | |||
| pieces = re.split(r"([^\|][;。?!!\n]|[a-z][.?;!][ \n])", answer) | |||
| for i in range(1, len(pieces)): | |||
| if re.match(r"[a-z][.?;!][ \n]", pieces[i]): | |||
| pieces[i - 1] += pieces[i][0] | |||
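insert_citations() now splits the answer on ``` first, so fenced code blocks survive as single pieces and only the surrounding prose is split on sentence enders. A standalone illustration of that splitting step (not the Dealer class itself):

import re

answer = "Install it first.\n```\npip install ragflow\n```\nThen start the server."
pieces = re.split(r"(```)", answer)
pieces_ = []
i = 0
while i < len(pieces):
    if pieces[i] == "```":                     # consume everything up to the closing fence
        st = i
        i += 1
        while i < len(pieces) and pieces[i] != "```":
            i += 1
        if i < len(pieces):
            i += 1
        pieces_.append("".join(pieces[st:i]) + "\n")
    else:                                      # plain prose is still split on sentence enders
        pieces_.extend(re.split(r"([^\|][;。?!!\n]|[a-z][.?;!][ \n])", pieces[i]))
        i += 1
print(pieces_)   # the fenced "```\npip install ragflow\n```" arrives as one piece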
| @@ -226,7 +243,7 @@ class Dealer: | |||
| chunks_tks, | |||
| tkweight, vtweight) | |||
| mx = np.max(sim) * 0.99 | |||
| if mx < 0.66: | |||
| if mx < 0.7: | |||
| continue | |||
| cites[idx[i]] = list( | |||
| set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4] | |||
| @@ -249,6 +266,7 @@ class Dealer: | |||
| def rerank(self, sres, query, tkweight=0.3, | |||
| vtweight=0.7, cfield="content_ltks"): | |||
| _, keywords = self.qryr.question(query) | |||
| ins_embd = [ | |||
| Dealer.trans2floats( | |||
| sres.field[i].get("q_%d_vec" % len(sres.query_vector), "\t".join(["0"] * len(sres.query_vector)))) for i in sres.ids] | |||
| @@ -258,8 +276,7 @@ class Dealer: | |||
| for i in sres.ids] | |||
| sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector, | |||
| ins_embd, | |||
| huqie.qie( | |||
| query).split(" "), | |||
| keywords, | |||
| ins_tw, tkweight, vtweight) | |||
| return sim, tksim, vtsim | |||
| @@ -82,12 +82,14 @@ def dispatch(): | |||
| tsks = [] | |||
| if r["type"] == FileType.PDF.value: | |||
| pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"])) | |||
| page_size = 5 | |||
| if r["parser_id"] == "paper": page_size = 12 | |||
| for s,e in r["parser_config"].get("pages", [(0,100000)]): | |||
| e = min(e, pages) | |||
| for p in range(s, e, 5): | |||
| for p in range(s, e, page_size): | |||
| task = new_task() | |||
| task["from_page"] = p | |||
| task["to_page"] = min(p + 5, e) | |||
| task["to_page"] = min(p + page_size, e) | |||
| tsks.append(task) | |||
| elif r["parser_id"] == "table": | |||
| rn = HuExcelParser.row_number(r["name"], MINIO.get(r["kb_id"], r["location"])) | |||
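With the new per-parser page size, a "paper" document is dispatched in 12-page OCR tasks while every other PDF keeps the 5-page granularity. A quick hypothetical walk-through of the range arithmetic for a 30-page paper (variable names are illustrative):

pages, page_size = 30, 12                 # page_size stays 5 for non-"paper" parsers
tasks = [(p, min(p + page_size, pages)) for p in range(0, pages, page_size)]
print(tasks)                              # [(0, 12), (12, 24), (24, 30)]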