from flask import request
from flask_login import login_required, current_user
from elasticsearch_dsl import Q

from rag.app.qa import rmPrefix, beAdoc
from rag.nlp import search, huqie, retrievaler
from rag.utils import ELASTICSEARCH, rmSpace
from api.db import LLMType, ParserType
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import TenantLLMService
from api.db.services.user_service import UserTenantService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
| res["chunk_id"] = id | res["chunk_id"] = id | ||||
| k = [] | k = [] | ||||
| for n in res.keys(): | for n in res.keys(): | ||||
| if re.search(r"(_vec$|_sm_)", n): | |||||
| if re.search(r"(_vec$|_sm_|_tks|_ltks)", n): | |||||
| k.append(n) | k.append(n) | ||||
| if re.search(r"(_tks|_ltks)", n): | |||||
| res[n] = rmSpace(res[n]) | |||||
| for n in k: | for n in k: | ||||
| del res[n] | del res[n] | ||||
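        # Illustrative sketch (not part of the original handler): assuming `res` is the
        # `_source` dict of an Elasticsearch chunk hit, the loop above strips internal
        # fields (vectors, tokenized variants) before the chunk is returned to the client.
        # The field names below are hypothetical examples:
        #
        #   res = {"docnm_kwd": "a.pdf", "content_with_weight": "...",
        #          "content_ltks": "a pdf ...", "q_768_vec": [0.1, ...]}
        #   -> after the loop only {"docnm_kwd", "content_with_weight", "chunk_id"} remain.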
@manager.route('/set', methods=['POST'])
@login_required
@validate_request("doc_id", "chunk_id", "content_with_weight",
                  "important_kwd")
def set():
    req = request.json
    d = {"id": req["chunk_id"]}
    d["content_ltks"] = huqie.qie(req["content_with_weight"])
    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
    d["important_kwd"] = req["important_kwd"]
    d["important_tks"] = huqie.qie(" ".join(req["important_kwd"]))

    e, doc = DocumentService.get_by_id(req["doc_id"])
    if not e:
        return get_data_error_result(retmsg="Document not found!")

    if doc.parser_id == ParserType.QA:
        arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t) > 1]
        if len(arr) != 2:
            return get_data_error_result(retmsg="Q&A must be separated by TAB/ENTER key.")
        q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
        d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q + a]))

    v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
    v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
    d["q_%d_vec" % len(v)] = v.tolist()
    ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
    return get_json_result(data=True)
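# Hedged usage sketch (not from the original source): a client updating a chunk of a
# Q&A-parsed document is expected to keep question and answer in one string separated
# by a TAB or newline, matching the split on r"[\n\t]" above. The payload below is a
# hypothetical example.
#
#   POST /chunk/set
#   {"doc_id": "...", "chunk_id": "...", "important_kwd": [],
#    "content_with_weight": "Question: What is RAGFlow?\tAnswer: A RAG engine."}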
from flask_login import login_required, current_user
from api.db.services.dialog_service import DialogService
from api.db import StatusEnum
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.user_service import TenantService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid
from rag.nlp import search
from rag.utils import ELASTICSEARCH
from api.db.services import duplicate_name
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid
from api.db import FileType, TaskStatus
from api.db.services.document_service import DocumentService
from api.settings import RetCode
from api.utils.api_utils import get_json_result


@manager.route('/run', methods=['POST'])
@login_required
@validate_request("doc_ids", "run")
def run():
    req = request.json
    try:
        for id in req["doc_ids"]:
            DocumentService.update_by_id(id, {"run": str(req["run"]), "progress": 0})
            if str(req["run"]) == TaskStatus.CANCEL.value:
                tenant_id = DocumentService.get_tenant_id(id)
                if not tenant_id:
                    return get_data_error_result(retmsg="Tenant not found!")

        if doc.parser_id.lower() == req["parser_id"].lower():
            return get_json_result(data=True)

        e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress": 0, "progress_msg": ""})
        if not e:
            return get_data_error_result(retmsg="Document not found!")
        if doc.token_num > 0:
            e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, doc.process_duation * -1)
            if not e:
                return get_data_error_result(retmsg="Document not found!")

        return get_json_result(data=True)
    except Exception as e:
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid, get_format_time
from api.db import StatusEnum, UserTenantRole
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.db_models import Knowledgebase
from api.settings import stat_logger, RetCode
from api.utils.api_utils import get_json_result
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid, get_format_time
from api.db import StatusEnum, UserTenantRole
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.db_models import Knowledgebase, TenantLLM
from api.settings import stat_logger, RetCode
from api.utils.api_utils import get_json_result
CUSTOM = 'Custom'


class TaskStatus(StrEnum):
    RUNNING = "1"
    CANCEL = "2"
    DONE = "3"
    FAIL = "4"


class ParserType(StrEnum):
    GENERAL = "general"
    PRESENTATION = "presentation"
    LAWS = "laws"
    MANUAL = "manual"
    PAPER = "paper"
    RESUME = "resume"
    BOOK = "book"
    QA = "qa"
    DB.create_tables([model])

    for i, data in enumerate(data_source):
        current_time = current_timestamp() + i
        current_date = timestamp_to_date(current_time)
        if 'create_time' not in data:
            data['create_time'] = current_time
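    # Why `current_timestamp() + i`: every row in a single bulk insert gets a distinct
    # create_time, so records written together can still be ordered and sharded
    # deterministically (the `create_time %% comm == mod` dispatch filter used later
    # relies on differing values). Hypothetical illustration, not original code:
    #
    #   bulk_insert_into_db(Task, [{"id": get_uuid(), "doc_id": "d1"},
    #                              {"id": get_uuid(), "doc_id": "d1"}], True)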
#
from peewee import Expression

from api.db import TenantPermission, FileType, TaskStatus
from api.db.db_models import DB, Knowledgebase, Tenant
from api.db.db_models import Document
from api.db.services.common_service import CommonService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db import StatusEnum

                ~(cls.model.type == FileType.VIRTUAL.value),
                cls.model.progress == 0,
                cls.model.update_time >= tm,
                cls.model.run == TaskStatus.RUNNING.value,
                (Expression(cls.model.create_time, "%%", comm) == mod))\
            .order_by(cls.model.update_time.asc())\
            .paginate(1, items_per_page)
# See the License for the specific language governing permissions and
# limitations under the License.
#
from api.db import StatusEnum, TenantPermission
from api.db.db_models import Knowledgebase, DB, Tenant
from api.db.services.common_service import CommonService


class KnowledgebaseService(CommonService):
    model = Knowledgebase

    @classmethod
    @DB.connection_context()
    def get_by_tenant_ids(cls, joined_tenant_ids, user_id,
                          page_number, items_per_page, orderby, desc):
        kbs = cls.model.select().where(
            ((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
                TenantPermission.TEAM.value)) | (cls.model.tenant_id == user_id))
            & (cls.model.status == StatusEnum.VALID.value)
        )
        if desc:
            kbs = kbs.order_by(cls.model.getter_by(orderby).desc())
        else:
            kbs = kbs.order_by(cls.model.getter_by(orderby).asc())

        kbs = kbs.paginate(page_number, items_per_page)
        return list(kbs.dicts())

    @classmethod
    @DB.connection_context()
    def get_detail(cls, kb_id):
        fields = [
            cls.model.id,
            Tenant.embd_id,
            cls.model.avatar,
            cls.model.name,
            cls.model.description,
            cls.model.permission,
            cls.model.doc_num,
            cls.model.token_num,
            cls.model.chunk_num,
            cls.model.parser_id]
        kbs = cls.model.select(*fields).join(Tenant, on=(
            (Tenant.id == cls.model.tenant_id) & (Tenant.status == StatusEnum.VALID.value))).where(
            (cls.model.id == kb_id),
            (cls.model.status == StatusEnum.VALID.value)
        )
        if not kbs:
            return
        d = kbs[0].to_dict()
        d["embd_id"] = kbs[0].tenant.embd_id
        return d
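    # Hedged usage sketch (assumed call site, not from this diff): get_detail() joins the
    # owning Tenant so the knowledge-base detail also carries the tenant's embedding model id.
    #
    #   kb = KnowledgebaseService.get_detail(kb_id)
    #   if kb:
    #       embd_model_id = kb["embd_id"]  # e.g. used to pick the embedding model per KB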
#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
from peewee import Expression
from api.db.db_models import DB
from api.db import StatusEnum, FileType, TaskStatus
from api.db.db_models import Task, Document, Knowledgebase, Tenant
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService


class TaskService(CommonService):
    model = Task

    @classmethod
    @DB.connection_context()
    def get_tasks(cls, tm, mod=0, comm=1, items_per_page=64):
        fields = [cls.model.id, cls.model.doc_id, cls.model.from_page, cls.model.to_page, Document.kb_id,
                  Document.parser_id, Document.name, Document.type, Document.location, Document.size,
                  Knowledgebase.tenant_id, Tenant.embd_id, Tenant.img2txt_id, Tenant.asr_id, cls.model.update_time]
        docs = cls.model.select(*fields) \
            .join(Document, on=(cls.model.doc_id == Document.id)) \
            .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
            .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
            .where(
                Document.status == StatusEnum.VALID.value,
                ~(Document.type == FileType.VIRTUAL.value),
                cls.model.progress == 0,
                cls.model.update_time >= tm,
                (Expression(cls.model.create_time, "%%", comm) == mod))\
            .order_by(cls.model.update_time.asc())\
            .paginate(1, items_per_page)
        return list(docs.dicts())

    @classmethod
    @DB.connection_context()
    def do_cancel(cls, id):
        try:
            task = cls.model.get_by_id(id)
            _, doc = DocumentService.get_by_id(task.doc_id)
            return doc.run == TaskStatus.CANCEL.value
        except Exception as e:
            pass
        return True
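    # Minimal sketch of the cancellation contract implied above (assumption: callers poll
    # this between processing steps): a task counts as cancelled when its row has been
    # deleted or its parent document's `run` flag was set to TaskStatus.CANCEL.
    #
    #   if TaskService.do_cancel(task_id):
    #       return  # stop chunking/embedding for this task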
| if re.match(r".*\.pdf$", filename): | if re.match(r".*\.pdf$", filename): | ||||
| return FileType.PDF.value | return FileType.PDF.value | ||||
| if re.match(r".*\.(docx|doc|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key|md)$", filename): | |||||
| if re.match(r".*\.(docx|doc|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename): | |||||
| return FileType.DOC.value | return FileType.DOC.value | ||||
| if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename): | if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename): |
from rag.nlp import stemmer, huqie


BULLET_PATTERN = [[
    r"第[零一二三四五六七八九十百]+(编|部分)",
    r"第[零一二三四五六七八九十百]+章",
    r"第[零一二三四五六七八九十百]+节",
    r"第[零一二三四五六七八九十百]+条",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
    r"第[零一二三四五六七八九十百]+章",
    r"第[零一二三四五六七八九十百]+节",
    r"[零一二三四五六七八九十百]+[ 、]",
    r"[\((][零一二三四五六七八九十百]+[\))]",
    r"[\((][0-9]{,2}[\))]",


def is_english(texts):
    eng = 0
    for t in texts:
        if re.match(r"[a-zA-Z]{2,}", t.strip()):
            eng += 1
    if eng / len(texts) > 0.8:
        return True

    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
def remove_contents_table(sections, eng=False):
    i = 0
    while i < len(sections):
        def get(i):
            nonlocal sections
            return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()
        if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                        re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0]), re.IGNORECASE):
            i += 1
            continue
        sections.pop(i)
        if i >= len(sections):
            break
        prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
        while not prefix:
            sections.pop(i)
            if i >= len(sections):
                break
            prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
        sections.pop(i)
        if i >= len(sections) or not prefix:
            break
        for j in range(i, min(i + 128, len(sections))):
            if not re.match(prefix, get(j)):
                continue
            for _ in range(i, j):
                sections.pop(i)
            break
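# Hedged example of the intended effect (hypothetical input, not original code): the
# "Contents"/"目录" heading plus the table-of-contents entries after it are dropped in
# place, up to the line whose prefix repeats the first entry (i.e. where the body starts).
#
#   secs = ["Contents", "1.1 Intro ... 3", "1.2 Scope ... 5", "1.1 Intro", "Body text"]
#   remove_contents_table(secs, eng=True)
#   # secs is now roughly ["1.1 Intro", "Body text"]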
import copy
import random
import re
from io import BytesIO
from docx import Document
import numpy as np
from rag.app import bullets_category, BULLET_PATTERN, is_english, tokenize, remove_contents_table
from rag.nlp import huqie
from rag.parser.docx_parser import HuDocxParser
from rag.parser.pdf_parser import HuParser


class Pdf(HuParser):
    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page)
        callback(0.1, "OCR finished")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_paddle(zoomin)
        callback(0.47, "Layout analysis finished")
        print("paddle layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.68, "Table analysis finished")
        self._text_merge()
        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
        self._concat_downward(concat_between_pages=False)
        self._filter_forpages()
        self._merge_with_same_bullet()
        callback(0.75, "Text merging finished.")
        tbls = self._extract_table_figure(True, zoomin, False)

        callback(0.8, "Text extraction finished")

        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes], tbls


def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    pdf_parser = None
    sections, tbls = [], []
    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        doc_parser = HuDocxParser()
        # TODO: table of contents need to be removed
        sections, tbls = doc_parser(binary if binary else filename)
        remove_contents_table(sections, eng=is_english(random.choices([t for t, _ in sections], k=200)))
        callback(0.8, "Finish parsing.")
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        sections, tbls = pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback)
    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
        if binary:
            txt = binary.decode("utf-8")
        else:
            with open(filename, "r") as f:
                while True:
                    l = f.readline()
                    if not l:
                        break
                    txt += l
        sections = txt.split("\n")
        sections = [(l, "") for l in sections if l]
        remove_contents_table(sections, eng=is_english(random.choices([t for t, _ in sections], k=200)))
        callback(0.8, "Finish parsing.")
    else:
        raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

    bull = bullets_category([t for t, _ in random.choices(sections, k=100)])
    projs = [len(BULLET_PATTERN[bull]) + 1] * len(sections)
    levels = [[] for _ in range(len(BULLET_PATTERN[bull]) + 2)]
    for i, (txt, layout) in enumerate(sections):
        for j, p in enumerate(BULLET_PATTERN[bull]):
            if re.match(p, txt.strip()):
                projs[i] = j
                levels[j].append(i)
                break
        else:
            if re.search(r"(title|head)", layout):
                projs[i] = len(BULLET_PATTERN[bull])
                levels[len(BULLET_PATTERN[bull])].append(i)
            else:
                levels[len(BULLET_PATTERN[bull]) + 1].append(i)
    sections = [t for t, _ in sections]

    def binary_search(arr, target):
        if target > arr[-1]:
            return len(arr) - 1
        if target < arr[0]:
            return -1
        s, e = 0, len(arr)
        while e - s > 1:
            i = (e + s) // 2
            if target > arr[i]:
                s = i
                continue
            elif target < arr[i]:
                e = i
                continue
            else:
                assert False
        return s
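    # Hedged note on binary_search above (illustrative, not original comments): `arr` is a
    # sorted list of section indices for one heading level; the helper returns the position
    # of the last entry smaller than `target`, or -1 if every entry is larger, so a section
    # can look up its nearest enclosing heading at each coarser level.
    #
    #   binary_search([2, 7, 11], 9)  -> 1   (levels[ii][1] == 7 is the governing heading)
    #   binary_search([2, 7, 11], 1)  -> -1  (no heading precedes this section at that level)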
    cks = []
    readed = [False] * len(sections)
    levels = levels[::-1]
    for i, arr in enumerate(levels):
        for j in arr:
            if readed[j]:
                continue
            readed[j] = True
            cks.append([j])
            if i + 1 == len(levels) - 1:
                continue
            for ii in range(i + 1, len(levels)):
                jj = binary_search(levels[ii], j)
                if jj < 0:
                    break
                if jj > cks[-1][-1]:
                    cks[-1].pop(-1)
                cks[-1].append(levels[ii][jj])

    # is it English
    eng = is_english(random.choices(sections, k=218))

    res = []
    # add tables
    for img, rows in tbls:
        bs = 10
        de = ";" if eng else ";"
        for i in range(0, len(rows), bs):
            d = copy.deepcopy(doc)
            r = de.join(rows[i:i + bs])
            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
            tokenize(d, r, eng)
            d["image"] = img
            res.append(d)

    # wrap up to es documents
    for ck in cks:
        print("\n-".join([sections[j] for j in ck[::-1]]))
        ck = "\n".join([sections[j] for j in ck[::-1]])
        d = copy.deepcopy(doc)
        if pdf_parser:
            d["image"] = pdf_parser.crop(ck)
            ck = pdf_parser.remove_tag(ck)
        tokenize(d, ck, eng)
        res.append(d)
    return res


if __name__ == "__main__":
    import sys

    def dummy(prog, msg):
        pass
    chunk(sys.argv[1], callback=dummy)
from io import BytesIO
from docx import Document
import numpy as np
from rag.app import bullets_category, BULLET_PATTERN, is_english, tokenize
from rag.nlp import huqie
from rag.parser.docx_parser import HuDocxParser
from rag.parser.pdf_parser import HuParser

            zoomin,
            from_page,
            to_page)
        callback(0.1, "OCR finished")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_paddle(zoomin)
        callback(0.77, "Layout analysis finished")
        print("paddle layouts:", timer() - start)
        bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)

        # is it English

            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)

        callback(0.8, "Text extraction finished")

        return [b["text"] + self._line_tag(b, zoomin) for b in bxs]

    pdf_parser = None
    sections = []
    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        for txt in Docx()(filename, binary):
            sections.append(txt)
        callback(0.8, "Finish parsing.")
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        for txt in pdf_parser(filename if not binary else binary,
                              from_page=from_page, to_page=to_page, callback=callback):
            sections.append(txt)
    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
        if binary:
            txt = binary.decode("utf-8")
        else:

                    txt += l
        sections = txt.split("\n")
        sections = [l for l in sections if l]
        callback(0.8, "Finish parsing.")
    else:
        raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

    # is it English
    # Remove 'Contents' part
    i = 0
    while i < len(sections):
        if not re.match(r"(contents|目录|目次|table of contents)$",
                        re.sub(r"( | |\u3000)+", "", sections[i].split("@@")[0]), re.IGNORECASE):
            i += 1
            continue
        sections.pop(i)

        for j in range(i, min(i + 128, len(sections))):
            if not re.match(prefix, sections[j]):
                continue
            for _ in range(i, j):
                sections.pop(i)
            break

    bull = bullets_category(sections)
import copy
import re
from rag.app import tokenize
from rag.nlp import huqie
from rag.parser.pdf_parser import HuParser
from rag.utils import num_tokens_from_string

            zoomin,
            from_page,
            to_page)
        callback(0.2, "OCR finished.")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_paddle(zoomin)
        callback(0.5, "Layout analysis finished.")
        print("paddle layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.7, "Table analysis finished.")
        self._text_merge()
        self._concat_downward(concat_between_pages=False)
        self._filter_forpages()
        callback(0.77, "Text merging finished")
        tbls = self._extract_table_figure(True, zoomin, False)

        # clean mess
            b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())

        # merge chunks with the same bullets
        self._merge_with_same_bullet()

        # merge title with decent chunk
        i = 0
        while i + 1 < len(self.boxes):

            b_["top"] = b["top"]
            self.boxes.pop(i)

        callback(0.8, "Parsing finished")
        for b in self.boxes: print(b["text"], b.get("layoutno"))
        print(tbls)
import copy
import re
from collections import Counter
from rag.app import tokenize
from rag.nlp import huqie
from rag.parser.pdf_parser import HuParser
import numpy as np
from rag.utils import num_tokens_from_string

            zoomin,
            from_page,
            to_page)
        callback(0.2, "OCR finished.")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_paddle(zoomin)
        callback(0.47, "Layout analysis finished")
        print("paddle layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.68, "Table analysis finished")
        self._text_merge()
        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
        self._concat_downward(concat_between_pages=False)
        self._filter_forpages()
        callback(0.75, "Text merging finished.")
        tbls = self._extract_table_figure(True, zoomin, False)

        # clean mess

                break
        if not abstr: i = 0

        callback(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)))
        for b in self.boxes: print(b["text"], b.get("layoutno"))
        print(tbls)


from io import BytesIO
from pptx import Presentation
from rag.app import tokenize, is_english
from rag.nlp import huqie
from rag.parser.pdf_parser import HuParser

                if txt: texts.append(txt)
            txts.append("\n".join(texts))

        callback(0.5, "Text extraction finished.")
        import aspose.slides as slides
        import aspose.pydrawing as drawing
        imgs = []

            slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
            imgs.append(buffered.getvalue())
        assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
        callback(0.9, "Image extraction finished")
        self.is_english = is_english(txts)
        return [(txts[i], imgs[i]) for i in range(len(txts))]

    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
        self.__images__(filename if not binary else binary, zoomin, from_page, to_page)
        callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)))
        assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
        res = []
        #################### More precisely ###################
        for i in range(len(self.boxes)):
            lines = "\n".join([b["text"] for b in self.boxes[i] if not self.__garbage(b["text"])])
            res.append((lines, self.page_images[i]))
        callback(0.9, "Page {}~{}: Parsing finished".format(from_page, min(to_page, self.total_page)))
        return res
import random
import re
from io import BytesIO
from nltk import word_tokenize
from openpyxl import load_workbook
from rag.app import is_english
from rag.nlp import huqie, stemmer


class Excel(object):
    def __call__(self, fnm, binary=None, callback=None):
        if not binary:
            wb = load_workbook(fnm)
        else:
            wb = load_workbook(BytesIO(binary))
        total = 0
        for sheetname in wb.sheetnames:
            total += len(list(wb[sheetname].rows))

        res, fails = [], []
        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            rows = list(ws.rows)
            for i, r in enumerate(rows):
                q, a = "", ""
                for cell in r:
                    if not cell.value: continue
                    if not q: q = str(cell.value)
                    elif not a: a = str(cell.value)
                    else: break
                if q and a: res.append((q, a))
                else: fails.append(str(i + 1))
                if len(res) % 999 == 0:
                    callback(len(res) * 0.6 / total, ("Extract Q&A: {}".format(len(res)) +
                             (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

        callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

        self.is_english = is_english([rmPrefix(q) for q, _ in random.choices(res, k=30) if len(q) > 1])
        return res


def rmPrefix(txt):
    return re.sub(r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)


def beAdoc(d, q, a, eng):
    qprefix = "Question: " if eng else "问题:"
    aprefix = "Answer: " if eng else "回答:"
    d["content_with_weight"] = "\t".join([qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
    if eng:
        d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(q)])
    else:
        d["content_ltks"] = huqie.qie(q)
    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
    return d
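# Doctest-style illustration of the two helpers above (hypothetical inputs, not from the
# original file):
#
#   rmPrefix("Question: What is RAGFlow?")  -> "What is RAGFlow?"
#   rmPrefix("答:一个RAG引擎")               -> "一个RAG引擎"
#   beAdoc({}, "Q: What is RAG?", "A: Retrieval-augmented generation.", True)["content_with_weight"]
#   -> "Question: What is RAG?\tAnswer: Retrieval-augmented generation."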
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
    res = []
    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = Excel()
        for q, a in excel_parser(filename, binary, callback):
            res.append(beAdoc({}, q, a, excel_parser.is_english))
        return res
    elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
        if binary:
            txt = binary.decode("utf-8")
        else:
            with open(filename, "r") as f:
                while True:
                    l = f.readline()
                    if not l: break
                    txt += l
        lines = txt.split("\n")
        eng = is_english([rmPrefix(l) for l in lines[:100]])
        fails = []
        for i, line in enumerate(lines):
            arr = [l for l in line.split("\t") if len(l) > 1]
            if len(arr) != 2:
                fails.append(str(i))
                continue
            res.append(beAdoc({}, arr[0], arr[1], eng))
            if len(res) % 999 == 0:
                callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
                    f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

        callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

        return res

    raise NotImplementedError("file type not supported yet(xlsx, txt, csv supported)")


if __name__ == "__main__":
    import sys

    def kk(rat, ss):
        pass
    print(chunk(sys.argv[1], callback=kk))
            return
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())

            for k in range(i, j): self.boxes.pop(i)
            break

    def _merge_with_same_bullet(self):
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)
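        # Illustrative sketch of the merge above (hypothetical boxes, not original comments):
        # consecutive OCR boxes that start with the same non-alphabetic bullet character and
        # overlap vertically are collapsed into the later box, e.g.
        #
        #   {"text": "• first item",  "top": 100, "bottom": 112, ...}
        #   {"text": "• second item", "top": 110, "bottom": 122, ...}
        #   -> one box whose text is "• first item\n• second item"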
    def _blockType(self, b):
        patt = [
            ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import logging
import os
import time
import random
from datetime import datetime
from api.db.db_models import Task
from api.db.db_utils import bulk_insert_into_db
from api.db.services.task_service import TaskService
from rag.parser.pdf_parser import HuParser
from rag.settings import cron_logger
from rag.utils import MINIO
from rag.utils import findMaxTm
import pandas as pd
from api.db import FileType, TaskStatus
from api.db.services.document_service import DocumentService
from api.settings import database_logger
from api.utils import get_format_time, get_uuid
from api.utils.file_utils import get_project_base_directory


def collect(tm):
    docs = DocumentService.get_newly_uploaded(tm)
    if len(docs) == 0:
        return pd.DataFrame()
    docs = pd.DataFrame(docs)
    mtm = docs["update_time"].max()
    cron_logger.info("TOTAL:{}, To:{}".format(len(docs), mtm))
    return docs


def set_dispatching(docid):
    try:
        DocumentService.update_by_id(
            docid, {"progress": random.randint(0, 3) / 100.,
                    "progress_msg": "Task dispatched...",
                    "process_begin_at": get_format_time()
                    })
    except Exception as e:
        cron_logger.error("set_dispatching:({}), {}".format(docid, str(e)))


def dispatch():
    tm_fnm = os.path.join(get_project_base_directory(), "rag/res", f"broker.tm")
    tm = findMaxTm(tm_fnm)
    rows = collect(tm)
    if len(rows) == 0:
        return

    tmf = open(tm_fnm, "a+")
    for _, r in rows.iterrows():
        try:
            tsks = TaskService.query(doc_id=r["id"])
            if tsks:
                for t in tsks:
                    TaskService.delete_by_id(t.id)
        except Exception as e:
            cron_logger.error("delete task exception:" + str(e))

        def new_task():
            nonlocal r
            return {
                "id": get_uuid(),
                "doc_id": r["id"]
            }

        tsks = []
        if r["type"] == FileType.PDF.value:
            pages = HuParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
            for p in range(0, pages, 10):
                task = new_task()
                task["from_page"] = p
                task["to_page"] = min(p + 10, pages)
                tsks.append(task)
        else:
            tsks.append(new_task())
        print(tsks)
        bulk_insert_into_db(Task, tsks, True)
        set_dispatching(r["id"])
        tmf.write(str(r["update_time"]) + "\n")
    tmf.close()


def update_progress():
    docs = DocumentService.get_unfinished_docs()
    for d in docs:
        try:
            tsks = TaskService.query(doc_id=d["id"], order_by=Task.create_time)
            if not tsks: continue
            msg = []
            prg = 0
            finished = True
            bad = 0
            status = TaskStatus.RUNNING.value
            for t in tsks:
                if 0 <= t.progress < 1: finished = False
                prg += t.progress if t.progress >= 0 else 0
                msg.append(t.progress_msg)
                if t.progress == -1: bad += 1
            prg /= len(tsks)
            if finished and bad:
                prg = -1
                status = TaskStatus.FAIL.value
            elif finished:
                status = TaskStatus.DONE.value

            msg = "\n".join(msg)
            info = {"process_duation": datetime.timestamp(datetime.now()) - d["process_begin_at"].timestamp(), "run": status}
            if prg != 0: info["progress"] = prg
            if msg: info["progress_msg"] = msg
            DocumentService.update_by_id(d["id"], info)
        except Exception as e:
            cron_logger.error("fetch task exception:" + str(e))


if __name__ == "__main__":
    peewee_logger = logging.getLogger('peewee')
    peewee_logger.propagate = False
    peewee_logger.addHandler(database_logger.handlers[0])
    peewee_logger.setLevel(database_logger.level)

    while True:
        dispatch()
        time.sleep(3)
        update_progress()
from functools import partial
from timeit import default_timer as timer

from elasticsearch_dsl import Q

from api.db.services.task_service import TaskService
from rag.llm import EmbeddingModel, CvModel
from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
from rag.utils import ELASTICSEARCH
from rag.utils import MINIO
from io import BytesIO
import pandas as pd

from rag.app import laws, paper, presentation, manual, qa

from api.db import LLMType, ParserType
from api.db.services.document_service import DocumentService

    ParserType.PRESENTATION.value: presentation,
    ParserType.MANUAL.value: manual,
    ParserType.LAWS.value: laws,
    ParserType.QA.value: qa,
}


def set_progress(task_id, from_page, to_page, prog=None, msg="Processing..."):
    cancel = TaskService.do_cancel(task_id)
    if cancel:
        msg += " [Canceled]"
        prog = -1

    if to_page > 0: msg = f"Page({from_page}~{to_page}): " + msg
def embedding(docs, mdl):
    tts, cnts = [d["docnm_kwd"] for d in docs if d.get("docnm_kwd")], [d["content_with_weight"] for d in docs]
    tk_count = 0
    if len(tts) == len(cnts):
        tts, c = mdl.encode(tts)
        tk_count += c

    cnts, c = mdl.encode(cnts)
    tk_count += c
    vects = (0.1 * tts + 0.9 * cnts) if len(tts) == len(cnts) else cnts

    assert len(vects) == len(docs)
    for i, d in enumerate(docs):
        v = vects[i].tolist()
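# Hedged sketch of the weighting above (assumes mdl.encode returns a 2-D numpy array with
# one row per input text): document-name vectors and content vectors are mixed 1:9, unless
# some chunk lacks docnm_kwd, in which case only the content vectors are used.
#
#   import numpy as np
#   tts, cnts = np.array([[1.0, 0.0]]), np.array([[0.0, 1.0]])
#   0.1 * tts + 0.9 * cnts   # -> array([[0.1, 0.9]])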
| callback(msg="Finished embedding! Start to build index!") | callback(msg="Finished embedding! Start to build index!") | ||||
| init_kb(r) | init_kb(r) | ||||
| chunk_count = len(set([c["_id"] for c in cks])) | chunk_count = len(set([c["_id"] for c in cks])) | ||||
| callback(1., "Done!") | |||||
| es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"])) | es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"])) | ||||
| if es_r: | if es_r: | ||||
| callback(-1, "Index failure!") | callback(-1, "Index failure!") | ||||
| cron_logger.error(str(es_r)) | cron_logger.error(str(es_r)) | ||||
| else: | else: | ||||
| if TaskService.do_cancel(r["id"]): | |||||
| ELASTICSEARCH.deleteByQuery(Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"])) | |||||
| callback(1., "Done!") | |||||
| DocumentService.increment_chunk_num(r["doc_id"], r["kb_id"], tk_count, chunk_count, 0) | DocumentService.increment_chunk_num(r["doc_id"], r["kb_id"], tk_count, chunk_count, 0) | ||||
| cron_logger.info("Chunk doc({}), token({}), chunks({})".format(r["id"], tk_count, len(cks))) | cron_logger.info("Chunk doc({}), token({}), chunks({})".format(r["id"], tk_count, len(cks))) | ||||