from flask import request
from flask_login import login_required, current_user
from elasticsearch_dsl import Q

from rag.app.qa import rmPrefix, beAdoc
from rag.nlp import search, huqie, retrievaler
from rag.utils import ELASTICSEARCH, rmSpace
from api.db import LLMType, ParserType
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import TenantLLMService
from api.db.services.user_service import UserTenantService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
| res["chunk_id"] = id | res["chunk_id"] = id | ||||
| k = [] | k = [] | ||||
| for n in res.keys(): | for n in res.keys(): | ||||
| if re.search(r"(_vec$|_sm_)", n): | |||||
| if re.search(r"(_vec$|_sm_|_tks|_ltks)", n): | |||||
| k.append(n) | k.append(n) | ||||
| if re.search(r"(_tks|_ltks)", n): | |||||
| res[n] = rmSpace(res[n]) | |||||
| for n in k: | for n in k: | ||||
| del res[n] | del res[n] | ||||
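        # Illustrative sketch (not part of the original handler): assuming `res` is the
        # `_source` dict of an Elasticsearch chunk hit, the loop above strips internal
        # fields (vectors, tokenized variants) before the chunk is returned to the client.
        # The field names below are hypothetical examples:
        #
        #   res = {"docnm_kwd": "a.pdf", "content_with_weight": "...",
        #          "content_ltks": "a pdf ...", "q_768_vec": [0.1, ...]}
        #   -> after the loop only {"docnm_kwd", "content_with_weight", "chunk_id"} remain.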
@manager.route('/set', methods=['POST'])
@login_required
@validate_request("doc_id", "chunk_id", "content_with_weight",
                  "important_kwd")
def set():
    req = request.json
    d = {"id": req["chunk_id"]}
    d["content_ltks"] = huqie.qie(req["content_with_weight"])
    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
    d["important_kwd"] = req["important_kwd"]
    d["important_tks"] = huqie.qie(" ".join(req["important_kwd"]))

    e, doc = DocumentService.get_by_id(req["doc_id"])
    if not e:
        return get_data_error_result(retmsg="Document not found!")

    if doc.parser_id == ParserType.QA:
        arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t) > 1]
        if len(arr) != 2:
            return get_data_error_result(retmsg="Q&A must be separated by TAB/ENTER key.")
        q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
        d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q + a]))

    v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
    v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
    d["q_%d_vec" % len(v)] = v.tolist()
    ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
    return get_json_result(data=True)
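# Hedged usage sketch (not from the original source): a client updating a chunk of a
# Q&A-parsed document is expected to keep question and answer in one string separated
# by a TAB or newline, matching the split on r"[\n\t]" above. The payload below is a
# hypothetical example.
#
#   POST /chunk/set
#   {"doc_id": "...", "chunk_id": "...", "important_kwd": [],
#    "content_with_weight": "Question: What is RAGFlow?\tAnswer: A RAG engine."}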
from flask_login import login_required, current_user
from api.db.services.dialog_service import DialogService
from api.db import StatusEnum
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.user_service import TenantService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid
from rag.nlp import search
from rag.utils import ELASTICSEARCH
from api.db.services import duplicate_name
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid
from api.db import FileType, TaskStatus
from api.db.services.document_service import DocumentService
from api.settings import RetCode
from api.utils.api_utils import get_json_result


@manager.route('/run', methods=['POST'])
@login_required
@validate_request("doc_ids", "run")
def run():
    req = request.json
    try:
        for id in req["doc_ids"]:
            DocumentService.update_by_id(id, {"run": str(req["run"]), "progress": 0})
            if str(req["run"]) == TaskStatus.CANCEL.value:
                tenant_id = DocumentService.get_tenant_id(id)
                if not tenant_id:
                    return get_data_error_result(retmsg="Tenant not found!")

        if doc.parser_id.lower() == req["parser_id"].lower():
            return get_json_result(data=True)

        e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress": 0, "progress_msg": ""})
        if not e:
            return get_data_error_result(retmsg="Document not found!")
        if doc.token_num > 0:
            e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, doc.process_duation * -1)
            if not e:
                return get_data_error_result(retmsg="Document not found!")

        return get_json_result(data=True)
    except Exception as e:
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid, get_format_time
from api.db import StatusEnum, UserTenantRole
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.db_models import Knowledgebase
from api.settings import stat_logger, RetCode
from api.utils.api_utils import get_json_result
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid, get_format_time
from api.db import StatusEnum, UserTenantRole
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.db_models import Knowledgebase, TenantLLM
from api.settings import stat_logger, RetCode
from api.utils.api_utils import get_json_result
CUSTOM = 'Custom'


class TaskStatus(StrEnum):
    RUNNING = "1"
    CANCEL = "2"
    DONE = "3"
    FAIL = "4"


class ParserType(StrEnum):
    GENERAL = "general"
    PRESENTATION = "presentation"
    LAWS = "laws"
    MANUAL = "manual"
    PAPER = "paper"
    RESUME = "resume"
    BOOK = "book"
    QA = "qa"
    DB.create_tables([model])

    for i, data in enumerate(data_source):
        current_time = current_timestamp() + i
        current_date = timestamp_to_date(current_time)
        if 'create_time' not in data:
            data['create_time'] = current_time
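    # Why `current_timestamp() + i`: every row in a single bulk insert gets a distinct
    # create_time, so records written together can still be ordered and sharded
    # deterministically (the `create_time %% comm == mod` dispatch filter used later
    # relies on differing values). Hypothetical illustration, not original code:
    #
    #   bulk_insert_into_db(Task, [{"id": get_uuid(), "doc_id": "d1"},
    #                              {"id": get_uuid(), "doc_id": "d1"}], True)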
#
from peewee import Expression

from api.db import TenantPermission, FileType, TaskStatus
from api.db.db_models import DB, Knowledgebase, Tenant
from api.db.db_models import Document
from api.db.services.common_service import CommonService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db import StatusEnum

                ~(cls.model.type == FileType.VIRTUAL.value),
                cls.model.progress == 0,
                cls.model.update_time >= tm,
                cls.model.run == TaskStatus.RUNNING.value,
                (Expression(cls.model.create_time, "%%", comm) == mod))\
            .order_by(cls.model.update_time.asc())\
            .paginate(1, items_per_page)
# See the License for the specific language governing permissions and
# limitations under the License.
#
from api.db import StatusEnum, TenantPermission
from api.db.db_models import Knowledgebase, DB, Tenant
from api.db.services.common_service import CommonService


class KnowledgebaseService(CommonService):
    model = Knowledgebase

    @classmethod
    @DB.connection_context()
    def get_by_tenant_ids(cls, joined_tenant_ids, user_id,
                          page_number, items_per_page, orderby, desc):
        kbs = cls.model.select().where(
            ((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
                TenantPermission.TEAM.value)) | (cls.model.tenant_id == user_id))
            & (cls.model.status == StatusEnum.VALID.value)
        )
        if desc:
            kbs = kbs.order_by(cls.model.getter_by(orderby).desc())
        else:
            kbs = kbs.order_by(cls.model.getter_by(orderby).asc())

        kbs = kbs.paginate(page_number, items_per_page)
        return list(kbs.dicts())

    @classmethod
    @DB.connection_context()
    def get_detail(cls, kb_id):
        fields = [
            cls.model.id,
            Tenant.embd_id,
            cls.model.avatar,
            cls.model.name,
            cls.model.description,
            cls.model.permission,
            cls.model.doc_num,
            cls.model.token_num,
            cls.model.chunk_num,
            cls.model.parser_id]
        kbs = cls.model.select(*fields).join(Tenant, on=(
            (Tenant.id == cls.model.tenant_id) & (Tenant.status == StatusEnum.VALID.value))).where(
            (cls.model.id == kb_id),
            (cls.model.status == StatusEnum.VALID.value)
        )
        if not kbs:
            return
        d = kbs[0].to_dict()
        d["embd_id"] = kbs[0].tenant.embd_id
        return d
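    # Hedged usage sketch (assumed call site, not from this diff): get_detail() joins the
    # owning Tenant so the knowledge-base detail also carries the tenant's embedding model id.
    #
    #   kb = KnowledgebaseService.get_detail(kb_id)
    #   if kb:
    #       embd_model_id = kb["embd_id"]  # e.g. used to pick the embedding model per KB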
#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
from peewee import Expression
from api.db.db_models import DB
from api.db import StatusEnum, FileType, TaskStatus
from api.db.db_models import Task, Document, Knowledgebase, Tenant
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService


class TaskService(CommonService):
    model = Task

    @classmethod
    @DB.connection_context()
    def get_tasks(cls, tm, mod=0, comm=1, items_per_page=64):
        fields = [cls.model.id, cls.model.doc_id, cls.model.from_page, cls.model.to_page, Document.kb_id,
                  Document.parser_id, Document.name, Document.type, Document.location, Document.size,
                  Knowledgebase.tenant_id, Tenant.embd_id, Tenant.img2txt_id, Tenant.asr_id, cls.model.update_time]
        docs = cls.model.select(*fields) \
            .join(Document, on=(cls.model.doc_id == Document.id)) \
            .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
            .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
            .where(
                Document.status == StatusEnum.VALID.value,
                ~(Document.type == FileType.VIRTUAL.value),
                cls.model.progress == 0,
                cls.model.update_time >= tm,
                (Expression(cls.model.create_time, "%%", comm) == mod))\
            .order_by(cls.model.update_time.asc())\
            .paginate(1, items_per_page)
        return list(docs.dicts())

    @classmethod
    @DB.connection_context()
    def do_cancel(cls, id):
        try:
            task = cls.model.get_by_id(id)
            _, doc = DocumentService.get_by_id(task.doc_id)
            return doc.run == TaskStatus.CANCEL.value
        except Exception as e:
            pass
        return True
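    # Minimal sketch of the cancellation contract implied above (assumption: callers poll
    # this between processing steps): a task counts as cancelled when its row has been
    # deleted or its parent document's `run` flag was set to TaskStatus.CANCEL.
    #
    #   if TaskService.do_cancel(task_id):
    #       return  # stop chunking/embedding for this task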
| if re.match(r".*\.pdf$", filename): | if re.match(r".*\.pdf$", filename): | ||||
| return FileType.PDF.value | return FileType.PDF.value | ||||
| if re.match(r".*\.(docx|doc|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key|md)$", filename): | |||||
| if re.match(r".*\.(docx|doc|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename): | |||||
| return FileType.DOC.value | return FileType.DOC.value | ||||
| if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename): | if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename): |
from rag.nlp import stemmer, huqie


BULLET_PATTERN = [[
    r"第[零一二三四五六七八九十百]+(编|部分)",
    r"第[零一二三四五六七八九十百]+章",
    r"第[零一二三四五六七八九十百]+节",
    r"第[零一二三四五六七八九十百]+条",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
    r"第[零一二三四五六七八九十百]+章",
    r"第[零一二三四五六七八九十百]+节",
    r"[零一二三四五六七八九十百]+[ 、]",
    r"[\((][零一二三四五六七八九十百]+[\))]",
    r"[\((][0-9]{,2}[\))]",


def is_english(texts):
    eng = 0
    for t in texts:
        if re.match(r"[a-zA-Z]{2,}", t.strip()):
            eng += 1
    if eng / len(texts) > 0.8:
        return True

    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
def remove_contents_table(sections, eng=False):
    i = 0
    while i < len(sections):
        def get(i):
            nonlocal sections
            return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()
        if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                        re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0]), re.IGNORECASE):
            i += 1
            continue
        sections.pop(i)
        if i >= len(sections):
            break
        prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
        while not prefix:
            sections.pop(i)
            if i >= len(sections):
                break
            prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
        sections.pop(i)
        if i >= len(sections) or not prefix:
            break
        for j in range(i, min(i + 128, len(sections))):
            if not re.match(prefix, get(j)):
                continue
            for _ in range(i, j):
                sections.pop(i)
            break
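# Hedged example of the intended effect (hypothetical input, not original code): the
# "Contents"/"目录" heading plus the table-of-contents entries after it are dropped in
# place, up to the line whose prefix repeats the first entry (i.e. where the body starts).
#
#   secs = ["Contents", "1.1 Intro ... 3", "1.2 Scope ... 5", "1.1 Intro", "Body text"]
#   remove_contents_table(secs, eng=True)
#   # secs is now roughly ["1.1 Intro", "Body text"]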
import copy
import random
import re
from io import BytesIO
from docx import Document
import numpy as np
from rag.app import bullets_category, BULLET_PATTERN, is_english, tokenize, remove_contents_table
from rag.nlp import huqie
from rag.parser.docx_parser import HuDocxParser
from rag.parser.pdf_parser import HuParser


class Pdf(HuParser):
    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page)
        callback(0.1, "OCR finished")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_paddle(zoomin)
        callback(0.47, "Layout analysis finished")
        print("paddle layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.68, "Table analysis finished")
        self._text_merge()
        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
        self._concat_downward(concat_between_pages=False)
        self._filter_forpages()
        self._merge_with_same_bullet()
        callback(0.75, "Text merging finished.")
        tbls = self._extract_table_figure(True, zoomin, False)

        callback(0.8, "Text extraction finished")

        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes], tbls


def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    pdf_parser = None
    sections, tbls = [], []
    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        doc_parser = HuDocxParser()
        # TODO: table of contents need to be removed
        sections, tbls = doc_parser(binary if binary else filename)
        remove_contents_table(sections, eng=is_english(random.choices([t for t, _ in sections], k=200)))
        callback(0.8, "Finish parsing.")
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        sections, tbls = pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback)
    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
        if binary:
            txt = binary.decode("utf-8")
        else:
            with open(filename, "r") as f:
                while True:
                    l = f.readline()
                    if not l:
                        break
                    txt += l
        sections = txt.split("\n")
        sections = [(l, "") for l in sections if l]
        remove_contents_table(sections, eng=is_english(random.choices([t for t, _ in sections], k=200)))
        callback(0.8, "Finish parsing.")
    else:
        raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

    bull = bullets_category([t for t, _ in random.choices(sections, k=100)])
    projs = [len(BULLET_PATTERN[bull]) + 1] * len(sections)
    levels = [[] for _ in range(len(BULLET_PATTERN[bull]) + 2)]
    for i, (txt, layout) in enumerate(sections):
        for j, p in enumerate(BULLET_PATTERN[bull]):
            if re.match(p, txt.strip()):
                projs[i] = j
                levels[j].append(i)
                break
        else:
            if re.search(r"(title|head)", layout):
                projs[i] = len(BULLET_PATTERN[bull])
                levels[len(BULLET_PATTERN[bull])].append(i)
            else:
                levels[len(BULLET_PATTERN[bull]) + 1].append(i)
    sections = [t for t, _ in sections]

    def binary_search(arr, target):
        if target > arr[-1]:
            return len(arr) - 1
        if target < arr[0]:
            return -1
        s, e = 0, len(arr)
        while e - s > 1:
            i = (e + s) // 2
            if target > arr[i]:
                s = i
                continue
            elif target < arr[i]:
                e = i
                continue
            else:
                assert False
        return s
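    # Hedged note on binary_search above (illustrative, not original comments): `arr` is a
    # sorted list of section indices for one heading level; the helper returns the position
    # of the last entry smaller than `target`, or -1 if every entry is larger, so a section
    # can look up its nearest enclosing heading at each coarser level.
    #
    #   binary_search([2, 7, 11], 9)  -> 1   (levels[ii][1] == 7 is the governing heading)
    #   binary_search([2, 7, 11], 1)  -> -1  (no heading precedes this section at that level)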
    cks = []
    readed = [False] * len(sections)
    levels = levels[::-1]
    for i, arr in enumerate(levels):
        for j in arr:
            if readed[j]:
                continue
            readed[j] = True
            cks.append([j])
            if i + 1 == len(levels) - 1:
                continue
            for ii in range(i + 1, len(levels)):
                jj = binary_search(levels[ii], j)
                if jj < 0:
                    break
                if jj > cks[-1][-1]:
                    cks[-1].pop(-1)
                cks[-1].append(levels[ii][jj])

    # is it English
    eng = is_english(random.choices(sections, k=218))

    res = []
    # add tables
    for img, rows in tbls:
        bs = 10
        de = ";" if eng else ";"
        for i in range(0, len(rows), bs):
            d = copy.deepcopy(doc)
            r = de.join(rows[i:i + bs])
            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
            tokenize(d, r, eng)
            d["image"] = img
            res.append(d)

    # wrap up to es documents
    for ck in cks:
        print("\n-".join([sections[j] for j in ck[::-1]]))
        ck = "\n".join([sections[j] for j in ck[::-1]])
        d = copy.deepcopy(doc)
        if pdf_parser:
            d["image"] = pdf_parser.crop(ck)
            ck = pdf_parser.remove_tag(ck)
        tokenize(d, ck, eng)
        res.append(d)
    return res


if __name__ == "__main__":
    import sys

    def dummy(prog, msg):
        pass
    chunk(sys.argv[1], callback=dummy)
from io import BytesIO
from docx import Document
import numpy as np
from rag.app import bullets_category, BULLET_PATTERN, is_english, tokenize
from rag.nlp import huqie
from rag.parser.docx_parser import HuDocxParser
from rag.parser.pdf_parser import HuParser

            zoomin,
            from_page,
            to_page)
        callback(0.1, "OCR finished")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_paddle(zoomin)
        callback(0.77, "Layout analysis finished")
        print("paddle layouts:", timer() - start)
        bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)

        # is it English

            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)

        callback(0.8, "Text extraction finished")

        return [b["text"] + self._line_tag(b, zoomin) for b in bxs]

    pdf_parser = None
    sections = []
    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        for txt in Docx()(filename, binary):
            sections.append(txt)
        callback(0.8, "Finish parsing.")
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        for txt in pdf_parser(filename if not binary else binary,
                              from_page=from_page, to_page=to_page, callback=callback):
            sections.append(txt)
    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
        if binary:
            txt = binary.decode("utf-8")
        else:

                    txt += l
        sections = txt.split("\n")
        sections = [l for l in sections if l]
        callback(0.8, "Finish parsing.")
    else:
        raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

    # is it English
    # Remove 'Contents' part
    i = 0
    while i < len(sections):
        if not re.match(r"(contents|目录|目次|table of contents)$",
                        re.sub(r"( | |\u3000)+", "", sections[i].split("@@")[0]), re.IGNORECASE):
            i += 1
            continue
        sections.pop(i)

        for j in range(i, min(i + 128, len(sections))):
            if not re.match(prefix, sections[j]):
                continue
            for _ in range(i, j):
                sections.pop(i)
            break

    bull = bullets_category(sections)
import copy
import re
from rag.app import tokenize
from rag.nlp import huqie
from rag.parser.pdf_parser import HuParser
from rag.utils import num_tokens_from_string

            zoomin,
            from_page,
            to_page)
        callback(0.2, "OCR finished.")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_paddle(zoomin)
        callback(0.5, "Layout analysis finished.")
        print("paddle layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.7, "Table analysis finished.")
        self._text_merge()
        self._concat_downward(concat_between_pages=False)
        self._filter_forpages()
        callback(0.77, "Text merging finished")
        tbls = self._extract_table_figure(True, zoomin, False)

        # clean mess
            b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())

        # merge chunks with the same bullets
        self._merge_with_same_bullet()

        # merge title with decent chunk
        i = 0
        while i + 1 < len(self.boxes):

            b_["top"] = b["top"]
            self.boxes.pop(i)

        callback(0.8, "Parsing finished")
        for b in self.boxes: print(b["text"], b.get("layoutno"))
        print(tbls)
import copy
import re
from collections import Counter
from rag.app import tokenize
from rag.nlp import huqie
from rag.parser.pdf_parser import HuParser
import numpy as np
from rag.utils import num_tokens_from_string

            zoomin,
            from_page,
            to_page)
        callback(0.2, "OCR finished.")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_paddle(zoomin)
        callback(0.47, "Layout analysis finished")
        print("paddle layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.68, "Table analysis finished")
        self._text_merge()
        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
        self._concat_downward(concat_between_pages=False)
        self._filter_forpages()
        callback(0.75, "Text merging finished.")
        tbls = self._extract_table_figure(True, zoomin, False)

        # clean mess

                break
        if not abstr: i = 0

        callback(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)))
        for b in self.boxes: print(b["text"], b.get("layoutno"))
        print(tbls)


from io import BytesIO
from pptx import Presentation
from rag.app import tokenize, is_english
from rag.nlp import huqie
from rag.parser.pdf_parser import HuParser

                if txt: texts.append(txt)
            txts.append("\n".join(texts))

        callback(0.5, "Text extraction finished.")
        import aspose.slides as slides
        import aspose.pydrawing as drawing
        imgs = []

            slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
            imgs.append(buffered.getvalue())
        assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
        callback(0.9, "Image extraction finished")
        self.is_english = is_english(txts)
        return [(txts[i], imgs[i]) for i in range(len(txts))]

    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
        self.__images__(filename if not binary else binary, zoomin, from_page, to_page)
        callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)))
        assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
        res = []
        #################### More precisely ###################
        for i in range(len(self.boxes)):
            lines = "\n".join([b["text"] for b in self.boxes[i] if not self.__garbage(b["text"])])
            res.append((lines, self.page_images[i]))
        callback(0.9, "Page {}~{}: Parsing finished".format(from_page, min(to_page, self.total_page)))
        return res
import random
import re
from io import BytesIO
from nltk import word_tokenize
from openpyxl import load_workbook
from rag.app import is_english
from rag.nlp import huqie, stemmer


class Excel(object):
    def __call__(self, fnm, binary=None, callback=None):
        if not binary:
            wb = load_workbook(fnm)
        else:
            wb = load_workbook(BytesIO(binary))
        total = 0
        for sheetname in wb.sheetnames:
            total += len(list(wb[sheetname].rows))

        res, fails = [], []
        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            rows = list(ws.rows)
            for i, r in enumerate(rows):
                q, a = "", ""
                for cell in r:
                    if not cell.value: continue
                    if not q: q = str(cell.value)
                    elif not a: a = str(cell.value)
                    else: break
                if q and a: res.append((q, a))
                else: fails.append(str(i + 1))
                if len(res) % 999 == 0:
                    callback(len(res) * 0.6 / total, ("Extract Q&A: {}".format(len(res)) +
                             (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

        callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

        self.is_english = is_english([rmPrefix(q) for q, _ in random.choices(res, k=30) if len(q) > 1])
        return res


def rmPrefix(txt):
    return re.sub(r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)


def beAdoc(d, q, a, eng):
    qprefix = "Question: " if eng else "问题:"
    aprefix = "Answer: " if eng else "回答:"
    d["content_with_weight"] = "\t".join([qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
    if eng:
        d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(q)])
    else:
        d["content_ltks"] = huqie.qie(q)
    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
    return d
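# Doctest-style illustration of the two helpers above (hypothetical inputs, not from the
# original file):
#
#   rmPrefix("Question: What is RAGFlow?")  -> "What is RAGFlow?"
#   rmPrefix("答:一个RAG引擎")               -> "一个RAG引擎"
#   beAdoc({}, "Q: What is RAG?", "A: Retrieval-augmented generation.", True)["content_with_weight"]
#   -> "Question: What is RAG?\tAnswer: Retrieval-augmented generation."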
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
    res = []
    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = Excel()
        for q, a in excel_parser(filename, binary, callback):
            res.append(beAdoc({}, q, a, excel_parser.is_english))
        return res
    elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
        if binary:
            txt = binary.decode("utf-8")
        else:
            with open(filename, "r") as f:
                while True:
                    l = f.readline()
                    if not l: break
                    txt += l
        lines = txt.split("\n")
        eng = is_english([rmPrefix(l) for l in lines[:100]])
        fails = []
        for i, line in enumerate(lines):
            arr = [l for l in line.split("\t") if len(l) > 1]
            if len(arr) != 2:
                fails.append(str(i))
                continue
            res.append(beAdoc({}, arr[0], arr[1], eng))
            if len(res) % 999 == 0:
                callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
                    f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

        callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

        return res

    raise NotImplementedError("file type not supported yet(xlsx, txt, csv supported)")


if __name__ == "__main__":
    import sys

    def kk(rat, ss):
        pass
    print(chunk(sys.argv[1], callback=kk))
            return
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())

            for k in range(i, j): self.boxes.pop(i)
            break

    def _merge_with_same_bullet(self):
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)
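        # Illustrative sketch of the merge above (hypothetical boxes, not original comments):
        # consecutive OCR boxes that start with the same non-alphabetic bullet character and
        # overlap vertically are collapsed into the later box, e.g.
        #
        #   {"text": "• first item",  "top": 100, "bottom": 112, ...}
        #   {"text": "• second item", "top": 110, "bottom": 122, ...}
        #   -> one box whose text is "• first item\n• second item"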
    def _blockType(self, b):
        patt = [
            ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import logging
import os
import time
import random
from datetime import datetime
from api.db.db_models import Task
from api.db.db_utils import bulk_insert_into_db
from api.db.services.task_service import TaskService
from rag.parser.pdf_parser import HuParser
from rag.settings import cron_logger
from rag.utils import MINIO
from rag.utils import findMaxTm
import pandas as pd
from api.db import FileType, TaskStatus
from api.db.services.document_service import DocumentService
from api.settings import database_logger
from api.utils import get_format_time, get_uuid
from api.utils.file_utils import get_project_base_directory


def collect(tm):
    docs = DocumentService.get_newly_uploaded(tm)
    if len(docs) == 0:
        return pd.DataFrame()
    docs = pd.DataFrame(docs)
    mtm = docs["update_time"].max()
    cron_logger.info("TOTAL:{}, To:{}".format(len(docs), mtm))
    return docs


def set_dispatching(docid):
    try:
        DocumentService.update_by_id(
            docid, {"progress": random.randint(0, 3) / 100.,
                    "progress_msg": "Task dispatched...",
                    "process_begin_at": get_format_time()
                    })
    except Exception as e:
        cron_logger.error("set_dispatching:({}), {}".format(docid, str(e)))


def dispatch():
    tm_fnm = os.path.join(get_project_base_directory(), "rag/res", f"broker.tm")
    tm = findMaxTm(tm_fnm)
    rows = collect(tm)
    if len(rows) == 0:
        return

    tmf = open(tm_fnm, "a+")
    for _, r in rows.iterrows():
        try:
            tsks = TaskService.query(doc_id=r["id"])
            if tsks:
                for t in tsks:
                    TaskService.delete_by_id(t.id)
        except Exception as e:
            cron_logger.error("delete task exception:" + str(e))

        def new_task():
            nonlocal r
            return {
                "id": get_uuid(),
                "doc_id": r["id"]
            }

        tsks = []
        if r["type"] == FileType.PDF.value:
            pages = HuParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
            for p in range(0, pages, 10):
                task = new_task()
                task["from_page"] = p
                task["to_page"] = min(p + 10, pages)
                tsks.append(task)
        else:
            tsks.append(new_task())
        print(tsks)
        bulk_insert_into_db(Task, tsks, True)
        set_dispatching(r["id"])
        tmf.write(str(r["update_time"]) + "\n")
    tmf.close()


def update_progress():
    docs = DocumentService.get_unfinished_docs()
    for d in docs:
        try:
            tsks = TaskService.query(doc_id=d["id"], order_by=Task.create_time)
            if not tsks: continue
            msg = []
            prg = 0
            finished = True
            bad = 0
            status = TaskStatus.RUNNING.value
            for t in tsks:
                if 0 <= t.progress < 1: finished = False
                prg += t.progress if t.progress >= 0 else 0
                msg.append(t.progress_msg)
                if t.progress == -1: bad += 1
            prg /= len(tsks)
            if finished and bad:
                prg = -1
                status = TaskStatus.FAIL.value
            elif finished:
                status = TaskStatus.DONE.value

            msg = "\n".join(msg)
            info = {"process_duation": datetime.timestamp(datetime.now()) - d["process_begin_at"].timestamp(), "run": status}
            if prg != 0: info["progress"] = prg
            if msg: info["progress_msg"] = msg
            DocumentService.update_by_id(d["id"], info)
        except Exception as e:
            cron_logger.error("fetch task exception:" + str(e))


if __name__ == "__main__":
    peewee_logger = logging.getLogger('peewee')
    peewee_logger.propagate = False
    peewee_logger.addHandler(database_logger.handlers[0])
    peewee_logger.setLevel(database_logger.level)

    while True:
        dispatch()
        time.sleep(3)
        update_progress()
from functools import partial
from timeit import default_timer as timer

from elasticsearch_dsl import Q

from api.db.services.task_service import TaskService
from rag.llm import EmbeddingModel, CvModel
from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
from rag.utils import ELASTICSEARCH
from rag.utils import MINIO
from io import BytesIO
import pandas as pd

from rag.app import laws, paper, presentation, manual, qa

from api.db import LLMType, ParserType
from api.db.services.document_service import DocumentService

    ParserType.PRESENTATION.value: presentation,
    ParserType.MANUAL.value: manual,
    ParserType.LAWS.value: laws,
    ParserType.QA.value: qa,
}


def set_progress(task_id, from_page, to_page, prog=None, msg="Processing..."):
    cancel = TaskService.do_cancel(task_id)
    if cancel:
        msg += " [Canceled]"
        prog = -1

    if to_page > 0: msg = f"Page({from_page}~{to_page}): " + msg
def embedding(docs, mdl):
    tts, cnts = [d["docnm_kwd"] for d in docs if d.get("docnm_kwd")], [d["content_with_weight"] for d in docs]
    tk_count = 0
    if len(tts) == len(cnts):
        tts, c = mdl.encode(tts)
        tk_count += c

    cnts, c = mdl.encode(cnts)
    tk_count += c
    vects = (0.1 * tts + 0.9 * cnts) if len(tts) == len(cnts) else cnts

    assert len(vects) == len(docs)
    for i, d in enumerate(docs):
        v = vects[i].tolist()
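# Hedged sketch of the weighting above (assumes mdl.encode returns a 2-D numpy array with
# one row per input text): document-name vectors and content vectors are mixed 1:9, unless
# some chunk lacks docnm_kwd, in which case only the content vectors are used.
#
#   import numpy as np
#   tts, cnts = np.array([[1.0, 0.0]]), np.array([[0.0, 1.0]])
#   0.1 * tts + 0.9 * cnts   # -> array([[0.1, 0.9]])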
| callback(msg="Finished embedding! Start to build index!") | callback(msg="Finished embedding! Start to build index!") | ||||
| init_kb(r) | init_kb(r) | ||||
| chunk_count = len(set([c["_id"] for c in cks])) | chunk_count = len(set([c["_id"] for c in cks])) | ||||
| callback(1., "Done!") | |||||
| es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"])) | es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"])) | ||||
| if es_r: | if es_r: | ||||
| callback(-1, "Index failure!") | callback(-1, "Index failure!") | ||||
| cron_logger.error(str(es_r)) | cron_logger.error(str(es_r)) | ||||
| else: | else: | ||||
| if TaskService.do_cancel(r["id"]): | |||||
| ELASTICSEARCH.deleteByQuery(Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"])) | |||||
| callback(1., "Done!") | |||||
| DocumentService.increment_chunk_num(r["doc_id"], r["kb_id"], tk_count, chunk_count, 0) | DocumentService.increment_chunk_num(r["doc_id"], r["kb_id"], tk_count, chunk_count, 0) | ||||
| cron_logger.info("Chunk doc({}), token({}), chunks({})".format(r["id"], tk_count, len(cks))) | cron_logger.info("Chunk doc({}), token({}), chunks({})".format(r["id"], tk_count, len(cks))) | ||||