### What problem does this PR solve?

Refactoring only: rename the internal `Hu*` classes (`HuParser`, `HuDocxParser`, `HuExcelParser`, `HuPptParser`, `HuEmbedding`, `HuEs`, `HuMinio`) to `RAGFlowPdfParser`, `RAGFlowDocxParser`, `RAGFlowExcelParser`, `RAGFlowPptParser`, `DefaultEmbedding`, `ESConnection` and `RAGFlowMinio`, import the `ELASTICSEARCH` and `MINIO` singletons from `rag.utils.es_conn` / `rag.utils.minio_conn` instead of re-exporting them from `rag.utils`, and remove the legacy `HuChunker` module.

### Type of change

- [x] Refactoring

tags/v0.5.0
 from itsdangerous import URLSafeTimedSerializer
 from api.utils.file_utils import filename_type, thumbnail
-from rag.utils import MINIO
+from rag.utils.minio_conn import MINIO
 def generate_confirmation_token(tenent_id):
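For context on the helper touched above: itsdangerous' `URLSafeTimedSerializer` signs a payload together with a timestamp so the token can later be verified and expired. A minimal sketch of that pattern follows; the secret key, salt and return shape are illustrative assumptions, not taken from this PR.

```python
from itsdangerous import URLSafeTimedSerializer

SECRET_KEY = "change-me"  # hypothetical; a real app would read this from its settings


def generate_confirmation_token(tenent_id):
    serializer = URLSafeTimedSerializer(SECRET_KEY)
    # dumps() embeds a signed timestamp; loads(token, max_age=...) can
    # later reject tokens that are too old or tampered with.
    return serializer.dumps(tenent_id, salt="confirm")
```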
 from rag.app.qa import rmPrefix, beAdoc
 from rag.nlp import search, huqie
-from rag.utils import ELASTICSEARCH, rmSpace
+from rag.utils.es_conn import ELASTICSEARCH
+from rag.utils import rmSpace
 from api.db import LLMType, ParserType
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import TenantLLMService
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from api.db.services import duplicate_name
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.settings import RetCode
 from api.utils.api_utils import get_json_result
 from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 @manager.route('/convert', methods=['POST'])
 from api.utils.api_utils import get_json_result
 from api.utils.file_utils import filename_type
 from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
 from api.settings import stat_logger, RetCode
 from api.utils.api_utils import get_json_result
 from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 @manager.route('/create', methods=['post'])
 from peewee import Expression
 from elasticsearch_dsl import Q
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
 from rag.nlp import search
 database_logger = getLogger("database")
 chat_logger = getLogger("chat")
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from rag.nlp import search
 from api.utils import get_base_config, decrypt_database_config
-from .pdf_parser import HuParser as PdfParser, PlainParser
-from .docx_parser import HuDocxParser as DocxParser
-from .excel_parser import HuExcelParser as ExcelParser
-from .ppt_parser import HuPptParser as PptParser
+from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
+from .docx_parser import RAGFlowDocxParser as DocxParser
+from .excel_parser import RAGFlowExcelParser as ExcelParser
+from .ppt_parser import RAGFlowPptParser as PptParser
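A side effect worth noting: because the package init keeps re-exporting the renamed classes under their previous public aliases, call sites elsewhere in this diff (e.g. `from deepdoc.parser import PdfParser`) compile unchanged. A small sketch of that, assuming nothing beyond the aliases shown above:

```python
# External modules keep importing the old public names...
from deepdoc.parser import PdfParser, DocxParser, ExcelParser, PptParser

# ...but each alias now resolves to the renamed implementation class.
assert PdfParser.__name__ == "RAGFlowPdfParser"
```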
 from io import BytesIO
-class HuDocxParser:
+class RAGFlowDocxParser:
     def __extract_table_content(self, tb):
         df = []
 from rag.nlp import find_codec
-class HuExcelParser:
+class RAGFlowExcelParser:
     def html(self, fnm):
         if isinstance(fnm, str):
             wb = load_workbook(fnm)
 if __name__ == "__main__":
-    psr = HuExcelParser()
+    psr = RAGFlowExcelParser()
     psr(sys.argv[1])
| logging.getLogger("pdfminer").setLevel(logging.WARNING) | logging.getLogger("pdfminer").setLevel(logging.WARNING) | ||||
| class HuParser: | |||||
| class RAGFlowPdfParser: | |||||
| def __init__(self): | def __init__(self): | ||||
| self.ocr = OCR() | self.ocr = OCR() | ||||
| if hasattr(self, "model_speciess"): | if hasattr(self, "model_speciess"): |
 from pptx import Presentation
-class HuPptParser(object):
+class RAGFlowPptParser(object):
     def __init__(self):
         super().__init__()
 # limitations under the License.
 #
-from deepdoc.vision.seeit import draw_box
-from deepdoc.vision import OCR, init_in_out
-import argparse
-import numpy as np
 import os
 import sys
 sys.path.insert(
         os.path.abspath(__file__)),
     '../../')))
+from deepdoc.vision.seeit import draw_box
+from deepdoc.vision import OCR, init_in_out
+import argparse
+import numpy as np
 def main(args):
     ocr = OCR()
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from deepdoc.vision.seeit import draw_box
-from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
-from api.utils.file_utils import get_project_base_directory
-import argparse
-import os
-import sys
-import re
-import numpy as np
+import os, sys
 sys.path.insert(
     0,
     os.path.abspath(
             os.path.abspath(__file__)),
         '../../')))
+from deepdoc.vision.seeit import draw_box
+from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
+from api.utils.file_utils import get_project_base_directory
+import argparse
+import re
+import numpy as np
 def main(args):
     images, outputs = init_in_out(args)
| "Ollama": OllamaEmbed, | "Ollama": OllamaEmbed, | ||||
| "OpenAI": OpenAIEmbed, | "OpenAI": OpenAIEmbed, | ||||
| "Xinference": XinferenceEmbed, | "Xinference": XinferenceEmbed, | ||||
| "Tongyi-Qianwen": HuEmbedding, #QWenEmbed, | |||||
| "Tongyi-Qianwen": DefaultEmbedding, #QWenEmbed, | |||||
| "ZHIPU-AI": ZhipuEmbed, | "ZHIPU-AI": ZhipuEmbed, | ||||
| "FastEmbed": FastEmbed, | "FastEmbed": FastEmbed, | ||||
| "Youdao": YoudaoEmbed | "Youdao": YoudaoEmbed |
         raise NotImplementedError("Please implement encode method!")
-class HuEmbedding(Base):
+class DefaultEmbedding(Base):
     def __init__(self, *args, **kwargs):
         """
         If you have trouble downloading HuggingFace models, -_^ this might help!!
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-import re
-import os
-import copy
-import base64
-import magic
-from dataclasses import dataclass
-from typing import List
-import numpy as np
-from io import BytesIO
-class HuChunker:
-    @dataclass
-    class Fields:
-        text_chunks: List = None
-        table_chunks: List = None
-    def __init__(self):
-        self.MAX_LVL = 12
-        self.proj_patt = [
-            (r"第[零一二三四五六七八九十百]+章", 1),
-            (r"第[零一二三四五六七八九十百]+[条节]", 2),
-            (r"[零一二三四五六七八九十百]+[、 ]", 3),
-            (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
-            (r"[0-9]+(、|\.[ ]|\.[^0-9])", 5),
-            (r"[0-9]+\.[0-9]+(、|[ ]|[^0-9])", 6),
-            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[ ]|[^0-9])", 7),
-            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[ ]|[^0-9])", 8),
-            (r".{,48}[::??]@", 9),
-            (r"[0-9]+)", 10),
-            (r"[\((][0-9]+[)\)]", 11),
-            (r"[零一二三四五六七八九十百]+是", 12),
-            (r"[⚫•➢✓ ]", 12)
-        ]
-        self.lines = []
-    def _garbage(self, txt):
-        patt = [
-            r"(在此保证|不得以任何形式翻版|请勿传阅|仅供内部使用|未经事先书面授权)",
-            r"(版权(归本公司)*所有|免责声明|保留一切权力|承担全部责任|特别声明|报告中涉及)",
-            r"(不承担任何责任|投资者的通知事项:|任何机构和个人|本报告仅为|不构成投资)",
-            r"(不构成对任何个人或机构投资建议|联系其所在国家|本报告由从事证券交易)",
-            r"(本研究报告由|「认可投资者」|所有研究报告均以|请发邮件至)",
-            r"(本报告仅供|市场有风险,投资需谨慎|本报告中提及的)",
-            r"(本报告反映|此信息仅供|证券分析师承诺|具备证券投资咨询业务资格)",
-            r"^(时间|签字|签章)[::]",
-            r"(参考文献|目录索引|图表索引)",
-            r"[ ]*年[ ]+月[ ]+日",
-            r"^(中国证券业协会|[0-9]+年[0-9]+月[0-9]+日)$",
-            r"\.{10,}",
-            r"(———————END|帮我转发|欢迎收藏|快来关注我吧)"
-        ]
-        return any([re.search(p, txt) for p in patt])
-    def _proj_match(self, line):
-        for p, j in self.proj_patt:
-            if re.match(p, line):
-                return j
-        return
-    def _does_proj_match(self):
-        mat = [None for _ in range(len(self.lines))]
-        for i in range(len(self.lines)):
-            mat[i] = self._proj_match(self.lines[i])
-        return mat
-    def naive_text_chunk(self, text, ti="", MAX_LEN=612):
-        if text:
-            self.lines = [l.strip().replace(u'\u3000', u' ')
-                          .replace(u'\xa0', u'')
-                          for l in text.split("\n\n")]
-            self.lines = [l for l in self.lines if not self._garbage(l)]
-            self.lines = [re.sub(r"([ ]+| )", " ", l)
-                          for l in self.lines if l]
-            if not self.lines:
-                return []
-        arr = self.lines
-        res = [""]
-        i = 0
-        while i < len(arr):
-            a = arr[i]
-            if not a:
-                i += 1
-                continue
-            if len(a) > MAX_LEN:
-                a_ = a.split("\n")
-                if len(a_) >= 2:
-                    arr.pop(i)
-                    for j in range(2, len(a_) + 1):
-                        if len("\n".join(a_[:j])) >= MAX_LEN:
-                            arr.insert(i, "\n".join(a_[:j - 1]))
-                            arr.insert(i + 1, "\n".join(a_[j - 1:]))
-                            break
-                    else:
-                        assert False, f"Can't split: {a}"
-                    continue
-            if len(res[-1]) < MAX_LEN / 3:
-                res[-1] += "\n" + a
-            else:
-                res.append(a)
-            i += 1
-        if ti:
-            for i in range(len(res)):
-                if res[i].find("——来自") >= 0:
-                    continue
-                res[i] += f"\t——来自“{ti}”"
-        return res
-    def _merge(self):
-        # merge continuous same level text
-        lines = [self.lines[0]] if self.lines else []
-        for i in range(1, len(self.lines)):
-            if self.mat[i] == self.mat[i - 1] \
-                    and len(lines[-1]) < 256 \
-                    and len(self.lines[i]) < 256:
-                lines[-1] += "\n" + self.lines[i]
-                continue
-            lines.append(self.lines[i])
-        self.lines = lines
-        self.mat = self._does_proj_match()
-        return self.mat
-    def text_chunks(self, text):
-        if text:
-            self.lines = [l.strip().replace(u'\u3000', u' ')
-                          .replace(u'\xa0', u'')
-                          for l in re.split(r"[\r\n]", text)]
-            self.lines = [l for l in self.lines if not self._garbage(l)]
-            self.lines = [l for l in self.lines if l]
-        self.mat = self._does_proj_match()
-        mat = self._merge()
-        tree = []
-        for i in range(len(self.lines)):
-            tree.append({"proj": mat[i],
-                         "children": [],
-                         "read": False})
-        # find all children
-        for i in range(len(self.lines) - 1):
-            if tree[i]["proj"] is None:
-                continue
-            ed = i + 1
-            while ed < len(tree) and (tree[ed]["proj"] is None or
-                                      tree[ed]["proj"] > tree[i]["proj"]):
-                ed += 1
-            nxt = tree[i]["proj"] + 1
-            st = set([p["proj"] for p in tree[i + 1: ed] if p["proj"]])
-            while nxt not in st:
-                nxt += 1
-                if nxt > self.MAX_LVL:
-                    break
-            if nxt <= self.MAX_LVL:
-                for j in range(i + 1, ed):
-                    if tree[j]["proj"] is not None:
-                        break
-                    tree[i]["children"].append(j)
-                for j in range(i + 1, ed):
-                    if tree[j]["proj"] != nxt:
-                        continue
-                    tree[i]["children"].append(j)
-            else:
-                for j in range(i + 1, ed):
-                    tree[i]["children"].append(j)
-        # get DFS combinations, find all the paths to leaf
-        paths = []
-        def dfs(i, path):
-            nonlocal tree, paths
-            path.append(i)
-            tree[i]["read"] = True
-            if len(self.lines[i]) > 256:
-                paths.append(path)
-                return
-            if not tree[i]["children"]:
-                if len(path) > 1 or len(self.lines[i]) >= 32:
-                    paths.append(path)
-                return
-            for j in tree[i]["children"]:
-                dfs(j, copy.deepcopy(path))
-        for i, t in enumerate(tree):
-            if t["read"]:
-                continue
-            dfs(i, [])
-        # concat txt on the path for all paths
-        res = []
-        lines = np.array(self.lines)
-        for p in paths:
-            if len(p) < 2:
-                tree[p[0]]["read"] = False
-                continue
-            txt = "\n".join(lines[p[:-1]]) + "\n" + lines[p[-1]]
-            res.append(txt)
-        # concat continuous orphans
-        assert len(tree) == len(lines)
-        ii = 0
-        while ii < len(tree):
-            if tree[ii]["read"]:
-                ii += 1
-                continue
-            txt = lines[ii]
-            e = ii + 1
-            while e < len(tree) and not tree[e]["read"] and len(txt) < 256:
-                txt += "\n" + lines[e]
-                e += 1
-            res.append(txt)
-            ii = e
-        # if the node has not been read, find its daddy
-        def find_daddy(st):
-            nonlocal lines, tree
-            proj = tree[st]["proj"]
-            if len(self.lines[st]) > 512:
-                return [st]
-            if proj is None:
-                proj = self.MAX_LVL + 1
-            for i in range(st - 1, -1, -1):
-                if tree[i]["proj"] and tree[i]["proj"] < proj:
-                    a = [st] + find_daddy(i)
-                    return a
-            return []
-        return res
-class PdfChunker(HuChunker):
-    def __init__(self, pdf_parser):
-        self.pdf = pdf_parser
-        super().__init__()
-    def tableHtmls(self, pdfnm):
-        _, tbls = self.pdf(pdfnm, return_html=True)
-        res = []
-        for img, arr in tbls:
-            if arr[0].find("<table>") < 0:
-                continue
-            buffered = BytesIO()
-            if img:
-                img.save(buffered, format="JPEG")
-            img_str = base64.b64encode(
-                buffered.getvalue()).decode('utf-8') if img else ""
-            res.append({"table": arr[0], "image": img_str})
-        return res
-    def html(self, pdfnm):
-        txts, tbls = self.pdf(pdfnm, return_html=True)
-        res = []
-        txt_cks = self.text_chunks(txts)
-        for txt, img in [(self.pdf.remove_tag(c), self.pdf.crop(c))
-                         for c in txt_cks]:
-            buffered = BytesIO()
-            if img:
-                img.save(buffered, format="JPEG")
-            img_str = base64.b64encode(
-                buffered.getvalue()).decode('utf-8') if img else ""
-            res.append({"table": "<p>%s</p>" % txt.replace("\n", "<br/>"),
-                        "image": img_str})
-        for img, arr in tbls:
-            if not arr:
-                continue
-            buffered = BytesIO()
-            if img:
-                img.save(buffered, format="JPEG")
-            img_str = base64.b64encode(
-                buffered.getvalue()).decode('utf-8') if img else ""
-            res.append({"table": arr[0], "image": img_str})
-        return res
-    def __call__(self, pdfnm, return_image=True, naive_chunk=False):
-        flds = self.Fields()
-        text, tbls = self.pdf(pdfnm)
-        fnm = pdfnm
-        txt_cks = self.text_chunks(text) if not naive_chunk else \
-            self.naive_text_chunk(text, ti=fnm if isinstance(fnm, str) else "")
-        flds.text_chunks = [(self.pdf.remove_tag(c),
-                             self.pdf.crop(c) if return_image else None) for c in txt_cks]
-        flds.table_chunks = [(arr, img if return_image else None)
-                             for img, arr in tbls]
-        return flds
-class DocxChunker(HuChunker):
-    def __init__(self, doc_parser):
-        self.doc = doc_parser
-        super().__init__()
-    def _does_proj_match(self):
-        mat = []
-        for s in self.styles:
-            s = s.split(" ")[-1]
-            try:
-                mat.append(int(s))
-            except Exception as e:
-                mat.append(None)
-        return mat
-    def _merge(self):
-        i = 1
-        while i < len(self.lines):
-            if self.mat[i] == self.mat[i - 1] \
-                    and len(self.lines[i - 1]) < 256 \
-                    and len(self.lines[i]) < 256:
-                self.lines[i - 1] += "\n" + self.lines[i]
-                self.styles.pop(i)
-                self.lines.pop(i)
-                self.mat.pop(i)
-                continue
-            i += 1
-        self.mat = self._does_proj_match()
-        return self.mat
-    def __call__(self, fnm):
-        flds = self.Fields()
-        flds.title = os.path.splitext(
-            os.path.basename(fnm))[0] if isinstance(
-            fnm, type("")) else ""
-        secs, tbls = self.doc(fnm)
-        self.lines = [l for l, s in secs]
-        self.styles = [s for l, s in secs]
-        txt_cks = self.text_chunks("")
-        flds.text_chunks = [(t, None) for t in txt_cks if not self._garbage(t)]
-        flds.table_chunks = [(tb, None) for tb in tbls for t in tb if t]
-        return flds
-class ExcelChunker(HuChunker):
-    def __init__(self, excel_parser):
-        self.excel = excel_parser
-        super().__init__()
-    def __call__(self, fnm):
-        flds = self.Fields()
-        flds.text_chunks = [(t, None) for t in self.excel(fnm)]
-        flds.table_chunks = []
-        return flds
-class PptChunker(HuChunker):
-    def __init__(self):
-        super().__init__()
-    def __extract(self, shape):
-        if shape.shape_type == 19:
-            tb = shape.table
-            rows = []
-            for i in range(1, len(tb.rows)):
-                rows.append("; ".join([tb.cell(
-                    0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
-            return "\n".join(rows)
-        if shape.has_text_frame:
-            return shape.text_frame.text
-        if shape.shape_type == 6:
-            texts = []
-            for p in shape.shapes:
-                t = self.__extract(p)
-                if t:
-                    texts.append(t)
-            return "\n".join(texts)
-    def __call__(self, fnm):
-        from pptx import Presentation
-        ppt = Presentation(fnm) if isinstance(
-            fnm, str) else Presentation(
-            BytesIO(fnm))
-        txts = []
-        for slide in ppt.slides:
-            texts = []
-            for shape in slide.shapes:
-                txt = self.__extract(shape)
-                if txt:
-                    texts.append(txt)
-            txts.append("\n".join(texts))
-        import aspose.slides as slides
-        import aspose.pydrawing as drawing
-        imgs = []
-        with slides.Presentation(BytesIO(fnm)) as presentation:
-            for slide in presentation.slides:
-                buffered = BytesIO()
-                slide.get_thumbnail(
-                    0.5, 0.5).save(
-                    buffered, drawing.imaging.ImageFormat.jpeg)
-                imgs.append(buffered.getvalue())
-        assert len(imgs) == len(
-            txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
-        flds = self.Fields()
-        flds.text_chunks = [(txts[i], imgs[i]) for i in range(len(txts))]
-        flds.table_chunks = []
-        return flds
-class TextChunker(HuChunker):
-    @dataclass
-    class Fields:
-        text_chunks: List = None
-        table_chunks: List = None
-    def __init__(self):
-        super().__init__()
-    @staticmethod
-    def is_binary_file(file_path):
-        mime = magic.Magic(mime=True)
-        if isinstance(file_path, str):
-            file_type = mime.from_file(file_path)
-        else:
-            file_type = mime.from_buffer(file_path)
-        if 'text' in file_type:
-            return False
-        else:
-            return True
-    def __call__(self, fnm):
-        flds = self.Fields()
-        if self.is_binary_file(fnm):
-            return flds
-        txt = ""
-        if isinstance(fnm, str):
-            with open(fnm, "r") as f:
-                txt = f.read()
-        else:
-            txt = fnm.decode("utf-8")
-        flds.text_chunks = [(c, None) for c in self.naive_text_chunk(txt)]
-        flds.table_chunks = []
-        return flds
-if __name__ == "__main__":
-    import sys
-    sys.path.append(os.path.dirname(__file__) + "/../")
-    if sys.argv[1].split(".")[-1].lower() == "pdf":
-        from deepdoc.parser import PdfParser
-        ckr = PdfChunker(PdfParser())
-    if sys.argv[1].split(".")[-1].lower().find("doc") >= 0:
-        from deepdoc.parser import DocxParser
-        ckr = DocxChunker(DocxParser())
-    if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0:
-        from deepdoc.parser import ExcelParser
-        ckr = ExcelChunker(ExcelParser())
-    # ckr.html(sys.argv[1])
-    print(ckr(sys.argv[1]))
         try:
             self.dictionary = json.load(open(path, 'r'))
         except Exception as e:
-            logging.warning("Missing synonym.json")
+            logging.warn("Missing synonym.json")
             self.dictionary = {}
         if not redis:
             logging.warning(
-                "Real-time synonym is disabled, since no redis connection.")
+                "Realtime synonym is disabled, since no redis connection.")
         if not len(self.dictionary.keys()):
             logging.warning(f"Fail to load synonym")
 from api.db.db_models import close_connection
 from api.db.services.task_service import TaskService
-from rag.utils import MINIO
+from rag.utils.minio_conn import MINIO
 from rag.utils.redis_conn import REDIS_CONN
 from api.db.services.file_service import FileService
 from api.db.services.task_service import TaskService
 from deepdoc.parser import PdfParser
-from deepdoc.parser.excel_parser import HuExcelParser
+from deepdoc.parser.excel_parser import RAGFlowExcelParser
 from rag.settings import cron_logger
-from rag.utils import MINIO
+from rag.utils.minio_conn import MINIO
 from rag.utils import findMaxTm
 import pandas as pd
 from api.db import FileType, TaskStatus
             tsks.append(task)
         elif r["parser_id"] == "table":
-            rn = HuExcelParser.row_number(
+            rn = RAGFlowExcelParser.row_number(
                 r["name"], file_bin)
             for i in range(0, rn, 3000):
                 task = new_task()
 from functools import partial
 from api.db.services.file2document_service import File2DocumentService
-from rag.utils import MINIO
+from rag.utils.minio_conn import MINIO
 from api.db.db_models import close_connection
 from rag.settings import database_logger
 from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
 from elasticsearch_dsl import Q
 from multiprocessing.context import TimeoutError
 from api.db.services.task_service import TaskService
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from timeit import default_timer as timer
 from rag.utils import rmSpace, findMaxTm
     return _singleton
-from .minio_conn import MINIO
-from .es_conn import ELASTICSEARCH
 def rmSpace(txt):
     txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
     return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt, flags=re.IGNORECASE)
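`rmSpace` drops a space whenever the character on either side of it is not an ASCII letter, digit, `.` or `,`, which mainly collapses spacing inside CJK text while leaving English untouched. A small self-contained illustration follows; the expected outputs are my reading of the two regexes, not output quoted from the project:

```python
import re

def rmSpace(txt):
    # Same two substitutions as in the hunk above: remove a space when
    # either neighbour is not an ASCII letter/digit/./, (case-insensitive).
    txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
    return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt, flags=re.IGNORECASE)

print(rmSpace("深度 学习 模型"))   # -> "深度学习模型": spaces between CJK characters removed
print(rmSpace("deep learning"))    # -> "deep learning": ordinary English spacing preserved
```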
 @singleton
-class HuEs:
+class ESConnection:
     def __init__(self):
         self.info = {}
         self.conn()
         scroll_size = len(page['hits']['hits'])
-ELASTICSEARCH = HuEs()
+ELASTICSEARCH = ESConnection()
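The hunk only shows the decorator's trailing `return _singleton`; a minimal sketch of what such a `singleton` wrapper conventionally looks like (this body is an assumption for illustration, not the repository's exact implementation):

```python
def singleton(cls):
    instances = {}

    def _singleton(*args, **kwargs):
        # Construct the wrapped class at most once; every later call returns
        # the cached instance, so module-level objects like ELASTICSEARCH and
        # MINIO behave as process-wide connections.
        if cls not in instances:
            instances[cls] = cls(*args, **kwargs)
        return instances[cls]

    return _singleton
```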
 @singleton
-class HuMinio(object):
+class RAGFlowMinio(object):
     def __init__(self):
         self.conn = None
         self.__open__()
             time.sleep(1)
         return
-MINIO = HuMinio()
+MINIO = RAGFlowMinio()
 if __name__ == "__main__":
-    conn = HuMinio()
+    conn = RAGFlowMinio()
     fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
     from PIL import Image
     img = Image.open(fnm)