 from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
 from rag.nlp import huqie
 from copy import deepcopy
-from huggingface_hub import hf_hub_download, snapshot_download
+from huggingface_hub import snapshot_download

 logging.getLogger("pdfminer").setLevel(logging.WARNING)

 if torch.cuda.is_available():
     self.updown_cnt_mdl.set_param({"device": "cuda"})
 try:
-    model_dir = snapshot_download(
-        repo_id="InfiniFlow/text_concat_xgb_v1.0",
-        local_dir=os.path.join(
+    model_dir = os.path.join(
         get_project_base_directory(),
-        "rag/res/deepdoc"),
-        local_files_only=True)
+        "rag/res/deepdoc")
+    self.updown_cnt_mdl.load_model(os.path.join(
+        model_dir, "updown_concat_xgb.model"))
 except Exception as e:
     model_dir = snapshot_download(
         repo_id="InfiniFlow/text_concat_xgb_v1.0")
+    self.updown_cnt_mdl.load_model(os.path.join(
+        model_dir, "updown_concat_xgb.model"))
-self.updown_cnt_mdl.load_model(os.path.join(
-    model_dir, "updown_concat_xgb.model"))

 self.page_from = 0
| """ | """ | ||||
| If you have trouble downloading HuggingFace models, -_^ this might help!! | If you have trouble downloading HuggingFace models, -_^ this might help!! |
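The docstring above states the intent of the whole commit: stop depending on a HuggingFace download at startup. In every hunk, the try branch now points straight at the model files bundled under rag/res/... and snapshot_download is only reached when loading that bundled copy fails. A rough sketch of the pattern follows; load_local_or_download and expected_file are illustrative names, not RAGFlow code, and only the repo ids and paths visible in the diff are assumed.

import os
from huggingface_hub import snapshot_download

def load_local_or_download(local_dir, repo_id, expected_file):
    # Prefer the copy shipped with the repo; download from the Hub only if it is missing.
    if os.path.exists(os.path.join(local_dir, expected_file)):
        return local_dir  # bundled copy found: no network access at all
    # If huggingface.co is unreachable, exporting HF_ENDPOINT (e.g. a mirror URL)
    # before starting the process reroutes this download.
    return snapshot_download(repo_id=repo_id)

# With the names from the first hunk:
# model_dir = load_local_or_download("rag/res/deepdoc",
#                                    "InfiniFlow/text_concat_xgb_v1.0",
#                                    "updown_concat_xgb.model")

The same fallback shape repeats for the layout, OCR, table-structure and embedding models in the hunks that follow.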
 import numpy as np
 from huggingface_hub import snapshot_download
+from api.db import ParserType
 from api.utils.file_utils import get_project_base_directory
 from deepdoc.vision import Recognizer

 def __init__(self, domain):
     try:
-        model_dir = snapshot_download(
-            repo_id="InfiniFlow/deepdoc",
-            local_dir=os.path.join(
+        model_dir = os.path.join(
             get_project_base_directory(),
-            "rag/res/deepdoc"),
-            local_files_only=True)
+            "rag/res/deepdoc")
+        super().__init__(self.labels, domain, model_dir)
     except Exception as e:
         model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
+        super().__init__(self.labels, domain, model_dir)
-        # os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
-    super().__init__(self.labels, domain, model_dir)

     self.garbage_layouts = ["footer", "header", "reference"]

 def __call__(self, image_list, ocr_res, scale_factor=3,
| """ | """ | ||||
| if not model_dir: | if not model_dir: | ||||
| try: | try: | ||||
| model_dir = snapshot_download( | |||||
| repo_id="InfiniFlow/deepdoc", | |||||
| local_dir=os.path.join( | |||||
| model_dir = os.path.join( | |||||
| get_project_base_directory(), | get_project_base_directory(), | ||||
| "rag/res/deepdoc"), | |||||
| local_files_only=True) | |||||
| "rag/res/deepdoc") | |||||
| self.text_detector = TextDetector(model_dir) | |||||
| self.text_recognizer = TextRecognizer(model_dir) | |||||
| except Exception as e: | except Exception as e: | ||||
| model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc") | model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc") | ||||
| self.text_detector = TextDetector(model_dir) | |||||
| self.text_recognizer = TextRecognizer(model_dir) | |||||
| self.text_detector = TextDetector(model_dir) | |||||
| self.text_recognizer = TextRecognizer(model_dir) | |||||
| self.drop_score = 0.5 | self.drop_score = 0.5 | ||||
| self.crop_image_res_index = 0 | self.crop_image_res_index = 0 | ||||
| """ | """ | ||||
| if not model_dir: | if not model_dir: | ||||
| try: | |||||
| model_dir = snapshot_download( | |||||
| repo_id="InfiniFlow/deepdoc", | |||||
| local_dir=os.path.join( | |||||
| model_dir = os.path.join( | |||||
| get_project_base_directory(), | get_project_base_directory(), | ||||
| "rag/res/deepdoc"), | |||||
| local_files_only=True) | |||||
| except Exception as e: | |||||
| "rag/res/deepdoc") | |||||
| model_file_path = os.path.join(model_dir, task_name + ".onnx") | |||||
| if not os.path.exists(model_file_path): | |||||
| model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc") | model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc") | ||||
| model_file_path = os.path.join(model_dir, task_name + ".onnx") | |||||
| model_file_path = os.path.join(model_dir, task_name + ".onnx") | |||||
| if not os.path.exists(model_file_path): | if not os.path.exists(model_file_path): | ||||
| raise ValueError("not find model file path {}".format( | raise ValueError("not find model file path {}".format( | ||||
| model_file_path)) | model_file_path)) |
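Unlike the loaders above, this hunk drops try/except in favor of an explicit existence check on the task's .onnx file: keep the bundled directory if the file is there, otherwise download the snapshot and check again before raising. The same flow as straight code, with resolve_onnx as a hypothetical name; the repo id and the task_name + ".onnx" naming come from the diff.

import os
from huggingface_hub import snapshot_download

def resolve_onnx(task_name, model_dir):
    # Look for the bundled model first.
    path = os.path.join(model_dir, task_name + ".onnx")
    if not os.path.exists(path):
        # Bundled copy missing or incomplete: fall back to the Hub snapshot.
        model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
        path = os.path.join(model_dir, task_name + ".onnx")
    if not os.path.exists(path):
        raise ValueError("not find model file path {}".format(path))
    return path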
 def __init__(self):
     try:
-        model_dir = snapshot_download(
-            repo_id="InfiniFlow/deepdoc",
-            local_dir=os.path.join(
+        super().__init__(self.labels, "tsr", os.path.join(
             get_project_base_directory(),
-            "rag/res/deepdoc"),
-            local_files_only=True)
+            "rag/res/deepdoc"))
     except Exception as e:
-        model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
-        # os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
-    super().__init__(self.labels, "tsr", model_dir)
+        super().__init__(self.labels, "tsr", snapshot_download(repo_id="InfiniFlow/deepdoc"))

 def __call__(self, images, thr=0.2):
     tbls = super().__call__(images, thr)
 from rag.utils import num_tokens_from_string

 try:
-    model_dir = snapshot_download(
-        repo_id="BAAI/bge-large-zh-v1.5",
-        local_dir=os.path.join(
+    flag_model = FlagModel(os.path.join(
         get_project_base_directory(),
         "rag/res/bge-large-zh-v1.5"),
-        local_files_only=True)
+        query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
+        use_fp16=torch.cuda.is_available())
 except Exception as e:
-    model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5")
-flag_model = FlagModel(model_dir,
+    flag_model = FlagModel("BAAI/bge-large-zh-v1.5",
                            query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
                            use_fp16=torch.cuda.is_available())
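This hunk relies on FlagEmbedding's FlagModel accepting either a local directory or a Hub model name, which is what lets the try branch pass the copy bundled under rag/res/bge-large-zh-v1.5 while the except branch falls back to the name "BAAI/bge-large-zh-v1.5". A minimal sketch of the same idea, using an explicit directory check instead of the diff's try/except:

import os
import torch
from FlagEmbedding import FlagModel

local_path = "rag/res/bge-large-zh-v1.5"  # path used in the diff, relative to the project root
model_source = local_path if os.path.isdir(local_path) else "BAAI/bge-large-zh-v1.5"

flag_model = FlagModel(model_source,
                       query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
                       use_fp16=torch.cuda.is_available())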
     for ck in chunks]
 cites = {}
 thr = 0.63
-while len(cites.keys()) == 0 and pieces_ and chunks_tks:
+while thr>0.3 and len(cites.keys()) == 0 and pieces_ and chunks_tks:
     for i, a in enumerate(pieces_):
         sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                         chunk_v,
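The last hunk puts a floor under the citation-matching loop: thr starts at 0.63 and is presumably relaxed on each pass (the decay step sits outside this excerpt), and the added thr>0.3 guard makes the loop terminate even when no chunk ever reaches the threshold. A generic sketch of that decay-with-floor pattern, with similarity() standing in for the hybrid_similarity call above:

def match_with_decaying_threshold(answers, chunks, similarity, thr=0.63, floor=0.3, step=0.1):
    # Keep relaxing the threshold until something matches or the floor is hit.
    cites = {}
    while thr > floor and not cites:
        for i, ans in enumerate(answers):
            hits = [j for j, ck in enumerate(chunks) if similarity(ans, ck) >= thr]
            if hits:
                cites[i] = hits
        thr -= step
    return cites  # may be empty if nothing ever clears the floor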