@@ -18,7 +18,7 @@ from api.utils.file_utils import get_project_base_directory
 from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
 from rag.nlp import huqie
 from copy import deepcopy
-from huggingface_hub import hf_hub_download, snapshot_download
+from huggingface_hub import snapshot_download

 logging.getLogger("pdfminer").setLevel(logging.WARNING)
@@ -36,18 +36,18 @@ class HuParser:
         if torch.cuda.is_available():
             self.updown_cnt_mdl.set_param({"device": "cuda"})
         try:
-            model_dir = snapshot_download(
-                repo_id="InfiniFlow/text_concat_xgb_v1.0",
-                local_dir=os.path.join(
-                    get_project_base_directory(),
-                    "rag/res/deepdoc"),
-                local_files_only=True)
+            model_dir = os.path.join(
+                get_project_base_directory(),
+                "rag/res/deepdoc")
+            self.updown_cnt_mdl.load_model(os.path.join(
+                model_dir, "updown_concat_xgb.model"))
         except Exception as e:
             model_dir = snapshot_download(
                 repo_id="InfiniFlow/text_concat_xgb_v1.0")
-        self.updown_cnt_mdl.load_model(os.path.join(
-            model_dir, "updown_concat_xgb.model"))
+            self.updown_cnt_mdl.load_model(os.path.join(
+                model_dir, "updown_concat_xgb.model"))

         self.page_from = 0
         """
         If you have trouble downloading HuggingFace models, -_^ this might help!!
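Every loader touched by this diff follows the same local-first pattern: resolve the model directory bundled under `rag/res/deepdoc`, try to load from it, and fall back to downloading from HuggingFace only when that fails. A minimal standalone sketch of the pattern (the `resolve_model_dir` helper is illustrative, not a function in the codebase):

```python
import os

from huggingface_hub import snapshot_download


def resolve_model_dir(base_dir: str, repo_id: str) -> str:
    """Illustrative helper: prefer the weights shipped with the repo,
    fall back to the HuggingFace Hub only when they are absent."""
    local_dir = os.path.join(base_dir, "rag/res/deepdoc")
    if os.path.isdir(local_dir):
        return local_dir
    # No bundled weights: download into the default HF cache and use that.
    return snapshot_download(repo_id=repo_id)
```

Compared to the old `local_dir=... , local_files_only=True` call, this avoids asking `huggingface_hub` about files that are already on disk, so a fully offline install never touches the network.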
@@ -17,7 +17,6 @@ from copy import deepcopy
 import numpy as np
 from huggingface_hub import snapshot_download
-
 from api.db import ParserType
 from api.utils.file_utils import get_project_base_directory
 from deepdoc.vision import Recognizer
@@ -39,17 +38,14 @@ class LayoutRecognizer(Recognizer):
     def __init__(self, domain):
         try:
-            model_dir = snapshot_download(
-                repo_id="InfiniFlow/deepdoc",
-                local_dir=os.path.join(
-                    get_project_base_directory(),
-                    "rag/res/deepdoc"),
-                local_files_only=True)
+            model_dir = os.path.join(
+                get_project_base_directory(),
+                "rag/res/deepdoc")
+            super().__init__(self.labels, domain, model_dir)
         except Exception as e:
             model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
-        # os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
-        super().__init__(self.labels, domain, model_dir)
+            super().__init__(self.labels, domain, model_dir)

         self.garbage_layouts = ["footer", "header", "reference"]

     def __call__(self, image_list, ocr_res, scale_factor=3,
@@ -480,17 +480,16 @@ class OCR(object):
         """
         if not model_dir:
             try:
-                model_dir = snapshot_download(
-                    repo_id="InfiniFlow/deepdoc",
-                    local_dir=os.path.join(
-                        get_project_base_directory(),
-                        "rag/res/deepdoc"),
-                    local_files_only=True)
+                model_dir = os.path.join(
+                    get_project_base_directory(),
+                    "rag/res/deepdoc")
+                self.text_detector = TextDetector(model_dir)
+                self.text_recognizer = TextRecognizer(model_dir)
             except Exception as e:
                 model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
-
-        self.text_detector = TextDetector(model_dir)
-        self.text_recognizer = TextRecognizer(model_dir)
+                self.text_detector = TextDetector(model_dir)
+                self.text_recognizer = TextRecognizer(model_dir)

         self.drop_score = 0.5
         self.crop_image_res_index = 0
@@ -36,17 +36,14 @@ class Recognizer(object):
         """
         if not model_dir:
-            try:
-                model_dir = snapshot_download(
-                    repo_id="InfiniFlow/deepdoc",
-                    local_dir=os.path.join(
-                        get_project_base_directory(),
-                        "rag/res/deepdoc"),
-                    local_files_only=True)
-            except Exception as e:
-                model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
+            model_dir = os.path.join(
+                get_project_base_directory(),
+                "rag/res/deepdoc")

         model_file_path = os.path.join(model_dir, task_name + ".onnx")
+        if not os.path.exists(model_file_path):
+            model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
+            model_file_path = os.path.join(model_dir, task_name + ".onnx")
+
         if not os.path.exists(model_file_path):
             raise ValueError("not find model file path {}".format(
                 model_file_path))
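Unlike the other loaders, `Recognizer` drops the try/except entirely and checks for the concrete per-task weight file instead of trusting the directory, so a checkout where `rag/res/deepdoc` exists but is incomplete still triggers the Hub download, and only a failure after that raises. A tiny sketch of why the file-level check matters (`"layout"` is a hypothetical task name for illustration):

```python
import os

model_dir = "rag/res/deepdoc"   # bundled directory, possibly incomplete
task_name = "layout"            # hypothetical; real task names come from callers

# The directory existing is not enough -- the per-task ONNX file must exist.
model_file_path = os.path.join(model_dir, task_name + ".onnx")
print(os.path.exists(model_file_path))  # False would trigger the Hub fallback
```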
@@ -35,17 +35,11 @@ class TableStructureRecognizer(Recognizer):
     def __init__(self):
         try:
-            model_dir = snapshot_download(
-                repo_id="InfiniFlow/deepdoc",
-                local_dir=os.path.join(
-                    get_project_base_directory(),
-                    "rag/res/deepdoc"),
-                local_files_only=True)
+            super().__init__(self.labels, "tsr", os.path.join(
+                get_project_base_directory(),
+                "rag/res/deepdoc"))
         except Exception as e:
-            model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
-        # os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
-        super().__init__(self.labels, "tsr", model_dir)
+            super().__init__(self.labels, "tsr", snapshot_download(repo_id="InfiniFlow/deepdoc"))

     def __call__(self, images, thr=0.2):
         tbls = super().__call__(images, thr)
@@ -28,16 +28,13 @@ from api.utils.file_utils import get_project_base_directory
 from rag.utils import num_tokens_from_string

 try:
-    model_dir = snapshot_download(
-        repo_id="BAAI/bge-large-zh-v1.5",
-        local_dir=os.path.join(
-            get_project_base_directory(),
-            "rag/res/bge-large-zh-v1.5"),
-        local_files_only=True)
+    flag_model = FlagModel(os.path.join(
+            get_project_base_directory(),
+            "rag/res/bge-large-zh-v1.5"),
+        query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
+        use_fp16=torch.cuda.is_available())
 except Exception as e:
-    model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5")
-    flag_model = FlagModel(model_dir,
+    flag_model = FlagModel("BAAI/bge-large-zh-v1.5",
                            query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
                            use_fp16=torch.cuda.is_available())
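In the fallback branch the explicit `snapshot_download` disappears: `FlagModel` (from the FlagEmbedding package) accepts either a local directory or a bare Hub model id and resolves the latter itself through the underlying `from_pretrained` machinery. A minimal usage sketch (the sample sentence is illustrative; `use_fp16` is forced off here so it runs on CPU):

```python
from FlagEmbedding import FlagModel

# FlagModel resolves "BAAI/bge-large-zh-v1.5" via the HF Hub on its own,
# so no explicit snapshot_download call is needed before constructing it.
model = FlagModel("BAAI/bge-large-zh-v1.5",
                  query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
                  use_fp16=False)

vecs = model.encode(["RAGFlow parses PDFs with deep models."])
print(vecs.shape)  # (1, 1024) -- bge-large embeddings are 1024-dimensional
```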
@@ -247,7 +247,7 @@ class Dealer:
                       for ck in chunks]
         cites = {}
         thr = 0.63
-        while len(cites.keys()) == 0 and pieces_ and chunks_tks:
+        while thr>0.3 and len(cites.keys()) == 0 and pieces_ and chunks_tks:
             for i, a in enumerate(pieces_):
                 sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                                 chunk_v,
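The new `thr>0.3` guard bounds the citation search: the loop evidently relaxes `thr` between passes (the decrement itself is outside this hunk), so without a floor an answer with no sufficiently similar chunk would spin forever. A sketch of that relaxation shape, with a placeholder similarity function and an assumed decay step:

```python
def find_citations(pieces, chunks, similarity, thr=0.63, floor=0.3, decay=0.8):
    """Illustrative only: relax the similarity threshold until some piece of
    the answer can be attributed to a chunk, or the floor is reached.
    The decay factor is an assumption; the hunk above shows only the guard."""
    cites = {}
    while thr > floor and not cites and pieces and chunks:
        for i, piece in enumerate(pieces):
            scores = [similarity(piece, chunk) for chunk in chunks]
            best = max(range(len(chunks)), key=lambda j: scores[j])
            if scores[best] >= thr:
                cites[i] = [best]   # attribute this piece to its best chunk
        thr *= decay                # lower the bar and try again
    return cites
```

For example, `find_citations(["a"], ["a", "b"], lambda p, c: 1.0 if p == c else 0.0)` returns `{0: [0]}` on the first pass, while a query with no match at all now exits once `thr` decays below 0.3 instead of looping indefinitely.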