浏览代码

Let's load models from the local directory (#163)

tags/v0.1.0
KevinHuSh 1年前
父节点
当前提交
a5384446e3
没有帐户链接到提交者的电子邮件

+ 8
- 8
deepdoc/parser/pdf_parser.py 查看文件

from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import huqie from rag.nlp import huqie
from copy import deepcopy from copy import deepcopy
from huggingface_hub import hf_hub_download, snapshot_download
from huggingface_hub import snapshot_download


logging.getLogger("pdfminer").setLevel(logging.WARNING) logging.getLogger("pdfminer").setLevel(logging.WARNING)


if torch.cuda.is_available(): if torch.cuda.is_available():
self.updown_cnt_mdl.set_param({"device": "cuda"}) self.updown_cnt_mdl.set_param({"device": "cuda"})
try: try:
model_dir = snapshot_download(
repo_id="InfiniFlow/text_concat_xgb_v1.0",
local_dir=os.path.join(
model_dir = os.path.join(
get_project_base_directory(), get_project_base_directory(),
"rag/res/deepdoc"),
local_files_only=True)
"rag/res/deepdoc")
self.updown_cnt_mdl.load_model(os.path.join(
model_dir, "updown_concat_xgb.model"))
except Exception as e: except Exception as e:
model_dir = snapshot_download( model_dir = snapshot_download(
repo_id="InfiniFlow/text_concat_xgb_v1.0") repo_id="InfiniFlow/text_concat_xgb_v1.0")
self.updown_cnt_mdl.load_model(os.path.join(
model_dir, "updown_concat_xgb.model"))



self.updown_cnt_mdl.load_model(os.path.join(
model_dir, "updown_concat_xgb.model"))
self.page_from = 0 self.page_from = 0
""" """
If you have trouble downloading HuggingFace models, -_^ this might help!! If you have trouble downloading HuggingFace models, -_^ this might help!!

+ 4
- 8
deepdoc/vision/layout_recognizer.py 查看文件

import numpy as np import numpy as np
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from api.db import ParserType
from api.utils.file_utils import get_project_base_directory from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import Recognizer from deepdoc.vision import Recognizer
def __init__(self, domain): def __init__(self, domain):
try: try:
model_dir = snapshot_download(
repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(
model_dir = os.path.join(
get_project_base_directory(), get_project_base_directory(),
"rag/res/deepdoc"),
local_files_only=True)
"rag/res/deepdoc")
super().__init__(self.labels, domain, model_dir)
except Exception as e: except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc") model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
super().__init__(self.labels, domain, model_dir)
# os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
super().__init__(self.labels, domain, model_dir)
self.garbage_layouts = ["footer", "header", "reference"] self.garbage_layouts = ["footer", "header", "reference"]
def __call__(self, image_list, ocr_res, scale_factor=3, def __call__(self, image_list, ocr_res, scale_factor=3,

+ 6
- 7
deepdoc/vision/ocr.py 查看文件

""" """
if not model_dir: if not model_dir:
try: try:
model_dir = snapshot_download(
repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(
model_dir = os.path.join(
get_project_base_directory(), get_project_base_directory(),
"rag/res/deepdoc"),
local_files_only=True)
"rag/res/deepdoc")
self.text_detector = TextDetector(model_dir)
self.text_recognizer = TextRecognizer(model_dir)
except Exception as e: except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc") model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
self.text_detector = TextDetector(model_dir)
self.text_recognizer = TextRecognizer(model_dir)


self.text_detector = TextDetector(model_dir)
self.text_recognizer = TextRecognizer(model_dir)
self.drop_score = 0.5 self.drop_score = 0.5
self.crop_image_res_index = 0 self.crop_image_res_index = 0



+ 5
- 8
deepdoc/vision/recognizer.py 查看文件

""" """
if not model_dir: if not model_dir:
try:
model_dir = snapshot_download(
repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(
model_dir = os.path.join(
get_project_base_directory(), get_project_base_directory(),
"rag/res/deepdoc"),
local_files_only=True)
except Exception as e:
"rag/res/deepdoc")
model_file_path = os.path.join(model_dir, task_name + ".onnx")
if not os.path.exists(model_file_path):
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc") model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
model_file_path = os.path.join(model_dir, task_name + ".onnx")
model_file_path = os.path.join(model_dir, task_name + ".onnx")
if not os.path.exists(model_file_path): if not os.path.exists(model_file_path):
raise ValueError("not find model file path {}".format( raise ValueError("not find model file path {}".format(
model_file_path)) model_file_path))

+ 3
- 9
deepdoc/vision/table_structure_recognizer.py 查看文件

def __init__(self): def __init__(self):
try: try:
model_dir = snapshot_download(
repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(
super().__init__(self.labels, "tsr", os.path.join(
get_project_base_directory(), get_project_base_directory(),
"rag/res/deepdoc"),
local_files_only=True)
"rag/res/deepdoc"))
except Exception as e: except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
# os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
super().__init__(self.labels, "tsr", model_dir)
super().__init__(self.labels, "tsr", snapshot_download(repo_id="InfiniFlow/deepdoc"))
def __call__(self, images, thr=0.2): def __call__(self, images, thr=0.2):
tbls = super().__call__(images, thr) tbls = super().__call__(images, thr)

+ 4
- 7
rag/llm/embedding_model.py 查看文件

from rag.utils import num_tokens_from_string from rag.utils import num_tokens_from_string


try: try:
model_dir = snapshot_download(
repo_id="BAAI/bge-large-zh-v1.5",
local_dir=os.path.join(
flag_model = FlagModel(os.path.join(
get_project_base_directory(), get_project_base_directory(),
"rag/res/bge-large-zh-v1.5"), "rag/res/bge-large-zh-v1.5"),
local_files_only=True)
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
use_fp16=torch.cuda.is_available())
except Exception as e: except Exception as e:
model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5")

flag_model = FlagModel(model_dir,
flag_model = FlagModel("BAAI/bge-large-zh-v1.5",
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:", query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
use_fp16=torch.cuda.is_available()) use_fp16=torch.cuda.is_available())



+ 1
- 1
rag/nlp/search.py 查看文件

for ck in chunks] for ck in chunks]
cites = {} cites = {}
thr = 0.63 thr = 0.63
while len(cites.keys()) == 0 and pieces_ and chunks_tks:
while thr>0.3 and len(cites.keys()) == 0 and pieces_ and chunks_tks:
for i, a in enumerate(pieces_): for i, a in enumerate(pieces_):
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i], sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
chunk_v, chunk_v,

正在加载...
取消
保存