
refactor code (#583)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
tags/v0.5.0
KevinHuSh, 1 year ago
Current commit: 9d60a84958

api/apps/api_app.py (+1 / -1)

  from itsdangerous import URLSafeTimedSerializer
  from api.utils.file_utils import filename_type, thumbnail
- from rag.utils import MINIO
+ from rag.utils.minio_conn import MINIO
  def generate_confirmation_token(tenent_id):

api/apps/chunk_app.py (+2 / -1)

  from rag.app.qa import rmPrefix, beAdoc
  from rag.nlp import search, huqie
- from rag.utils import ELASTICSEARCH, rmSpace
+ from rag.utils.es_conn import ELASTICSEARCH
+ from rag.utils import rmSpace
  from api.db import LLMType, ParserType
  from api.db.services.knowledgebase_service import KnowledgebaseService
  from api.db.services.llm_service import TenantLLMService
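Taken together with the hunks below, the pattern is uniform: the `MINIO` and `ELASTICSEARCH` singletons are no longer re-exported by the `rag.utils` package (see the `rag/utils/__init__.py` hunk further down) and are imported from their connection modules instead, while plain helpers such as `rmSpace` and `findMaxTm` keep living in `rag.utils`. A minimal sketch of the resulting import style for downstream code, using only module paths that appear in this diff:

```python
# Post-refactor imports (paths as shown in this diff).
from rag.utils.es_conn import ELASTICSEARCH    # was: from rag.utils import ELASTICSEARCH
from rag.utils.minio_conn import MINIO         # was: from rag.utils import MINIO
from rag.utils import rmSpace, findMaxTm       # helpers still live in rag.utils
```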

api/apps/document_app.py (+1 / -1)

  from api.db.services.file2document_service import File2DocumentService
  from api.db.services.file_service import FileService
  from rag.nlp import search
- from rag.utils import ELASTICSEARCH
+ from rag.utils.es_conn import ELASTICSEARCH
  from api.db.services import duplicate_name
  from api.db.services.knowledgebase_service import KnowledgebaseService
  from api.utils.api_utils import server_error_response, get_data_error_result, validate_request

api/apps/file2document_app.py (+1 / -1)

  from api.settings import RetCode
  from api.utils.api_utils import get_json_result
  from rag.nlp import search
- from rag.utils import ELASTICSEARCH
+ from rag.utils.es_conn import ELASTICSEARCH

  @manager.route('/convert', methods=['POST'])

api/apps/file_app.py (+1 / -1)

  from api.utils.api_utils import get_json_result
  from api.utils.file_utils import filename_type
  from rag.nlp import search
- from rag.utils import ELASTICSEARCH
+ from rag.utils.es_conn import ELASTICSEARCH
  from rag.utils.minio_conn import MINIO

api/apps/kb_app.py (+1 / -1)

  from api.settings import stat_logger, RetCode
  from api.utils.api_utils import get_json_result
  from rag.nlp import search
- from rag.utils import ELASTICSEARCH
+ from rag.utils.es_conn import ELASTICSEARCH
  @manager.route('/create', methods=['post'])

api/db/services/document_service.py (+1 / -1)

  from peewee import Expression
  from elasticsearch_dsl import Q
- from rag.utils import ELASTICSEARCH
+ from rag.utils.es_conn import ELASTICSEARCH
  from rag.utils.minio_conn import MINIO
  from rag.nlp import search

api/settings.py (+1 / -1)

  database_logger = getLogger("database")
  chat_logger = getLogger("chat")
- from rag.utils import ELASTICSEARCH
+ from rag.utils.es_conn import ELASTICSEARCH
  from rag.nlp import search
  from api.utils import get_base_config, decrypt_database_config

deepdoc/parser/__init__.py (+4 / -4)

- from .pdf_parser import HuParser as PdfParser, PlainParser
- from .docx_parser import HuDocxParser as DocxParser
- from .excel_parser import HuExcelParser as ExcelParser
- from .ppt_parser import HuPptParser as PptParser
+ from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
+ from .docx_parser import RAGFlowDocxParser as DocxParser
+ from .excel_parser import RAGFlowExcelParser as ExcelParser
+ from .ppt_parser import RAGFlowPptParser as PptParser
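Because the package keeps re-exporting the renamed classes under their old public aliases, callers that go through `deepdoc.parser` are untouched by this rename; only code that imported the `Hu*` names directly (e.g. `task_broker.py` below) has to switch. A small sketch of both cases, using only names visible in this diff:

```python
# Unchanged call sites: the aliases above keep the old public names working.
from deepdoc.parser import PdfParser, DocxParser, ExcelParser, PptParser, PlainParser

# Call sites that imported the concrete classes must move to the new names.
from deepdoc.parser.pdf_parser import RAGFlowPdfParser       # was HuParser
from deepdoc.parser.excel_parser import RAGFlowExcelParser   # was HuExcelParser
```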

deepdoc/parser/docx_parser.py (+1 / -1)

  from io import BytesIO

- class HuDocxParser:
+ class RAGFlowDocxParser:

      def __extract_table_content(self, tb):
          df = []

deepdoc/parser/excel_parser.py (+2 / -2)

  from rag.nlp import find_codec

- class HuExcelParser:
+ class RAGFlowExcelParser:
      def html(self, fnm):
          if isinstance(fnm, str):
              wb = load_workbook(fnm)

  if __name__ == "__main__":
-     psr = HuExcelParser()
+     psr = RAGFlowExcelParser()
      psr(sys.argv[1])

deepdoc/parser/pdf_parser.py (+1 / -1)

  logging.getLogger("pdfminer").setLevel(logging.WARNING)

- class HuParser:
+ class RAGFlowPdfParser:
      def __init__(self):
          self.ocr = OCR()
          if hasattr(self, "model_speciess"):

deepdoc/parser/ppt_parser.py (+1 / -1)

  from pptx import Presentation
- class HuPptParser(object):
+ class RAGFlowPptParser(object):
      def __init__(self):
          super().__init__()

deepdoc/vision/t_ocr.py (+5 / -4)

  # limitations under the License.
  #
- from deepdoc.vision.seeit import draw_box
- from deepdoc.vision import OCR, init_in_out
- import argparse
- import numpy as np
  import os
  import sys
  sys.path.insert(
          os.path.abspath(__file__)),
      '../../')))
+ from deepdoc.vision.seeit import draw_box
+ from deepdoc.vision import OCR, init_in_out
+ import argparse
+ import numpy as np
  def main(args):
      ocr = OCR()

deepdoc/vision/t_recognizer.py (+8 / -11)

  # See the License for the specific language governing permissions and
  # limitations under the License.
  #
- from deepdoc.vision.seeit import draw_box
- from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
- from api.utils.file_utils import get_project_base_directory
- import argparse
- import os
- import sys
- import re
- import numpy as np
+ import os, sys
  sys.path.insert(
      0,
      os.path.abspath(
          os.path.abspath(__file__)),
      '../../')))
+ from deepdoc.vision.seeit import draw_box
+ from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
+ from api.utils.file_utils import get_project_base_directory
+ import argparse
+ import re
+ import numpy as np
  def main(args):
      images, outputs = init_in_out(args)

rag/llm/__init__.py (+1 / -1)

      "Ollama": OllamaEmbed,
      "OpenAI": OpenAIEmbed,
      "Xinference": XinferenceEmbed,
-     "Tongyi-Qianwen": HuEmbedding, #QWenEmbed,
+     "Tongyi-Qianwen": DefaultEmbedding, #QWenEmbed,
      "ZHIPU-AI": ZhipuEmbed,
      "FastEmbed": FastEmbed,
      "Youdao": YoudaoEmbed

rag/llm/embedding_model.py (+1 / -1)

      raise NotImplementedError("Please implement encode method!")

- class HuEmbedding(Base):
+ class DefaultEmbedding(Base):
      def __init__(self, *args, **kwargs):
          """
          If you have trouble downloading HuggingFace models, -_^ this might help!!
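The rename is mirrored in the supplier mapping of `rag/llm/__init__.py` above, where "Tongyi-Qianwen" now resolves to `DefaultEmbedding` (a HuggingFace-backed local embedder, per its docstring) instead of `HuEmbedding`. A rough sketch of how such a lookup would be consumed; the mapping name `EmbeddingModel` is an assumption for illustration and does not appear in this diff:

```python
# Sketch only: `EmbeddingModel` as the name of the supplier -> class mapping is assumed.
from rag.llm import EmbeddingModel
from rag.llm.embedding_model import DefaultEmbedding

embd_cls = EmbeddingModel["Tongyi-Qianwen"]   # now DefaultEmbedding, formerly HuEmbedding
assert embd_cls is DefaultEmbedding
```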

rag/nlp/huchunk.py (+0 / -475, entire file removed):

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
import os
import copy
import base64
import magic
from dataclasses import dataclass
from typing import List
import numpy as np
from io import BytesIO


class HuChunker:

@dataclass
class Fields:
text_chunks: List = None
table_chunks: List = None

def __init__(self):
self.MAX_LVL = 12
self.proj_patt = [
(r"第[零一二三四五六七八九十百]+章", 1),
(r"第[零一二三四五六七八九十百]+[条节]", 2),
(r"[零一二三四五六七八九十百]+[、  ]", 3),
(r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
(r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
(r"[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 6),
(r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
(r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
(r".{,48}[::??]@", 9),
(r"[0-9]+)", 10),
(r"[\((][0-9]+[)\)]", 11),
(r"[零一二三四五六七八九十百]+是", 12),
(r"[⚫•➢✓ ]", 12)
]
self.lines = []

def _garbage(self, txt):
patt = [
r"(在此保证|不得以任何形式翻版|请勿传阅|仅供内部使用|未经事先书面授权)",
r"(版权(归本公司)*所有|免责声明|保留一切权力|承担全部责任|特别声明|报告中涉及)",
r"(不承担任何责任|投资者的通知事项:|任何机构和个人|本报告仅为|不构成投资)",
r"(不构成对任何个人或机构投资建议|联系其所在国家|本报告由从事证券交易)",
r"(本研究报告由|「认可投资者」|所有研究报告均以|请发邮件至)",
r"(本报告仅供|市场有风险,投资需谨慎|本报告中提及的)",
r"(本报告反映|此信息仅供|证券分析师承诺|具备证券投资咨询业务资格)",
r"^(时间|签字|签章)[::]",
r"(参考文献|目录索引|图表索引)",
r"[ ]*年[ ]+月[ ]+日",
r"^(中国证券业协会|[0-9]+年[0-9]+月[0-9]+日)$",
r"\.{10,}",
r"(———————END|帮我转发|欢迎收藏|快来关注我吧)"
]
return any([re.search(p, txt) for p in patt])

def _proj_match(self, line):
for p, j in self.proj_patt:
if re.match(p, line):
return j
return

def _does_proj_match(self):
mat = [None for _ in range(len(self.lines))]
for i in range(len(self.lines)):
mat[i] = self._proj_match(self.lines[i])
return mat

def naive_text_chunk(self, text, ti="", MAX_LEN=612):
if text:
self.lines = [l.strip().replace(u'\u3000', u' ')
.replace(u'\xa0', u'')
for l in text.split("\n\n")]
self.lines = [l for l in self.lines if not self._garbage(l)]
self.lines = [re.sub(r"([ ]+| )", " ", l)
for l in self.lines if l]
if not self.lines:
return []
arr = self.lines

res = [""]
i = 0
while i < len(arr):
a = arr[i]
if not a:
i += 1
continue
if len(a) > MAX_LEN:
a_ = a.split("\n")
if len(a_) >= 2:
arr.pop(i)
for j in range(2, len(a_) + 1):
if len("\n".join(a_[:j])) >= MAX_LEN:
arr.insert(i, "\n".join(a_[:j - 1]))
arr.insert(i + 1, "\n".join(a_[j - 1:]))
break
else:
assert False, f"Can't split: {a}"
continue

if len(res[-1]) < MAX_LEN / 3:
res[-1] += "\n" + a
else:
res.append(a)
i += 1

if ti:
for i in range(len(res)):
if res[i].find("——来自") >= 0:
continue
res[i] += f"\t——来自“{ti}”"

return res

def _merge(self):
# merge continuous same level text
lines = [self.lines[0]] if self.lines else []
for i in range(1, len(self.lines)):
if self.mat[i] == self.mat[i - 1] \
and len(lines[-1]) < 256 \
and len(self.lines[i]) < 256:
lines[-1] += "\n" + self.lines[i]
continue
lines.append(self.lines[i])
self.lines = lines
self.mat = self._does_proj_match()
return self.mat

def text_chunks(self, text):
if text:
self.lines = [l.strip().replace(u'\u3000', u' ')
.replace(u'\xa0', u'')
for l in re.split(r"[\r\n]", text)]
self.lines = [l for l in self.lines if not self._garbage(l)]
self.lines = [l for l in self.lines if l]
self.mat = self._does_proj_match()
mat = self._merge()

tree = []
for i in range(len(self.lines)):
tree.append({"proj": mat[i],
"children": [],
"read": False})
# find all children
for i in range(len(self.lines) - 1):
if tree[i]["proj"] is None:
continue
ed = i + 1
while ed < len(tree) and (tree[ed]["proj"] is None or
tree[ed]["proj"] > tree[i]["proj"]):
ed += 1

nxt = tree[i]["proj"] + 1
st = set([p["proj"] for p in tree[i + 1: ed] if p["proj"]])
while nxt not in st:
nxt += 1
if nxt > self.MAX_LVL:
break
if nxt <= self.MAX_LVL:
for j in range(i + 1, ed):
if tree[j]["proj"] is not None:
break
tree[i]["children"].append(j)
for j in range(i + 1, ed):
if tree[j]["proj"] != nxt:
continue
tree[i]["children"].append(j)
else:
for j in range(i + 1, ed):
tree[i]["children"].append(j)

# get DFS combinations, find all the paths to leaf
paths = []

def dfs(i, path):
nonlocal tree, paths
path.append(i)
tree[i]["read"] = True
if len(self.lines[i]) > 256:
paths.append(path)
return
if not tree[i]["children"]:
if len(path) > 1 or len(self.lines[i]) >= 32:
paths.append(path)
return
for j in tree[i]["children"]:
dfs(j, copy.deepcopy(path))

for i, t in enumerate(tree):
if t["read"]:
continue
dfs(i, [])

# concat txt on the path for all paths
res = []
lines = np.array(self.lines)
for p in paths:
if len(p) < 2:
tree[p[0]]["read"] = False
continue
txt = "\n".join(lines[p[:-1]]) + "\n" + lines[p[-1]]
res.append(txt)
# concat continuous orphans
assert len(tree) == len(lines)
ii = 0
while ii < len(tree):
if tree[ii]["read"]:
ii += 1
continue
txt = lines[ii]
e = ii + 1
while e < len(tree) and not tree[e]["read"] and len(txt) < 256:
txt += "\n" + lines[e]
e += 1
res.append(txt)
ii = e

# if the node has not been read, find its daddy
def find_daddy(st):
nonlocal lines, tree
proj = tree[st]["proj"]
if len(self.lines[st]) > 512:
return [st]
if proj is None:
proj = self.MAX_LVL + 1
for i in range(st - 1, -1, -1):
if tree[i]["proj"] and tree[i]["proj"] < proj:
a = [st] + find_daddy(i)
return a
return []

return res


class PdfChunker(HuChunker):

def __init__(self, pdf_parser):
self.pdf = pdf_parser
super().__init__()

def tableHtmls(self, pdfnm):
_, tbls = self.pdf(pdfnm, return_html=True)
res = []
for img, arr in tbls:
if arr[0].find("<table>") < 0:
continue
buffered = BytesIO()
if img:
img.save(buffered, format="JPEG")
img_str = base64.b64encode(
buffered.getvalue()).decode('utf-8') if img else ""
res.append({"table": arr[0], "image": img_str})
return res

def html(self, pdfnm):
txts, tbls = self.pdf(pdfnm, return_html=True)
res = []
txt_cks = self.text_chunks(txts)
for txt, img in [(self.pdf.remove_tag(c), self.pdf.crop(c))
for c in txt_cks]:
buffered = BytesIO()
if img:
img.save(buffered, format="JPEG")
img_str = base64.b64encode(
buffered.getvalue()).decode('utf-8') if img else ""
res.append({"table": "<p>%s</p>" % txt.replace("\n", "<br/>"),
"image": img_str})

for img, arr in tbls:
if not arr:
continue
buffered = BytesIO()
if img:
img.save(buffered, format="JPEG")
img_str = base64.b64encode(
buffered.getvalue()).decode('utf-8') if img else ""
res.append({"table": arr[0], "image": img_str})

return res

def __call__(self, pdfnm, return_image=True, naive_chunk=False):
flds = self.Fields()
text, tbls = self.pdf(pdfnm)
fnm = pdfnm
txt_cks = self.text_chunks(text) if not naive_chunk else \
self.naive_text_chunk(text, ti=fnm if isinstance(fnm, str) else "")
flds.text_chunks = [(self.pdf.remove_tag(c),
self.pdf.crop(c) if return_image else None) for c in txt_cks]

flds.table_chunks = [(arr, img if return_image else None)
for img, arr in tbls]
return flds


class DocxChunker(HuChunker):

def __init__(self, doc_parser):
self.doc = doc_parser
super().__init__()

def _does_proj_match(self):
mat = []
for s in self.styles:
s = s.split(" ")[-1]
try:
mat.append(int(s))
except Exception as e:
mat.append(None)
return mat

def _merge(self):
i = 1
while i < len(self.lines):
if self.mat[i] == self.mat[i - 1] \
and len(self.lines[i - 1]) < 256 \
and len(self.lines[i]) < 256:
self.lines[i - 1] += "\n" + self.lines[i]
self.styles.pop(i)
self.lines.pop(i)
self.mat.pop(i)
continue
i += 1
self.mat = self._does_proj_match()
return self.mat

def __call__(self, fnm):
flds = self.Fields()
flds.title = os.path.splitext(
os.path.basename(fnm))[0] if isinstance(
fnm, type("")) else ""
secs, tbls = self.doc(fnm)
self.lines = [l for l, s in secs]
self.styles = [s for l, s in secs]

txt_cks = self.text_chunks("")
flds.text_chunks = [(t, None) for t in txt_cks if not self._garbage(t)]
flds.table_chunks = [(tb, None) for tb in tbls for t in tb if t]
return flds


class ExcelChunker(HuChunker):

def __init__(self, excel_parser):
self.excel = excel_parser
super().__init__()

def __call__(self, fnm):
flds = self.Fields()
flds.text_chunks = [(t, None) for t in self.excel(fnm)]
flds.table_chunks = []
return flds


class PptChunker(HuChunker):

def __init__(self):
super().__init__()

def __extract(self, shape):
if shape.shape_type == 19:
tb = shape.table
rows = []
for i in range(1, len(tb.rows)):
rows.append("; ".join([tb.cell(
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
return "\n".join(rows)

if shape.has_text_frame:
return shape.text_frame.text

if shape.shape_type == 6:
texts = []
for p in shape.shapes:
t = self.__extract(p)
if t:
texts.append(t)
return "\n".join(texts)

def __call__(self, fnm):
from pptx import Presentation
ppt = Presentation(fnm) if isinstance(
fnm, str) else Presentation(
BytesIO(fnm))
txts = []
for slide in ppt.slides:
texts = []
for shape in slide.shapes:
txt = self.__extract(shape)
if txt:
texts.append(txt)
txts.append("\n".join(texts))

import aspose.slides as slides
import aspose.pydrawing as drawing
imgs = []
with slides.Presentation(BytesIO(fnm)) as presentation:
for slide in presentation.slides:
buffered = BytesIO()
slide.get_thumbnail(
0.5, 0.5).save(
buffered, drawing.imaging.ImageFormat.jpeg)
imgs.append(buffered.getvalue())
assert len(imgs) == len(
txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))

flds = self.Fields()
flds.text_chunks = [(txts[i], imgs[i]) for i in range(len(txts))]
flds.table_chunks = []

return flds


class TextChunker(HuChunker):

@dataclass
class Fields:
text_chunks: List = None
table_chunks: List = None

def __init__(self):
super().__init__()

@staticmethod
def is_binary_file(file_path):
mime = magic.Magic(mime=True)
if isinstance(file_path, str):
file_type = mime.from_file(file_path)
else:
file_type = mime.from_buffer(file_path)
if 'text' in file_type:
return False
else:
return True

def __call__(self, fnm):
flds = self.Fields()
if self.is_binary_file(fnm):
return flds
txt = ""
if isinstance(fnm, str):
with open(fnm, "r") as f:
txt = f.read()
else:
txt = fnm.decode("utf-8")
flds.text_chunks = [(c, None) for c in self.naive_text_chunk(txt)]
flds.table_chunks = []
return flds


if __name__ == "__main__":
import sys
sys.path.append(os.path.dirname(__file__) + "/../")
if sys.argv[1].split(".")[-1].lower() == "pdf":
from deepdoc.parser import PdfParser
ckr = PdfChunker(PdfParser())
if sys.argv[1].split(".")[-1].lower().find("doc") >= 0:
from deepdoc.parser import DocxParser
ckr = DocxChunker(DocxParser())
if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0:
from deepdoc.parser import ExcelParser
ckr = ExcelChunker(ExcelParser())

# ckr.html(sys.argv[1])
print(ckr(sys.argv[1]))

rag/nlp/synonym.py (+2 / -2)

      try:
          self.dictionary = json.load(open(path, 'r'))
      except Exception as e:
-         logging.warning("Missing synonym.json")
+         logging.warn("Missing synonym.json")
          self.dictionary = {}

      if not redis:
          logging.warning(
-             "Real-time synonym is disabled, since no redis connection.")
+             "Realtime synonym is disabled, since no redis connection.")
      if not len(self.dictionary.keys()):
          logging.warning(f"Fail to load synonym")


rag/svr/cache_file_svr.py (+1 / -1)

  from api.db.db_models import close_connection
  from api.db.services.task_service import TaskService
- from rag.utils import MINIO
+ from rag.utils.minio_conn import MINIO
  from rag.utils.redis_conn import REDIS_CONN

rag/svr/task_broker.py (+3 / -3)

  from api.db.services.file_service import FileService
  from api.db.services.task_service import TaskService
  from deepdoc.parser import PdfParser
- from deepdoc.parser.excel_parser import HuExcelParser
+ from deepdoc.parser.excel_parser import RAGFlowExcelParser
  from rag.settings import cron_logger
- from rag.utils import MINIO
+ from rag.utils.minio_conn import MINIO
  from rag.utils import findMaxTm
  import pandas as pd
  from api.db import FileType, TaskStatus

                  tsks.append(task)
              elif r["parser_id"] == "table":
-                 rn = HuExcelParser.row_number(
+                 rn = RAGFlowExcelParser.row_number(
                      r["name"], file_bin)
                  for i in range(0, rn, 3000):
                      task = new_task()

rag/svr/task_executor.py (+2 / -2)

  from functools import partial

  from api.db.services.file2document_service import File2DocumentService
- from rag.utils import MINIO
+ from rag.utils.minio_conn import MINIO
  from api.db.db_models import close_connection
  from rag.settings import database_logger
  from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
  from elasticsearch_dsl import Q
  from multiprocessing.context import TimeoutError
  from api.db.services.task_service import TaskService
- from rag.utils import ELASTICSEARCH
+ from rag.utils.es_conn import ELASTICSEARCH
  from timeit import default_timer as timer
  from rag.utils import rmSpace, findMaxTm

rag/utils/__init__.py (+0 / -3)

      return _singleton

- from .minio_conn import MINIO
- from .es_conn import ELASTICSEARCH

  def rmSpace(txt):
      txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
      return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt, flags=re.IGNORECASE)
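The surviving `return _singleton` context line is the tail of the `singleton` decorator that `ESConnection` and `RAGFlowMinio` below are wrapped with; with the re-exports removed, `rag.utils` keeps only such generic helpers. A minimal sketch of what a decorator ending in `return _singleton` typically looks like, offered as an assumption for illustration rather than the actual code in `rag/utils/__init__.py`:

```python
# Illustrative only: one common shape for a class decorator whose body ends with
# `return _singleton`; the real implementation is not shown in this diff.
def singleton(cls, *args, **kw):
    instances = {}

    def _singleton():
        # Build the wrapped class at most once, then keep handing back the same object.
        if cls not in instances:
            instances[cls] = cls(*args, **kw)
        return instances[cls]
    return _singleton
```

Because the decorated name becomes a zero-argument factory, module-level assignments such as `ELASTICSEARCH = ESConnection()` and `MINIO = RAGFlowMinio()` in the hunks below hand the same connection object back to every importer.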

rag/utils/es_conn.py (+2 / -2)

  @singleton
- class HuEs:
+ class ESConnection:
      def __init__(self):
          self.info = {}
          self.conn()

          scroll_size = len(page['hits']['hits'])

- ELASTICSEARCH = HuEs()
+ ELASTICSEARCH = ESConnection()

rag/utils/minio_conn.py (+5 / -3)

  @singleton
- class HuMinio(object):
+ class RAGFlowMinio(object):
      def __init__(self):
          self.conn = None
          self.__open__()

              time.sleep(1)
          return

- MINIO = HuMinio()
+ MINIO = RAGFlowMinio()

  if __name__ == "__main__":
-     conn = HuMinio()
+     conn = RAGFlowMinio()
      fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
      from PIL import Image
      img = Image.open(fnm)
