瀏覽代碼

Fix typo in code (#8327)

### What problem does this PR solve?

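Fix typos in identifiers, comments, and log messages (for example `angents` → `agents`, `threashold` → `threshold`, `initRootLogger` → `init_root_logger`, `dosen't` → `doesn't`), and remove a duplicated `mp3` entry from the audio file-extension regex.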

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
tags/v0.19.1
Jin Hai 4 月之前
父節點
當前提交
4a2ff633e0
No account linked to committer's email address

+ 2
- 2
api/db/init_data.py 查看文件

{"role": "user", "content": "Hello!"}], gen_conf={}) {"role": "user", "content": "Hello!"}], gen_conf={})
if msg.find("ERROR: ") == 0: if msg.find("ERROR: ") == 0:
logging.error( logging.error(
"'{}' dosen't work. {}".format(
"'{}' doesn't work. {}".format(
tenant["llm_id"], tenant["llm_id"],
msg)) msg))
embd_mdl = LLMBundle(tenant["id"], LLMType.EMBEDDING, tenant["embd_id"]) embd_mdl = LLMBundle(tenant["id"], LLMType.EMBEDDING, tenant["embd_id"])
v, c = embd_mdl.encode(["Hello!"]) v, c = embd_mdl.encode(["Hello!"])
if c == 0: if c == 0:
logging.error( logging.error(
"'{}' dosen't work!".format(
"'{}' doesn't work!".format(
tenant["embd_id"])) tenant["embd_id"]))





+ 9
- 9
api/db/services/canvas_service.py 查看文件

User.nickname, User.nickname,
User.avatar.alias('tenant_avatar'), User.avatar.alias('tenant_avatar'),
] ]
angents = cls.model.select(*fields) \
agents = cls.model.select(*fields) \
.join(User, on=(cls.model.user_id == User.id)) \ .join(User, on=(cls.model.user_id == User.id)) \
.where(cls.model.id == pid) .where(cls.model.id == pid)
# obj = cls.model.query(id=pid)[0] # obj = cls.model.query(id=pid)[0]
return True, angents.dicts()[0]
return True, agents.dicts()[0]
except Exception as e: except Exception as e:
print(e) print(e)
return False, None return False, None
cls.model.update_time cls.model.update_time
] ]
if keywords: if keywords:
angents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where(
agents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where(
((cls.model.user_id.in_(joined_tenant_ids) & (cls.model.permission == ((cls.model.user_id.in_(joined_tenant_ids) & (cls.model.permission ==
TenantPermission.TEAM.value)) | ( TenantPermission.TEAM.value)) | (
cls.model.user_id == user_id)), cls.model.user_id == user_id)),
(fn.LOWER(cls.model.title).contains(keywords.lower())) (fn.LOWER(cls.model.title).contains(keywords.lower()))
) )
else: else:
angents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where(
agents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where(
((cls.model.user_id.in_(joined_tenant_ids) & (cls.model.permission == ((cls.model.user_id.in_(joined_tenant_ids) & (cls.model.permission ==
TenantPermission.TEAM.value)) | ( TenantPermission.TEAM.value)) | (
cls.model.user_id == user_id)) cls.model.user_id == user_id))
) )
if desc: if desc:
angents = angents.order_by(cls.model.getter_by(orderby).desc())
agents = agents.order_by(cls.model.getter_by(orderby).desc())
else: else:
angents = angents.order_by(cls.model.getter_by(orderby).asc())
count = angents.count()
angents = angents.paginate(page_number, items_per_page)
return list(angents.dicts()), count
agents = agents.order_by(cls.model.getter_by(orderby).asc())
count = agents.count()
agents = agents.paginate(page_number, items_per_page)
return list(agents.dicts()), count


def completion(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs): def completion(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs):

+ 2
- 2
api/ragflow_server.py 查看文件

# from beartype.claw import beartype_all # <-- you didn't sign up for this # from beartype.claw import beartype_all # <-- you didn't sign up for this
# beartype_all(conf=BeartypeConf(violation_type=UserWarning)) # <-- emit warnings from all code # beartype_all(conf=BeartypeConf(violation_type=UserWarning)) # <-- emit warnings from all code


from api.utils.log_utils import initRootLogger
from api.utils.log_utils import init_root_logger
from plugin import GlobalPluginManager from plugin import GlobalPluginManager
initRootLogger("ragflow_server")
init_root_logger("ragflow_server")


import logging import logging
import os import os

+ 1
- 1
api/utils/file_utils.py 查看文件

if re.match(r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename): if re.match(r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
return FileType.DOC.value return FileType.DOC.value


if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename):
if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus)$", filename):
return FileType.AURAL.value return FileType.AURAL.value


if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename): if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):

+ 1
- 1
api/utils/log_utils.py 查看文件

) )
return PROJECT_BASE return PROJECT_BASE


def initRootLogger(logfile_basename: str, log_format: str = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"):
def init_root_logger(logfile_basename: str, log_format: str = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"):
global initialized_root_logger global initialized_root_logger
if initialized_root_logger: if initialized_root_logger:
return return

+ 3
- 3
api/utils/t_crypt.py 查看文件





if __name__ == "__main__": if __name__ == "__main__":
pswd = crypt(sys.argv[1])
print(pswd)
print(decrypt(pswd))
passwd = crypt(sys.argv[1])
print(passwd)
print(decrypt(passwd))

+ 2
- 2
api/utils/validation_utils.py 查看文件

team = auto() team = auto()




class ChunkMethodnEnum(StrEnum):
class ChunkMethodEnum(StrEnum):
naive = auto() naive = auto()
book = auto() book = auto()
email = auto() email = auto()
description: str | None = Field(default=None, max_length=65535) description: str | None = Field(default=None, max_length=65535)
embedding_model: Annotated[str, StringConstraints(strip_whitespace=True, max_length=255), Field(default="", serialization_alias="embd_id")] embedding_model: Annotated[str, StringConstraints(strip_whitespace=True, max_length=255), Field(default="", serialization_alias="embd_id")]
permission: PermissionEnum = Field(default=PermissionEnum.me, min_length=1, max_length=16) permission: PermissionEnum = Field(default=PermissionEnum.me, min_length=1, max_length=16)
chunk_method: ChunkMethodnEnum = Field(default=ChunkMethodnEnum.naive, min_length=1, max_length=32, serialization_alias="parser_id")
chunk_method: ChunkMethodEnum = Field(default=ChunkMethodEnum.naive, min_length=1, max_length=32, serialization_alias="parser_id")
parser_config: ParserConfig | None = Field(default=None) parser_config: ParserConfig | None = Field(default=None)


@field_validator("avatar") @field_validator("avatar")

+ 1
- 1
deepdoc/parser/docx_parser.py 查看文件

max_type = max(max_type.items(), key=lambda x: x[1])[0] max_type = max(max_type.items(), key=lambda x: x[1])[0]


colnm = len(df.iloc[0, :]) colnm = len(df.iloc[0, :])
hdrows = [0] # header is not nessesarily appear in the first line
hdrows = [0] # header is not necessarily appear in the first line
if max_type == "Nu": if max_type == "Nu":
for r in range(1, len(df)): for r in range(1, len(df)):
tys = Counter([blockType(str(df.iloc[r, j])) tys = Counter([blockType(str(df.iloc[r, j]))

+ 1
- 1
deepdoc/parser/figure_parser.py 查看文件

from rag.prompts import vision_llm_figure_describe_prompt from rag.prompts import vision_llm_figure_describe_prompt




def vision_figure_parser_figure_data_wraper(figures_data_without_positions):
def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
return [ return [
( (
(figure_data[1], [figure_data[0]]), (figure_data[1], [figure_data[0]]),

+ 5
- 5
deepdoc/parser/pdf_parser.py 查看文件

return fea return fea


@staticmethod @staticmethod
def sort_X_by_page(arr, threashold):
def sort_X_by_page(arr, threshold):
# sort using y1 first and then x1 # sort using y1 first and then x1
arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"])) arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
for i in range(len(arr) - 1): for i in range(len(arr) - 1):
for j in range(i, -1, -1): for j in range(i, -1, -1):
# restore the order using th # restore the order using th
if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold \
and arr[j + 1]["top"] < arr[j]["top"] \ and arr[j + 1]["top"] < arr[j]["top"] \
and arr[j + 1]["page_number"] == arr[j]["page_number"]: and arr[j + 1]["page_number"] == arr[j]["page_number"]:
tmp = arr[j] tmp = arr[j]
for b in self.boxes: for b in self.boxes:
if b.get("layout_type", "") != "table": if b.get("layout_type", "") != "table":
continue continue
ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
ii = Recognizer.find_overlapped_with_threshold(b, rows, thr=0.3)
if ii is not None: if ii is not None:
b["R"] = ii b["R"] = ii
b["R_top"] = rows[ii]["top"] b["R_top"] = rows[ii]["top"]
b["R_bott"] = rows[ii]["bottom"] b["R_bott"] = rows[ii]["bottom"]


ii = Recognizer.find_overlapped_with_threashold(
ii = Recognizer.find_overlapped_with_threshold(
b, headers, thr=0.3) b, headers, thr=0.3)
if ii is not None: if ii is not None:
b["H_top"] = headers[ii]["top"] b["H_top"] = headers[ii]["top"]
b["C_left"] = clmns[ii]["x0"] b["C_left"] = clmns[ii]["x0"]
b["C_right"] = clmns[ii]["x1"] b["C_right"] = clmns[ii]["x1"]


ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
ii = Recognizer.find_overlapped_with_threshold(b, spans, thr=0.3)
if ii is not None: if ii is not None:
b["H_top"] = spans[ii]["top"] b["H_top"] = spans[ii]["top"]
b["H_bott"] = spans[ii]["bottom"] b["H_bott"] = spans[ii]["bottom"]

+ 1
- 1
deepdoc/vision/layout_recognizer.py 查看文件

bxs.pop(i) bxs.pop(i)
continue continue


ii = self.find_overlapped_with_threashold(bxs[i], lts_,
ii = self.find_overlapped_with_threshold(bxs[i], lts_,
thr=0.4) thr=0.4)
if ii is None: # belong to nothing if ii is None: # belong to nothing
bxs[i]["layout_type"] = "" bxs[i]["layout_type"] = ""

+ 10
- 10
deepdoc/vision/recognizer.py 查看文件

self.label_list = label_list self.label_list = label_list


@staticmethod @staticmethod
def sort_Y_firstly(arr, threashold):
def sort_Y_firstly(arr, threshold):
def cmp(c1, c2): def cmp(c1, c2):
diff = c1["top"] - c2["top"] diff = c1["top"] - c2["top"]
if abs(diff) < threashold:
if abs(diff) < threshold:
diff = c1["x0"] - c2["x0"] diff = c1["x0"] - c2["x0"]
return diff return diff
arr = sorted(arr, key=cmp_to_key(cmp)) arr = sorted(arr, key=cmp_to_key(cmp))
return arr return arr


@staticmethod @staticmethod
def sort_X_firstly(arr, threashold):
def sort_X_firstly(arr, threshold):
def cmp(c1, c2): def cmp(c1, c2):
diff = c1["x0"] - c2["x0"] diff = c1["x0"] - c2["x0"]
if abs(diff) < threashold:
if abs(diff) < threshold:
diff = c1["top"] - c2["top"] diff = c1["top"] - c2["top"]
return diff return diff
arr = sorted(arr, key=cmp_to_key(cmp)) arr = sorted(arr, key=cmp_to_key(cmp))
e -= 1 e -= 1
break break


max_overlaped_i, max_overlaped = None, 0
max_overlapped_i, max_overlapped = None, 0
for i in range(s, e): for i in range(s, e):
ov = Recognizer.overlapped_area(bxs[i], box) ov = Recognizer.overlapped_area(bxs[i], box)
if ov <= max_overlaped:
if ov <= max_overlapped:
continue continue
max_overlaped_i = i
max_overlaped = ov
max_overlapped_i = i
max_overlapped = ov


return max_overlaped_i
return max_overlapped_i


@staticmethod @staticmethod
def find_horizontally_tightest_fit(box, boxes): def find_horizontally_tightest_fit(box, boxes):
return min_i return min_i


@staticmethod @staticmethod
def find_overlapped_with_threashold(box, boxes, thr=0.3):
def find_overlapped_with_threshold(box, boxes, thr=0.3):
if not boxes: if not boxes:
return return
max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0 max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0

+ 3
- 3
deepdoc/vision/t_recognizer.py 查看文件

clmns = LayoutRecognizer.layouts_cleanup(boxes, clmns, 5, 0.5) clmns = LayoutRecognizer.layouts_cleanup(boxes, clmns, 5, 0.5)


for b in boxes: for b in boxes:
ii = LayoutRecognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
ii = LayoutRecognizer.find_overlapped_with_threshold(b, rows, thr=0.3)
if ii is not None: if ii is not None:
b["R"] = ii b["R"] = ii
b["R_top"] = rows[ii]["top"] b["R_top"] = rows[ii]["top"]
b["R_bott"] = rows[ii]["bottom"] b["R_bott"] = rows[ii]["bottom"]


ii = LayoutRecognizer.find_overlapped_with_threashold(b, headers, thr=0.3)
ii = LayoutRecognizer.find_overlapped_with_threshold(b, headers, thr=0.3)
if ii is not None: if ii is not None:
b["H_top"] = headers[ii]["top"] b["H_top"] = headers[ii]["top"]
b["H_bott"] = headers[ii]["bottom"] b["H_bott"] = headers[ii]["bottom"]
b["C_left"] = clmns[ii]["x0"] b["C_left"] = clmns[ii]["x0"]
b["C_right"] = clmns[ii]["x1"] b["C_right"] = clmns[ii]["x1"]


ii = LayoutRecognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
ii = LayoutRecognizer.find_overlapped_with_threshold(b, spans, thr=0.3)
if ii is not None: if ii is not None:
b["H_top"] = spans[ii]["top"] b["H_top"] = spans[ii]["top"]
b["H_bott"] = spans[ii]["bottom"] b["H_bott"] = spans[ii]["bottom"]

+ 2
- 2
rag/app/naive.py 查看文件

from api.db import LLMType from api.db import LLMType
from api.db.services.llm_service import LLMBundle from api.db.services.llm_service import LLMBundle
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table


sections, tables = Docx()(filename, binary) sections, tables = Docx()(filename, binary)


if vision_model: if vision_model:
figures_data = vision_figure_parser_figure_data_wraper(sections)
figures_data = vision_figure_parser_figure_data_wrapper(sections)
try: try:
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs) docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
boosted_figures = docx_vision_parser(callback=callback) boosted_figures = docx_vision_parser(callback=callback)

+ 2
- 2
rag/svr/task_executor.py 查看文件

import threading import threading
import time import time


from api.utils.log_utils import initRootLogger, get_project_base_directory
from api.utils.log_utils import init_root_logger, get_project_base_directory
from graphrag.general.index import run_graphrag from graphrag.general.index import run_graphrag
from graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache from graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache
from rag.prompts import keyword_extraction, question_proposal, content_tagging from rag.prompts import keyword_extraction, question_proposal, content_tagging


if __name__ == "__main__": if __name__ == "__main__":
faulthandler.enable() faulthandler.enable()
initRootLogger(CONSUMER_NAME)
init_root_logger(CONSUMER_NAME)
trio.run(main) trio.run(main)

Loading…
取消
儲存