### What problem does this PR solve?

Fix typo in code

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>

tags/v0.19.1
| {"role": "user", "content": "Hello!"}], gen_conf={}) | {"role": "user", "content": "Hello!"}], gen_conf={}) | ||||
| if msg.find("ERROR: ") == 0: | if msg.find("ERROR: ") == 0: | ||||
| logging.error( | logging.error( | ||||
| "'{}' dosen't work. {}".format( | |||||
| "'{}' doesn't work. {}".format( | |||||
| tenant["llm_id"], | tenant["llm_id"], | ||||
| msg)) | msg)) | ||||
| embd_mdl = LLMBundle(tenant["id"], LLMType.EMBEDDING, tenant["embd_id"]) | embd_mdl = LLMBundle(tenant["id"], LLMType.EMBEDDING, tenant["embd_id"]) | ||||
| v, c = embd_mdl.encode(["Hello!"]) | v, c = embd_mdl.encode(["Hello!"]) | ||||
| if c == 0: | if c == 0: | ||||
| logging.error( | logging.error( | ||||
| "'{}' dosen't work!".format( | |||||
| "'{}' doesn't work!".format( | |||||
| tenant["embd_id"])) | tenant["embd_id"])) | ||||
| User.nickname, | User.nickname, | ||||
| User.avatar.alias('tenant_avatar'), | User.avatar.alias('tenant_avatar'), | ||||
| ] | ] | ||||
| angents = cls.model.select(*fields) \ | |||||
| agents = cls.model.select(*fields) \ | |||||
| .join(User, on=(cls.model.user_id == User.id)) \ | .join(User, on=(cls.model.user_id == User.id)) \ | ||||
| .where(cls.model.id == pid) | .where(cls.model.id == pid) | ||||
| # obj = cls.model.query(id=pid)[0] | # obj = cls.model.query(id=pid)[0] | ||||
| return True, angents.dicts()[0] | |||||
| return True, agents.dicts()[0] | |||||
| except Exception as e: | except Exception as e: | ||||
| print(e) | print(e) | ||||
| return False, None | return False, None | ||||
| cls.model.update_time | cls.model.update_time | ||||
| ] | ] | ||||
| if keywords: | if keywords: | ||||
| angents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where( | |||||
| agents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where( | |||||
| ((cls.model.user_id.in_(joined_tenant_ids) & (cls.model.permission == | ((cls.model.user_id.in_(joined_tenant_ids) & (cls.model.permission == | ||||
| TenantPermission.TEAM.value)) | ( | TenantPermission.TEAM.value)) | ( | ||||
| cls.model.user_id == user_id)), | cls.model.user_id == user_id)), | ||||
| (fn.LOWER(cls.model.title).contains(keywords.lower())) | (fn.LOWER(cls.model.title).contains(keywords.lower())) | ||||
| ) | ) | ||||
| else: | else: | ||||
| angents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where( | |||||
| agents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where( | |||||
| ((cls.model.user_id.in_(joined_tenant_ids) & (cls.model.permission == | ((cls.model.user_id.in_(joined_tenant_ids) & (cls.model.permission == | ||||
| TenantPermission.TEAM.value)) | ( | TenantPermission.TEAM.value)) | ( | ||||
| cls.model.user_id == user_id)) | cls.model.user_id == user_id)) | ||||
| ) | ) | ||||
| if desc: | if desc: | ||||
| angents = angents.order_by(cls.model.getter_by(orderby).desc()) | |||||
| agents = agents.order_by(cls.model.getter_by(orderby).desc()) | |||||
| else: | else: | ||||
| angents = angents.order_by(cls.model.getter_by(orderby).asc()) | |||||
| count = angents.count() | |||||
| angents = angents.paginate(page_number, items_per_page) | |||||
| return list(angents.dicts()), count | |||||
| agents = agents.order_by(cls.model.getter_by(orderby).asc()) | |||||
| count = agents.count() | |||||
| agents = agents.paginate(page_number, items_per_page) | |||||
| return list(agents.dicts()), count | |||||
| def completion(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs): | def completion(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs): | 
| # from beartype.claw import beartype_all # <-- you didn't sign up for this | # from beartype.claw import beartype_all # <-- you didn't sign up for this | ||||
| # beartype_all(conf=BeartypeConf(violation_type=UserWarning)) # <-- emit warnings from all code | # beartype_all(conf=BeartypeConf(violation_type=UserWarning)) # <-- emit warnings from all code | ||||
| from api.utils.log_utils import initRootLogger | |||||
| from api.utils.log_utils import init_root_logger | |||||
| from plugin import GlobalPluginManager | from plugin import GlobalPluginManager | ||||
| initRootLogger("ragflow_server") | |||||
| init_root_logger("ragflow_server") | |||||
| import logging | import logging | ||||
| import os | import os | 
| if re.match(r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename): | if re.match(r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename): | ||||
| return FileType.DOC.value | return FileType.DOC.value | ||||
| if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename): | |||||
| if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus)$", filename): | |||||
| return FileType.AURAL.value | return FileType.AURAL.value | ||||
| if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename): | if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename): | 
| ) | ) | ||||
| return PROJECT_BASE | return PROJECT_BASE | ||||
| def initRootLogger(logfile_basename: str, log_format: str = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"): | |||||
| def init_root_logger(logfile_basename: str, log_format: str = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"): | |||||
| global initialized_root_logger | global initialized_root_logger | ||||
| if initialized_root_logger: | if initialized_root_logger: | ||||
| return | return | 
| if __name__ == "__main__": | if __name__ == "__main__": | ||||
| pswd = crypt(sys.argv[1]) | |||||
| print(pswd) | |||||
| print(decrypt(pswd)) | |||||
| passwd = crypt(sys.argv[1]) | |||||
| print(passwd) | |||||
| print(decrypt(passwd)) | 
| team = auto() | team = auto() | ||||
| class ChunkMethodnEnum(StrEnum): | |||||
| class ChunkMethodEnum(StrEnum): | |||||
| naive = auto() | naive = auto() | ||||
| book = auto() | book = auto() | ||||
| email = auto() | email = auto() | ||||
| description: str | None = Field(default=None, max_length=65535) | description: str | None = Field(default=None, max_length=65535) | ||||
| embedding_model: Annotated[str, StringConstraints(strip_whitespace=True, max_length=255), Field(default="", serialization_alias="embd_id")] | embedding_model: Annotated[str, StringConstraints(strip_whitespace=True, max_length=255), Field(default="", serialization_alias="embd_id")] | ||||
| permission: PermissionEnum = Field(default=PermissionEnum.me, min_length=1, max_length=16) | permission: PermissionEnum = Field(default=PermissionEnum.me, min_length=1, max_length=16) | ||||
| chunk_method: ChunkMethodnEnum = Field(default=ChunkMethodnEnum.naive, min_length=1, max_length=32, serialization_alias="parser_id") | |||||
| chunk_method: ChunkMethodEnum = Field(default=ChunkMethodEnum.naive, min_length=1, max_length=32, serialization_alias="parser_id") | |||||
| parser_config: ParserConfig | None = Field(default=None) | parser_config: ParserConfig | None = Field(default=None) | ||||
| @field_validator("avatar") | @field_validator("avatar") | 
| max_type = max(max_type.items(), key=lambda x: x[1])[0] | max_type = max(max_type.items(), key=lambda x: x[1])[0] | ||||
| colnm = len(df.iloc[0, :]) | colnm = len(df.iloc[0, :]) | ||||
| hdrows = [0] # header is not nessesarily appear in the first line | |||||
| hdrows = [0] # header does not necessarily appear in the first line | |||||
| if max_type == "Nu": | if max_type == "Nu": | ||||
| for r in range(1, len(df)): | for r in range(1, len(df)): | ||||
| tys = Counter([blockType(str(df.iloc[r, j])) | tys = Counter([blockType(str(df.iloc[r, j])) | 
| from rag.prompts import vision_llm_figure_describe_prompt | from rag.prompts import vision_llm_figure_describe_prompt | ||||
| def vision_figure_parser_figure_data_wraper(figures_data_without_positions): | |||||
| def vision_figure_parser_figure_data_wrapper(figures_data_without_positions): | |||||
| return [ | return [ | ||||
| ( | ( | ||||
| (figure_data[1], [figure_data[0]]), | (figure_data[1], [figure_data[0]]), | 
| return fea | return fea | ||||
| @staticmethod | @staticmethod | ||||
| def sort_X_by_page(arr, threashold): | |||||
| def sort_X_by_page(arr, threshold): | |||||
| # sort using y1 first and then x1 | # sort using y1 first and then x1 | ||||
| arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"])) | arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"])) | ||||
| for i in range(len(arr) - 1): | for i in range(len(arr) - 1): | ||||
| for j in range(i, -1, -1): | for j in range(i, -1, -1): | ||||
| # restore the order using th | # restore the order using th | ||||
| if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \ | |||||
| if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold \ | |||||
| and arr[j + 1]["top"] < arr[j]["top"] \ | and arr[j + 1]["top"] < arr[j]["top"] \ | ||||
| and arr[j + 1]["page_number"] == arr[j]["page_number"]: | and arr[j + 1]["page_number"] == arr[j]["page_number"]: | ||||
| tmp = arr[j] | tmp = arr[j] | ||||
| for b in self.boxes: | for b in self.boxes: | ||||
| if b.get("layout_type", "") != "table": | if b.get("layout_type", "") != "table": | ||||
| continue | continue | ||||
| ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3) | |||||
| ii = Recognizer.find_overlapped_with_threshold(b, rows, thr=0.3) | |||||
| if ii is not None: | if ii is not None: | ||||
| b["R"] = ii | b["R"] = ii | ||||
| b["R_top"] = rows[ii]["top"] | b["R_top"] = rows[ii]["top"] | ||||
| b["R_bott"] = rows[ii]["bottom"] | b["R_bott"] = rows[ii]["bottom"] | ||||
| ii = Recognizer.find_overlapped_with_threashold( | |||||
| ii = Recognizer.find_overlapped_with_threshold( | |||||
| b, headers, thr=0.3) | b, headers, thr=0.3) | ||||
| if ii is not None: | if ii is not None: | ||||
| b["H_top"] = headers[ii]["top"] | b["H_top"] = headers[ii]["top"] | ||||
| b["C_left"] = clmns[ii]["x0"] | b["C_left"] = clmns[ii]["x0"] | ||||
| b["C_right"] = clmns[ii]["x1"] | b["C_right"] = clmns[ii]["x1"] | ||||
| ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3) | |||||
| ii = Recognizer.find_overlapped_with_threshold(b, spans, thr=0.3) | |||||
| if ii is not None: | if ii is not None: | ||||
| b["H_top"] = spans[ii]["top"] | b["H_top"] = spans[ii]["top"] | ||||
| b["H_bott"] = spans[ii]["bottom"] | b["H_bott"] = spans[ii]["bottom"] | 
| bxs.pop(i) | bxs.pop(i) | ||||
| continue | continue | ||||
| ii = self.find_overlapped_with_threashold(bxs[i], lts_, | |||||
| ii = self.find_overlapped_with_threshold(bxs[i], lts_, | |||||
| thr=0.4) | thr=0.4) | ||||
| if ii is None: # belong to nothing | if ii is None: # belong to nothing | ||||
| bxs[i]["layout_type"] = "" | bxs[i]["layout_type"] = "" | 
| self.label_list = label_list | self.label_list = label_list | ||||
| @staticmethod | @staticmethod | ||||
| def sort_Y_firstly(arr, threashold): | |||||
| def sort_Y_firstly(arr, threshold): | |||||
| def cmp(c1, c2): | def cmp(c1, c2): | ||||
| diff = c1["top"] - c2["top"] | diff = c1["top"] - c2["top"] | ||||
| if abs(diff) < threashold: | |||||
| if abs(diff) < threshold: | |||||
| diff = c1["x0"] - c2["x0"] | diff = c1["x0"] - c2["x0"] | ||||
| return diff | return diff | ||||
| arr = sorted(arr, key=cmp_to_key(cmp)) | arr = sorted(arr, key=cmp_to_key(cmp)) | ||||
| return arr | return arr | ||||
| @staticmethod | @staticmethod | ||||
| def sort_X_firstly(arr, threashold): | |||||
| def sort_X_firstly(arr, threshold): | |||||
| def cmp(c1, c2): | def cmp(c1, c2): | ||||
| diff = c1["x0"] - c2["x0"] | diff = c1["x0"] - c2["x0"] | ||||
| if abs(diff) < threashold: | |||||
| if abs(diff) < threshold: | |||||
| diff = c1["top"] - c2["top"] | diff = c1["top"] - c2["top"] | ||||
| return diff | return diff | ||||
| arr = sorted(arr, key=cmp_to_key(cmp)) | arr = sorted(arr, key=cmp_to_key(cmp)) | ||||
| e -= 1 | e -= 1 | ||||
| break | break | ||||
| max_overlaped_i, max_overlaped = None, 0 | |||||
| max_overlapped_i, max_overlapped = None, 0 | |||||
| for i in range(s, e): | for i in range(s, e): | ||||
| ov = Recognizer.overlapped_area(bxs[i], box) | ov = Recognizer.overlapped_area(bxs[i], box) | ||||
| if ov <= max_overlaped: | |||||
| if ov <= max_overlapped: | |||||
| continue | continue | ||||
| max_overlaped_i = i | |||||
| max_overlaped = ov | |||||
| max_overlapped_i = i | |||||
| max_overlapped = ov | |||||
| return max_overlaped_i | |||||
| return max_overlapped_i | |||||
| @staticmethod | @staticmethod | ||||
| def find_horizontally_tightest_fit(box, boxes): | def find_horizontally_tightest_fit(box, boxes): | ||||
| return min_i | return min_i | ||||
| @staticmethod | @staticmethod | ||||
| def find_overlapped_with_threashold(box, boxes, thr=0.3): | |||||
| def find_overlapped_with_threshold(box, boxes, thr=0.3): | |||||
| if not boxes: | if not boxes: | ||||
| return | return | ||||
| max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0 | max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0 | 
| clmns = LayoutRecognizer.layouts_cleanup(boxes, clmns, 5, 0.5) | clmns = LayoutRecognizer.layouts_cleanup(boxes, clmns, 5, 0.5) | ||||
| for b in boxes: | for b in boxes: | ||||
| ii = LayoutRecognizer.find_overlapped_with_threashold(b, rows, thr=0.3) | |||||
| ii = LayoutRecognizer.find_overlapped_with_threshold(b, rows, thr=0.3) | |||||
| if ii is not None: | if ii is not None: | ||||
| b["R"] = ii | b["R"] = ii | ||||
| b["R_top"] = rows[ii]["top"] | b["R_top"] = rows[ii]["top"] | ||||
| b["R_bott"] = rows[ii]["bottom"] | b["R_bott"] = rows[ii]["bottom"] | ||||
| ii = LayoutRecognizer.find_overlapped_with_threashold(b, headers, thr=0.3) | |||||
| ii = LayoutRecognizer.find_overlapped_with_threshold(b, headers, thr=0.3) | |||||
| if ii is not None: | if ii is not None: | ||||
| b["H_top"] = headers[ii]["top"] | b["H_top"] = headers[ii]["top"] | ||||
| b["H_bott"] = headers[ii]["bottom"] | b["H_bott"] = headers[ii]["bottom"] | ||||
| b["C_left"] = clmns[ii]["x0"] | b["C_left"] = clmns[ii]["x0"] | ||||
| b["C_right"] = clmns[ii]["x1"] | b["C_right"] = clmns[ii]["x1"] | ||||
| ii = LayoutRecognizer.find_overlapped_with_threashold(b, spans, thr=0.3) | |||||
| ii = LayoutRecognizer.find_overlapped_with_threshold(b, spans, thr=0.3) | |||||
| if ii is not None: | if ii is not None: | ||||
| b["H_top"] = spans[ii]["top"] | b["H_top"] = spans[ii]["top"] | ||||
| b["H_bott"] = spans[ii]["bottom"] | b["H_bott"] = spans[ii]["bottom"] | 
| from api.db import LLMType | from api.db import LLMType | ||||
| from api.db.services.llm_service import LLMBundle | from api.db.services.llm_service import LLMBundle | ||||
| from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser | from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser | ||||
| from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper | |||||
| from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper | |||||
| from deepdoc.parser.pdf_parser import PlainParser, VisionParser | from deepdoc.parser.pdf_parser import PlainParser, VisionParser | ||||
| from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table | from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table | ||||
| sections, tables = Docx()(filename, binary) | sections, tables = Docx()(filename, binary) | ||||
| if vision_model: | if vision_model: | ||||
| figures_data = vision_figure_parser_figure_data_wraper(sections) | |||||
| figures_data = vision_figure_parser_figure_data_wrapper(sections) | |||||
| try: | try: | ||||
| docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs) | docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs) | ||||
| boosted_figures = docx_vision_parser(callback=callback) | boosted_figures = docx_vision_parser(callback=callback) | 
| import threading | import threading | ||||
| import time | import time | ||||
| from api.utils.log_utils import initRootLogger, get_project_base_directory | |||||
| from api.utils.log_utils import init_root_logger, get_project_base_directory | |||||
| from graphrag.general.index import run_graphrag | from graphrag.general.index import run_graphrag | ||||
| from graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache | from graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache | ||||
| from rag.prompts import keyword_extraction, question_proposal, content_tagging | from rag.prompts import keyword_extraction, question_proposal, content_tagging | ||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||
| faulthandler.enable() | faulthandler.enable() | ||||
| initRootLogger(CONSUMER_NAME) | |||||
| init_root_logger(CONSUMER_NAME) | |||||
| trio.run(main) | trio.run(main) |