### What problem does this PR solve?

_Briefly describe what this PR aims to solve. Include background context that will help reviewers understand the purpose of the PR._

### Type of change

- [x] Refactoring

---------

Signed-off-by: jinhai <haijin.chn@gmail.com>

tags/v0.15.0
| from api.db.init_data import init_web_data | from api.db.init_data import init_web_data | ||||
| from api.versions import get_ragflow_version | from api.versions import get_ragflow_version | ||||
| from api.utils import show_configs | from api.utils import show_configs | ||||
| from rag.settings import print_rag_settings | |||||
| def update_progress(): | def update_progress(): | ||||
| ) | ) | ||||
| show_configs() | show_configs() | ||||
| settings.init_settings() | settings.init_settings() | ||||
| print_rag_settings() | |||||
| # init db | # init db | ||||
| init_web_db() | init_web_db() | 
| class Pdf(PdfParser): | class Pdf(PdfParser): | ||||
| def __call__(self, filename, binary=None, from_page=0, | def __call__(self, filename, binary=None, from_page=0, | ||||
| to_page=100000, zoomin=3, callback=None): | to_page=100000, zoomin=3, callback=None): | ||||
| callback(msg="OCR is running...") | |||||
| from timeit import default_timer as timer | |||||
| start = timer() | |||||
| callback(msg="OCR started") | |||||
| self.__images__( | self.__images__( | ||||
| filename if not binary else binary, | filename if not binary else binary, | ||||
| zoomin, | zoomin, | ||||
| from_page, | from_page, | ||||
| to_page, | to_page, | ||||
| callback) | callback) | ||||
| callback(msg="OCR finished") | |||||
| callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) | |||||
| from timeit import default_timer as timer | |||||
| start = timer() | start = timer() | ||||
| self._layouts_rec(zoomin) | self._layouts_rec(zoomin) | ||||
| callback(0.67, "Layout analysis finished") | |||||
| callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start)) | |||||
| logging.debug("layouts: {}".format(timer() - start)) | logging.debug("layouts: {}".format(timer() - start)) | ||||
| start = timer() | |||||
| self._table_transformer_job(zoomin) | self._table_transformer_job(zoomin) | ||||
| callback(0.68, "Table analysis finished") | |||||
| callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start)) | |||||
| start = timer() | |||||
| self._text_merge() | self._text_merge() | ||||
| tbls = self._extract_table_figure(True, zoomin, True, True) | tbls = self._extract_table_figure(True, zoomin, True, True) | ||||
| self._naive_vertical_merge() | self._naive_vertical_merge() | ||||
| self._filter_forpages() | self._filter_forpages() | ||||
| self._merge_with_same_bullet() | self._merge_with_same_bullet() | ||||
| callback(0.75, "Text merging finished.") | |||||
| callback(0.8, "Text extraction finished") | |||||
| callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start)) | |||||
| return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) | return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) | ||||
| for b in self.boxes], tbls | for b in self.boxes], tbls | 
| def __call__(self, filename, binary=None, from_page=0, | def __call__(self, filename, binary=None, from_page=0, | ||||
| to_page=100000, zoomin=3, callback=None): | to_page=100000, zoomin=3, callback=None): | ||||
| callback(msg="OCR is running...") | |||||
| from timeit import default_timer as timer | |||||
| start = timer() | |||||
| callback(msg="OCR started") | |||||
| self.__images__( | self.__images__( | ||||
| filename if not binary else binary, | filename if not binary else binary, | ||||
| zoomin, | zoomin, | ||||
| to_page, | to_page, | ||||
| callback | callback | ||||
| ) | ) | ||||
| callback(msg="OCR finished") | |||||
| callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) | |||||
| from timeit import default_timer as timer | |||||
| start = timer() | start = timer() | ||||
| self._layouts_rec(zoomin) | self._layouts_rec(zoomin) | ||||
| callback(0.67, "Layout analysis finished") | |||||
| callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start)) | |||||
| logging.debug("layouts:".format( | logging.debug("layouts:".format( | ||||
| )) | )) | ||||
| self._naive_vertical_merge() | self._naive_vertical_merge() | ||||
| callback(0.8, "Text extraction finished") | |||||
| callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start)) | |||||
| return [(b["text"], self._line_tag(b, zoomin)) | return [(b["text"], self._line_tag(b, zoomin)) | ||||
| for b in self.boxes], None | for b in self.boxes], None | 
| to_page=100000, zoomin=3, callback=None): | to_page=100000, zoomin=3, callback=None): | ||||
| from timeit import default_timer as timer | from timeit import default_timer as timer | ||||
| start = timer() | start = timer() | ||||
| callback(msg="OCR is running...") | |||||
| callback(msg="OCR started") | |||||
| self.__images__( | self.__images__( | ||||
| filename if not binary else binary, | filename if not binary else binary, | ||||
| zoomin, | zoomin, | ||||
| to_page, | to_page, | ||||
| callback | callback | ||||
| ) | ) | ||||
| callback(msg="OCR finished.") | |||||
| callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) | |||||
| # for bb in self.boxes: | # for bb in self.boxes: | ||||
| # for b in bb: | # for b in bb: | ||||
| # print(b) | # print(b) | ||||
| logging.debug("OCR: {}".format(timer() - start)) | logging.debug("OCR: {}".format(timer() - start)) | ||||
| start = timer() | |||||
| self._layouts_rec(zoomin) | self._layouts_rec(zoomin) | ||||
| callback(0.65, "Layout analysis finished.") | |||||
| callback(0.65, "Layout analysis ({:.2f}s)".format(timer() - start)) | |||||
| logging.debug("layouts: {}".format(timer() - start)) | logging.debug("layouts: {}".format(timer() - start)) | ||||
| start = timer() | |||||
| self._table_transformer_job(zoomin) | self._table_transformer_job(zoomin) | ||||
| callback(0.67, "Table analysis finished.") | |||||
| callback(0.67, "Table analysis ({:.2f}s)".format(timer() - start)) | |||||
| start = timer() | |||||
| self._text_merge() | self._text_merge() | ||||
| tbls = self._extract_table_figure(True, zoomin, True, True) | tbls = self._extract_table_figure(True, zoomin, True, True) | ||||
| self._concat_downward() | self._concat_downward() | ||||
| self._filter_forpages() | self._filter_forpages() | ||||
| callback(0.68, "Text merging finished") | |||||
| callback(0.68, "Text merged ({:.2f}s)".format(timer() - start)) | |||||
| # clean mess | # clean mess | ||||
| for b in self.boxes: | for b in self.boxes: | 
| def __call__(self, filename, binary=None, from_page=0, | def __call__(self, filename, binary=None, from_page=0, | ||||
| to_page=100000, zoomin=3, callback=None): | to_page=100000, zoomin=3, callback=None): | ||||
| start = timer() | start = timer() | ||||
| callback(msg="OCR is running...") | |||||
| first_start = start | |||||
| callback(msg="OCR started") | |||||
| self.__images__( | self.__images__( | ||||
| filename if not binary else binary, | filename if not binary else binary, | ||||
| zoomin, | zoomin, | ||||
| to_page, | to_page, | ||||
| callback | callback | ||||
| ) | ) | ||||
| callback(msg="OCR finished") | |||||
| logging.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start)) | |||||
| callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) | |||||
| logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start)) | |||||
| start = timer() | start = timer() | ||||
| self._layouts_rec(zoomin) | self._layouts_rec(zoomin) | ||||
| callback(0.63, "Layout analysis finished.") | |||||
| callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) | |||||
| start = timer() | |||||
| self._table_transformer_job(zoomin) | self._table_transformer_job(zoomin) | ||||
| callback(0.65, "Table analysis finished.") | |||||
| callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start)) | |||||
| start = timer() | |||||
| self._text_merge() | self._text_merge() | ||||
| callback(0.67, "Text merging finished") | |||||
| callback(0.67, "Text merged ({:.2f}s)".format(timer() - start)) | |||||
| tbls = self._extract_table_figure(True, zoomin, True, True) | tbls = self._extract_table_figure(True, zoomin, True, True) | ||||
| # self._naive_vertical_merge() | # self._naive_vertical_merge() | ||||
| self._concat_downward() | self._concat_downward() | ||||
| # self._filter_forpages() | # self._filter_forpages() | ||||
| logging.info("layouts cost: {}s".format(timer() - start)) | |||||
| logging.info("layouts cost: {}s".format(timer() - first_start)) | |||||
| return [(b["text"], self._line_tag(b, zoomin)) | return [(b["text"], self._line_tag(b, zoomin)) | ||||
| for b in self.boxes], tbls | for b in self.boxes], tbls | ||||
| else: | else: | ||||
| if sections and sections[-1][0].strip().find("#") == 0: | if sections and sections[-1][0].strip().find("#") == 0: | ||||
| sec_, _ = sections.pop(-1) | sec_, _ = sections.pop(-1) | ||||
| sections.append((sec_+"\n"+sec, "")) | |||||
| sections.append((sec_ + "\n" + sec, "")) | |||||
| else: | else: | ||||
| sections.append((sec, "")) | sections.append((sec, "")) | ||||
| class Pdf(PdfParser): | class Pdf(PdfParser): | ||||
| def __call__(self, filename, binary=None, from_page=0, | def __call__(self, filename, binary=None, from_page=0, | ||||
| to_page=100000, zoomin=3, callback=None): | to_page=100000, zoomin=3, callback=None): | ||||
| callback(msg="OCR is running...") | |||||
| from timeit import default_timer as timer | |||||
| start = timer() | |||||
| callback(msg="OCR started") | |||||
| self.__images__( | self.__images__( | ||||
| filename if not binary else binary, | filename if not binary else binary, | ||||
| zoomin, | zoomin, | ||||
| to_page, | to_page, | ||||
| callback | callback | ||||
| ) | ) | ||||
| callback(msg="OCR finished") | |||||
| callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) | |||||
| from timeit import default_timer as timer | |||||
| start = timer() | start = timer() | ||||
| self._layouts_rec(zoomin, drop=False) | self._layouts_rec(zoomin, drop=False) | ||||
| callback(0.63, "Layout analysis finished.") | |||||
| callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) | |||||
| logging.debug("layouts cost: {}s".format(timer() - start)) | logging.debug("layouts cost: {}s".format(timer() - start)) | ||||
| start = timer() | |||||
| self._table_transformer_job(zoomin) | self._table_transformer_job(zoomin) | ||||
| callback(0.65, "Table analysis finished.") | |||||
| callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start)) | |||||
| start = timer() | |||||
| self._text_merge() | self._text_merge() | ||||
| callback(0.67, "Text merging finished") | |||||
| callback(0.67, "Text merged ({:.2f}s)".format(timer() - start)) | |||||
| tbls = self._extract_table_figure(True, zoomin, True, True) | tbls = self._extract_table_figure(True, zoomin, True, True) | ||||
| self._concat_downward() | self._concat_downward() | ||||
| def __call__(self, filename, binary=None, from_page=0, | def __call__(self, filename, binary=None, from_page=0, | ||||
| to_page=100000, zoomin=3, callback=None): | to_page=100000, zoomin=3, callback=None): | ||||
| callback(msg="OCR is running...") | |||||
| from timeit import default_timer as timer | |||||
| start = timer() | |||||
| callback(msg="OCR started") | |||||
| self.__images__( | self.__images__( | ||||
| filename if not binary else binary, | filename if not binary else binary, | ||||
| zoomin, | zoomin, | ||||
| to_page, | to_page, | ||||
| callback | callback | ||||
| ) | ) | ||||
| callback(msg="OCR finished.") | |||||
| callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) | |||||
| from timeit import default_timer as timer | |||||
| start = timer() | start = timer() | ||||
| self._layouts_rec(zoomin) | self._layouts_rec(zoomin) | ||||
| callback(0.63, "Layout analysis finished") | |||||
| callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) | |||||
| logging.debug(f"layouts cost: {timer() - start}s") | logging.debug(f"layouts cost: {timer() - start}s") | ||||
| start = timer() | |||||
| self._table_transformer_job(zoomin) | self._table_transformer_job(zoomin) | ||||
| callback(0.68, "Table analysis finished") | |||||
| callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start)) | |||||
| start = timer() | |||||
| self._text_merge() | self._text_merge() | ||||
| tbls = self._extract_table_figure(True, zoomin, True, True) | tbls = self._extract_table_figure(True, zoomin, True, True) | ||||
| column_width = np.median([b["x1"] - b["x0"] for b in self.boxes]) | column_width = np.median([b["x1"] - b["x0"] for b in self.boxes]) | ||||
| self._concat_downward() | self._concat_downward() | ||||
| self._filter_forpages() | self._filter_forpages() | ||||
| callback(0.75, "Text merging finished.") | |||||
| callback(0.75, "Text merged ({:.2f}s)".format(timer() - start)) | |||||
| # clean mess | # clean mess | ||||
| if column_width < self.page_images[0].size[0] / zoomin / 2: | if column_width < self.page_images[0].size[0] / zoomin / 2: | 
| def __call__(self, filename, binary=None, from_page=0, | def __call__(self, filename, binary=None, from_page=0, | ||||
| to_page=100000, zoomin=3, callback=None): | to_page=100000, zoomin=3, callback=None): | ||||
| callback(msg="OCR is running...") | |||||
| from timeit import default_timer as timer | |||||
| start = timer() | |||||
| callback(msg="OCR started") | |||||
| self.__images__(filename if not binary else binary, | self.__images__(filename if not binary else binary, | ||||
| zoomin, from_page, to_page, callback) | zoomin, from_page, to_page, callback) | ||||
| callback(0.8, "Page {}~{}: OCR finished".format( | |||||
| from_page, min(to_page, self.total_page))) | |||||
| callback(msg="Page {}~{}: OCR finished ({:.2f}s)".format(from_page, min(to_page, self.total_page), timer() - start)) | |||||
| assert len(self.boxes) == len(self.page_images), "{} vs. {}".format( | assert len(self.boxes) == len(self.page_images), "{} vs. {}".format( | ||||
| len(self.boxes), len(self.page_images)) | len(self.boxes), len(self.page_images)) | ||||
| res = [] | res = [] | 
| def __call__(self, filename, binary=None, from_page=0, | def __call__(self, filename, binary=None, from_page=0, | ||||
| to_page=100000, zoomin=3, callback=None): | to_page=100000, zoomin=3, callback=None): | ||||
| start = timer() | start = timer() | ||||
| callback(msg="OCR is running...") | |||||
| callback(msg="OCR started") | |||||
| self.__images__( | self.__images__( | ||||
| filename if not binary else binary, | filename if not binary else binary, | ||||
| zoomin, | zoomin, | ||||
| to_page, | to_page, | ||||
| callback | callback | ||||
| ) | ) | ||||
| callback(msg="OCR finished") | |||||
| logging.debug("OCR({}~{}): {}".format(from_page, to_page, timer() - start)) | |||||
| callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) | |||||
| logging.debug("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start)) | |||||
| start = timer() | start = timer() | ||||
| self._layouts_rec(zoomin, drop=False) | self._layouts_rec(zoomin, drop=False) | ||||
| callback(0.63, "Layout analysis finished.") | |||||
| callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) | |||||
| start = timer() | |||||
| self._table_transformer_job(zoomin) | self._table_transformer_job(zoomin) | ||||
| callback(0.65, "Table analysis finished.") | |||||
| callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start)) | |||||
| start = timer() | |||||
| self._text_merge() | self._text_merge() | ||||
| callback(0.67, "Text merging finished") | |||||
| callback(0.67, "Text merged ({:.2f}s)".format(timer() - start)) | |||||
| tbls = self._extract_table_figure(True, zoomin, True, True) | tbls = self._extract_table_figure(True, zoomin, True, True) | ||||
| #self._naive_vertical_merge() | #self._naive_vertical_merge() | ||||
| # self._concat_downward() | # self._concat_downward() | ||||
| sum_question = '\n'.join(question_stack) | sum_question = '\n'.join(question_stack) | ||||
| if sum_question: | if sum_question: | ||||
| qai_list.append((sum_question, last_answer, last_image)) | qai_list.append((sum_question, last_answer, last_image)) | ||||
| tbls = [] | tbls = [] | ||||
| for tb in self.doc.tables: | for tb in self.doc.tables: | ||||
| html= "<table>" | html= "<table>" | 
| # limitations under the License. | # limitations under the License. | ||||
| # | # | ||||
| import os | import os | ||||
| import logging | |||||
| from api.utils import get_base_config, decrypt_database_config | from api.utils import get_base_config, decrypt_database_config | ||||
| from api.utils.file_utils import get_project_base_directory | from api.utils.file_utils import get_project_base_directory | ||||
| SVR_QUEUE_MAX_LEN = 1024 | SVR_QUEUE_MAX_LEN = 1024 | ||||
| SVR_CONSUMER_NAME = "rag_flow_svr_consumer" | SVR_CONSUMER_NAME = "rag_flow_svr_consumer" | ||||
| SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_consumer_group" | SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_consumer_group" | ||||
| def print_rag_settings(): | |||||
| logging.info(f"MAX_CONTENT_LENGTH: {DOC_MAXIMUM_SIZE}") | |||||
| logging.info(f"SERVER_QUEUE_MAX_LEN: {SVR_QUEUE_MAX_LEN}") | |||||
| logging.info(f"SERVER_QUEUE_RETENTION: {SVR_QUEUE_RETENTION}") | |||||
| logging.info(f"MAX_FILE_COUNT_PER_USER: {int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))}") | 
| from api.db.services.task_service import TaskService | from api.db.services.task_service import TaskService | ||||
| from api.db.services.file2document_service import File2DocumentService | from api.db.services.file2document_service import File2DocumentService | ||||
| from api import settings | from api import settings | ||||
| from api.versions import get_ragflow_version | |||||
| from api.db.db_models import close_connection | from api.db.db_models import close_connection | ||||
| from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, \ | from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, \ | ||||
| knowledge_graph, email | knowledge_graph, email | ||||
| from rag.nlp import search, rag_tokenizer | from rag.nlp import search, rag_tokenizer | ||||
| from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor | from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor | ||||
| from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME | |||||
| from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME, print_rag_settings | |||||
| from rag.utils import rmSpace, num_tokens_from_string | from rag.utils import rmSpace, num_tokens_from_string | ||||
| from rag.utils.redis_conn import REDIS_CONN, Payload | from rag.utils.redis_conn import REDIS_CONN, Payload | ||||
| from rag.utils.storage_factory import STORAGE_IMPL | from rag.utils.storage_factory import STORAGE_IMPL | ||||
| # TODO: exception handler | # TODO: exception handler | ||||
| ## set_progress(r["did"], -1, "ERROR: ") | ## set_progress(r["did"], -1, "ERROR: ") | ||||
| callback( | callback( | ||||
| msg="Finished slicing files ({} chunks in {:.2f}s). Start to embedding the content.".format(len(cks), | |||||
| timer() - st) | |||||
| msg="Generate {} chunks ({:.2f}s). Embedding chunks.".format(len(cks), timer() - st) | |||||
| ) | ) | ||||
| st = timer() | st = timer() | ||||
| try: | try: | ||||
| tk_count = 0 | tk_count = 0 | ||||
| raise | raise | ||||
| logging.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st)) | logging.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st)) | ||||
| callback(msg="Finished embedding (in {:.2f}s)! Start to build index!".format(timer() - st)) | |||||
| callback(msg="Finished embedding ({:.2f}s)!".format(timer() - st)) | |||||
| # logging.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}") | # logging.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}") | ||||
| init_kb(r, vector_size) | init_kb(r, vector_size) | ||||
| chunk_count = len(set([c["id"] for c in cks])) | chunk_count = len(set([c["id"] for c in cks])) | ||||
| callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="") | callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="") | ||||
| logging.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st)) | logging.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st)) | ||||
| if es_r: | if es_r: | ||||
| callback(-1, "Insert chunk error, detail info please check log file. Please also check Elasticsearch/Infinity status!") | |||||
| callback(-1, | |||||
| "Insert chunk error, detail info please check log file. Please also check Elasticsearch/Infinity status!") | |||||
| settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"]) | settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"]) | ||||
| logging.error('Insert chunk error: ' + str(es_r)) | logging.error('Insert chunk error: ' + str(es_r)) | ||||
| raise Exception('Insert chunk error: ' + str(es_r)) | raise Exception('Insert chunk error: ' + str(es_r)) | ||||
| settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"]) | settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"]) | ||||
| return | return | ||||
| callback(msg="Indexing elapsed in {:.2f}s.".format(timer() - st)) | |||||
| callback(1., "Done!") | |||||
| callback(1., msg="Index cost {:.2f}s.".format(timer() - st)) | |||||
| DocumentService.increment_chunk_num( | DocumentService.increment_chunk_num( | ||||
| r["doc_id"], r["kb_id"], tk_count, chunk_count, 0) | r["doc_id"], r["kb_id"], tk_count, chunk_count, 0) | ||||
| logging.info( | logging.info( | ||||
| "Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format( | "Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format( | ||||
| r["id"], tk_count, len(cks), timer() - st)) | |||||
| r["id"], tk_count, len(cks), timer() - st)) | |||||
| def handle_task(): | def handle_task(): | ||||
| for stat in stats2[:10]: | for stat in stats2[:10]: | ||||
| msg += f"{stat}\n" | msg += f"{stat}\n" | ||||
| stats1_vs_2 = snapshot2.compare_to(snapshot1, 'lineno') | stats1_vs_2 = snapshot2.compare_to(snapshot1, 'lineno') | ||||
| msg += f"{CONSUMER_NAME} memory usage increase from snapshot {snapshot_id-1} to snapshot {snapshot_id}:\n" | |||||
| msg += f"{CONSUMER_NAME} memory usage increase from snapshot {snapshot_id - 1} to snapshot {snapshot_id}:\n" | |||||
| for stat in stats1_vs_2[:10]: | for stat in stats1_vs_2[:10]: | ||||
| msg += f"{stat}\n" | msg += f"{stat}\n" | ||||
| msg += f"{CONSUMER_NAME} detailed traceback for the top memory consumers:\n" | msg += f"{CONSUMER_NAME} detailed traceback for the top memory consumers:\n" | ||||
| def main(): | def main(): | ||||
| logging.info(r""" | |||||
| ______ __ ______ __ | |||||
| /_ __/___ ______/ /__ / ____/ _____ _______ __/ /_____ _____ | |||||
| / / / __ `/ ___/ //_/ / __/ | |/_/ _ \/ ___/ / / / __/ __ \/ ___/ | |||||
| / / / /_/ (__ ) ,< / /____> </ __/ /__/ /_/ / /_/ /_/ / / | |||||
| /_/ \__,_/____/_/|_| /_____/_/|_|\___/\___/\__,_/\__/\____/_/ | |||||
| """) | |||||
| logging.info(f'TaskExecutor: RAGFlow version: {get_ragflow_version()}') | |||||
| settings.init_settings() | settings.init_settings() | ||||
| print_rag_settings() | |||||
| background_thread = threading.Thread(target=report_status) | background_thread = threading.Thread(target=report_status) | ||||
| background_thread.daemon = True | background_thread.daemon = True | ||||
| background_thread.start() | background_thread.start() | ||||
| while True: | while True: | ||||
| handle_task() | handle_task() | ||||
| num_tasks = DONE_TASKS + FAILED_TASKS | num_tasks = DONE_TASKS + FAILED_TASKS | ||||
| if TRACE_MALLOC_DELTA> 0 and num_tasks > 0 and num_tasks % TRACE_MALLOC_DELTA == 0: | |||||
| if TRACE_MALLOC_DELTA > 0 and num_tasks > 0 and num_tasks % TRACE_MALLOC_DELTA == 0: | |||||
| snapshot2 = tracemalloc.take_snapshot() | snapshot2 = tracemalloc.take_snapshot() | ||||
| analyze_heap(snapshot1, snapshot2, int(num_tasks/TRACE_MALLOC_DELTA), num_tasks % TRACE_MALLOC_FULL == 0) | |||||
| analyze_heap(snapshot1, snapshot2, int(num_tasks / TRACE_MALLOC_DELTA), num_tasks % TRACE_MALLOC_FULL == 0) | |||||
| snapshot1 = snapshot2 | snapshot1 = snapshot2 | ||||
| snapshot2 = None | snapshot2 = None | ||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||
| main() | main() |