### What problem does this PR solve?

#1594

### Type of change

- [x] Refactoring
```diff
@@ -21,6 +21,7 @@ from typing import List
 import networkx as nx
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
+from api.db.services.user_service import TenantService
 from graphrag.community_reports_extractor import CommunityReportsExtractor
 from graphrag.entity_resolution import EntityResolution
 from graphrag.graph_extractor import GraphExtractor
```
```diff
@@ -30,6 +31,11 @@ from rag.utils import num_tokens_from_string
 def be_children(obj: dict, keyset:set):
     if isinstance(obj, str):
         obj = [obj]
+    if isinstance(obj, list):
+        for i in obj: keyset.add(i)
+        return [{"id": i, "children":[]} for i in obj]
     arr = []
     for k,v in obj.items():
         k = re.sub(r"\*+", "", k)
```
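The new `isinstance(obj, list)` branch short-circuits the recursion at the leaves: list items become child nodes with empty `children`, and every item is recorded in `keyset`. A minimal standalone sketch of the behavior — the hunk truncates the dict branch, so its tail (the `keyset.add(k)` and the recursive append) is an assumption about the surrounding code, not part of this diff:

```python
import re

def be_children(obj, keyset: set):
    # A bare string leaf is promoted to a one-element list.
    if isinstance(obj, str):
        obj = [obj]
    # New in this PR: a list becomes leaf nodes with empty children,
    # and every item is recorded in keyset.
    if isinstance(obj, list):
        for i in obj:
            keyset.add(i)
        return [{"id": i, "children": []} for i in obj]
    # Dict branch (tail assumed): each key becomes a node whose
    # children are built recursively from its value.
    arr = []
    for k, v in obj.items():
        k = re.sub(r"\*+", "", k)  # strip markdown bold markers from keys
        keyset.add(k)              # assumed, mirroring the list branch
        arr.append({"id": k, "children": be_children(v, keyset)})
    return arr

keys = set()
print(be_children({"**Topic**": ["a", "b"]}, keys))
# [{'id': 'Topic', 'children': [{'id': 'a', 'children': []}, {'id': 'b', 'children': []}]}]
```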
```diff
@@ -65,7 +71,8 @@ def graph_merge(g1, g2):
 def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, entity_types=["organization", "person", "location", "event", "time"]):
-    llm_bdl = LLMBundle(tenant_id, LLMType.CHAT)
+    _, tenant = TenantService.get_by_id(tenant_id)
+    llm_bdl = LLMBundle(tenant_id, LLMType.CHAT, tenant.llm_id)
     ext = GraphExtractor(llm_bdl)
     left_token_count = llm_bdl.max_length - ext.prompt_token_count - 1024
     left_token_count = max(llm_bdl.max_length * 0.8, left_token_count)
```
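The change here is that the chat bundle is now built with the tenant's configured default model (`tenant.llm_id`) rather than whatever `LLMBundle` falls back to. The per-chunk token budget is unchanged; with hypothetical numbers for illustration (not from the PR), the arithmetic works out as:

```python
# Hypothetical sizes, for illustration only:
max_length = 8192          # model context window, in tokens
prompt_token_count = 2000  # tokens consumed by the extraction prompt

left_token_count = max_length - prompt_token_count - 1024   # 5168
left_token_count = max(max_length * 0.8, left_token_count)  # max(6553.6, 5168) -> 6553.6
print(left_token_count)
```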
```diff
@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import collections
 import logging
+import re
 import traceback
 from concurrent.futures import ThreadPoolExecutor
```
```diff
@@ -65,7 +67,7 @@ class MindMapExtractor:
         try:
             exe = ThreadPoolExecutor(max_workers=12)
             threads = []
-            token_count = self._llm.max_length * 0.7
+            token_count = max(self._llm.max_length * 0.8, self._llm.max_length-512)
             texts = []
             res = []
             cnt = 0
```
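The old budget reserved a flat 30% of the context window; the new one takes the larger of an 80% share and `max_length - 512`, so long-context models keep only a fixed 512-token headroom. Comparing the two formulas on hypothetical window sizes:

```python
for max_length in (2048, 32768):
    old = max_length * 0.7
    new = max(max_length * 0.8, max_length - 512)
    print(max_length, old, new)
# 2048:  old 1433.6,  new 1638.4   (the 80% share wins on small windows)
# 32768: old 22937.6, new 32256    (the 512-token headroom wins on large windows)
```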
```diff
@@ -122,6 +124,19 @@ class MindMapExtractor:
                 continue
         return data
 
+    def _todict(self, layer:collections.OrderedDict):
+        to_ret = layer
+        if isinstance(layer, collections.OrderedDict):
+            to_ret = dict(layer)
+
+        try:
+            for key, value in to_ret.items():
+                to_ret[key] = self._todict(value)
+        except AttributeError:
+            pass
+
+        return self._list_to_kv(to_ret)
+
     def _process_document(
             self, text: str, prompt_variables: dict[str, str]
     ) -> str:
```
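`markdown_to_json.dictify` returns nested `collections.OrderedDict`s; `_todict` unwraps them into plain dicts before handing the result to `_list_to_kv`, an existing helper this hunk does not show. A standalone sketch with `_list_to_kv` stubbed as the identity:

```python
import collections

def _list_to_kv(obj):
    # Stand-in for the real helper, which this diff doesn't show.
    return obj

def _todict(layer):
    to_ret = layer
    if isinstance(layer, collections.OrderedDict):
        to_ret = dict(layer)
    try:
        for key, value in to_ret.items():
            to_ret[key] = _todict(value)
    except AttributeError:
        # Strings and lists have no .items(); leave them untouched.
        pass
    return _list_to_kv(to_ret)

nested = collections.OrderedDict(
    [("Title", collections.OrderedDict([("Section", ["a", "b"])]))]
)
print(_todict(nested))  # {'Title': {'Section': ['a', 'b']}}
```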
````diff
@@ -132,6 +147,7 @@ class MindMapExtractor:
         text = perform_variable_replacements(self._mind_map_prompt, variables=variables)
         gen_conf = {"temperature": 0.5}
         response = self._llm.chat(text, [], gen_conf)
+        response = re.sub(r"```[^\n]*", "", response)
         print(response)
-        print("---------------------------------------------------\n", markdown_to_json.dictify(response))
-        return dict(markdown_to_json.dictify(response))
+        print("---------------------------------------------------\n", self._todict(markdown_to_json.dictify(response)))
+        return self._todict(markdown_to_json.dictify(response))
````
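The added `re.sub` matters because chat models often wrap their markdown answer in a code fence, which would otherwise confuse the markdown parser. Roughly what the post-processing does, assuming the `markdown_to_json` package is installed:

````python
import re
import markdown_to_json

# A typical reply that wraps the mind map in a fenced block.
response = "```markdown\n## Title\n- Section A\n- Section B\n```"

# Strip any ``` fence lines (with optional language tag) before parsing.
response = re.sub(r"```[^\n]*", "", response)

# dictify() parses the markdown outline into nested OrderedDicts.
print(markdown_to_json.dictify(response))
````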
```diff
@@ -14,28 +14,20 @@
 # limitations under the License.
 #
 MIND_MAP_EXTRACTION_PROMPT = """
-- Role: You're a talented text processor.
+- Role: You're a talented text processor to summarize a piece of text into a mind map.
 
 - Step of task:
   1. Generate a title for user's 'TEXT'。
-  2. Classify the 'TEXT' into sections as you see fit.
-  3. If the subject matter is really complex, split them into sub-sections.
+  2. Classify the 'TEXT' into sections of a mind map.
+  3. If the subject matter is really complex, split them into sub-sections and sub-subsections.
+  4. Add a short content summary of the bottom level section.
 
 - Output requirement:
-  - Always try to maximize the number of sub-sections.
   - In language of 'Text'
   - MUST IN FORMAT OF MARKDOWN
 
-Output:
-## <Title>
-<Section Name>
-<Section Name>
-<Subsection Name>
-<Subsection Name>
-<Section Name>
-<Subsection Name>
 
 -TEXT-
 {input_text}
 
 Output:
```