### What problem does this PR solve?

#1594

### Type of change

- [x] Refactoring

tags/v0.9.0
| import networkx as nx | import networkx as nx | ||||
| from api.db import LLMType | from api.db import LLMType | ||||
| from api.db.services.llm_service import LLMBundle | from api.db.services.llm_service import LLMBundle | ||||
| from api.db.services.user_service import TenantService | |||||
| from graphrag.community_reports_extractor import CommunityReportsExtractor | from graphrag.community_reports_extractor import CommunityReportsExtractor | ||||
| from graphrag.entity_resolution import EntityResolution | from graphrag.entity_resolution import EntityResolution | ||||
| from graphrag.graph_extractor import GraphExtractor | from graphrag.graph_extractor import GraphExtractor | ||||
| def be_children(obj: dict, keyset:set): | def be_children(obj: dict, keyset:set): | ||||
| if isinstance(obj, str): | |||||
| obj = [obj] | |||||
| if isinstance(obj, list): | |||||
| for i in obj: keyset.add(i) | |||||
| return [{"id": i, "children":[]} for i in obj] | |||||
| arr = [] | arr = [] | ||||
| for k,v in obj.items(): | for k,v in obj.items(): | ||||
| k = re.sub(r"\*+", "", k) | k = re.sub(r"\*+", "", k) | ||||
| def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, entity_types=["organization", "person", "location", "event", "time"]): | def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, entity_types=["organization", "person", "location", "event", "time"]): | ||||
| llm_bdl = LLMBundle(tenant_id, LLMType.CHAT) | |||||
| _, tenant = TenantService.get_by_id(tenant_id) | |||||
| llm_bdl = LLMBundle(tenant_id, LLMType.CHAT, tenant.llm_id) | |||||
| ext = GraphExtractor(llm_bdl) | ext = GraphExtractor(llm_bdl) | ||||
| left_token_count = llm_bdl.max_length - ext.prompt_token_count - 1024 | left_token_count = llm_bdl.max_length - ext.prompt_token_count - 1024 | ||||
| left_token_count = max(llm_bdl.max_length * 0.8, left_token_count) | left_token_count = max(llm_bdl.max_length * 0.8, left_token_count) |
| # See the License for the specific language governing permissions and | # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | # limitations under the License. | ||||
| # | # | ||||
| import collections | |||||
| import logging | |||||
| import re | |||||
| import logging | import logging | ||||
| import traceback | import traceback | ||||
| from concurrent.futures import ThreadPoolExecutor | from concurrent.futures import ThreadPoolExecutor | ||||
| try: | try: | ||||
| exe = ThreadPoolExecutor(max_workers=12) | exe = ThreadPoolExecutor(max_workers=12) | ||||
| threads = [] | threads = [] | ||||
| token_count = self._llm.max_length * 0.7 | |||||
| token_count = max(self._llm.max_length * 0.8, self._llm.max_length-512) | |||||
| texts = [] | texts = [] | ||||
| res = [] | res = [] | ||||
| cnt = 0 | cnt = 0 | ||||
| continue | continue | ||||
| return data | return data | ||||
| def _todict(self, layer:collections.OrderedDict): | |||||
| to_ret = layer | |||||
| if isinstance(layer, collections.OrderedDict): | |||||
| to_ret = dict(layer) | |||||
| try: | |||||
| for key, value in to_ret.items(): | |||||
| to_ret[key] = self._todict(value) | |||||
| except AttributeError: | |||||
| pass | |||||
| return self._list_to_kv(to_ret) | |||||
| def _process_document( | def _process_document( | ||||
| self, text: str, prompt_variables: dict[str, str] | self, text: str, prompt_variables: dict[str, str] | ||||
| ) -> str: | ) -> str: | ||||
| text = perform_variable_replacements(self._mind_map_prompt, variables=variables) | text = perform_variable_replacements(self._mind_map_prompt, variables=variables) | ||||
| gen_conf = {"temperature": 0.5} | gen_conf = {"temperature": 0.5} | ||||
| response = self._llm.chat(text, [], gen_conf) | response = self._llm.chat(text, [], gen_conf) | ||||
| response = re.sub(r"```[^\n]*", "", response) | |||||
| print(response) | print(response) | ||||
| print("---------------------------------------------------\n", markdown_to_json.dictify(response)) | |||||
| return dict(markdown_to_json.dictify(response)) | |||||
| print("---------------------------------------------------\n", self._todict(markdown_to_json.dictify(response))) | |||||
| return self._todict(markdown_to_json.dictify(response)) |
| # limitations under the License. | # limitations under the License. | ||||
| # | # | ||||
| MIND_MAP_EXTRACTION_PROMPT = """ | MIND_MAP_EXTRACTION_PROMPT = """ | ||||
| - Role: You're a talent text processor. | |||||
| - Role: You're a talent text processor to summarize a piece of text into a mind map. | |||||
| - Step of task: | |||||
| 1. Generate a title for user's 'TEXT'。 | |||||
| 2. Classify the 'TEXT' into sections as you see fit. | |||||
| 3. If the subject matter is really complex, split them into sub-sections. | |||||
| - Step of task: | |||||
| 1. Generate a title for user's 'TEXT'。 | |||||
| 2. Classify the 'TEXT' into sections of a mind map. | |||||
| 3. If the subject matter is really complex, split them into sub-sections and sub-subsections. | |||||
| 4. Add a shot content summary of the bottom level section. | |||||
| - Output requirement: | |||||
| - Always try to maximize the number of sub-sections. | |||||
| - In language of 'Text' | |||||
| - MUST IN FORMAT OF MARKDOWN | |||||
| - Output requirement: | |||||
| - In language of | |||||
| - MUST IN FORMAT OF MARKDOWN | |||||
| Output: | |||||
| ## <Title> | |||||
| <Section Name> | |||||
| <Section Name> | |||||
| <Subsection Name> | |||||
| <Subsection Name> | |||||
| <Section Name> | |||||
| <Subsection Name> | |||||
| -TEXT- | -TEXT- | ||||
| {input_text} | {input_text} | ||||
| Output: | |||||
| """ | """ |