### What problem does this PR solve?

#1594

### Type of change

- [x] Refactoring

1 年之前 · 3fd7db40ea
--- a/graphrag/index.py
+++ b/graphrag/index.py
 import networkx as nx
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
 from api.db.services.user_service import TenantService
 from graphrag.community_reports_extractor import CommunityReportsExtractor
 from graphrag.entity_resolution import EntityResolution
 from graphrag.graph_extractor import GraphExtractor
 def be_children(obj: dict, keyset:set):
    if isinstance(obj, str):
        obj = [obj]
    if isinstance(obj, list):
        for i in obj: keyset.add(i)
        return [{"id": i, "children":[]} for i in obj]
    arr = []
    for k,v in obj.items():
        k = re.sub(r"\*+", "", k)
 def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, entity_types=["organization", "person", "location", "event", "time"]):
    llm_bdl = LLMBundle(tenant_id, LLMType.CHAT)
    _, tenant = TenantService.get_by_id(tenant_id)
    llm_bdl = LLMBundle(tenant_id, LLMType.CHAT, tenant.llm_id)
    ext = GraphExtractor(llm_bdl)
    left_token_count = llm_bdl.max_length - ext.prompt_token_count - 1024
    left_token_count = max(llm_bdl.max_length * 0.8, left_token_count)
--- a/graphrag/mind_map_extractor.py
+++ b/graphrag/mind_map_extractor.py
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
 import collections
 import logging
 import re
 import logging
 import traceback
 from concurrent.futures import ThreadPoolExecutor
        try:
            exe = ThreadPoolExecutor(max_workers=12)
            threads = []
            token_count = self._llm.max_length * 0.7
            token_count = max(self._llm.max_length * 0.8, self._llm.max_length-512)
            texts = []
            res = []
            cnt = 0
                continue
        return data
    def _todict(self, layer:collections.OrderedDict):
        """Recursively turn nested ``OrderedDict`` layers into plain ``dict``s.

        Args:
            layer: One level of the parsed tree. Values of a mapping are
                converted recursively; a value without an ``items()`` method
                is treated as a leaf and passed to ``self._list_to_kv`` as is.

        Returns:
            Whatever ``self._list_to_kv`` returns for the converted layer.
        """
        # Only OrderedDicts are copied; any other object is processed in place.
        to_ret = dict(layer) if isinstance(layer, collections.OrderedDict) else layer
        try:
            items = to_ret.items()
        except AttributeError:
            # Leaf value (no ``items``): nothing to recurse into.
            return self._list_to_kv(to_ret)
        # The recursion is kept outside the ``try`` so that an AttributeError
        # raised while converting a child propagates instead of being silently
        # swallowed, which would leave the remaining siblings unconverted.
        for key, value in items:
            to_ret[key] = self._todict(value)
        return self._list_to_kv(to_ret)
    def _process_document(
            self, text: str, prompt_variables: dict[str, str]
    ) -> str:
        # Sends the filled-in mind-map prompt to the chat model and parses the
        # Markdown reply with markdown_to_json.dictify.
        # NOTE(review): the annotation says `-> str`, but both return
        # statements below return the parsed structure, not a string.
        # NOTE(review): `variables` is not assigned in the lines visible here;
        # presumably it is built from `prompt_variables` and `text` in lines
        # not shown -- confirm.
        text = perform_variable_replacements(self._mind_map_prompt, variables=variables)
        gen_conf = {"temperature": 0.5}
        response = self._llm.chat(text, [], gen_conf)
        # Remove every run of three backticks together with the rest of that
        # line (e.g. a fence's language tag) before parsing.
        response = re.sub(r"```[^\n]*", "", response)
        # Debug output written straight to stdout.
        print(response)
        print("---------------------------------------------------\n", markdown_to_json.dictify(response))
        return dict(markdown_to_json.dictify(response))
        # NOTE(review): as written, the two lines below are unreachable
        # because of the return above. They look like the newer variant that
        # post-processes the result with self._todict -- confirm which pair
        # is meant to remain.
        print("---------------------------------------------------\n", self._todict(markdown_to_json.dictify(response)))
        return self._todict(markdown_to_json.dictify(response))
--- a/graphrag/mind_map_prompt.py
+++ b/graphrag/mind_map_prompt.py
 #  limitations under the License.
 #
 # Prompt template asking the model to summarize a text as a Markdown outline.
 # It contains a single `{input_text}` placeholder.
 # NOTE(review): the role line, the "Step of task" list and the
 # "Output requirement" list each appear twice with different wording, and one
 # "In language of" line is cut short -- presumably two revisions of the prompt
 # are interleaved here; confirm which lines should remain.
 MIND_MAP_EXTRACTION_PROMPT = """
 - Role: You're a talent text processor.
 - Role: You're a talent text processor to summarize a piece of text into a mind map.
 - Step of task:
   1. Generate a title for user's 'TEXT'。
   2. Classify the 'TEXT' into sections as you see fit.
   3. If the subject matter is really complex, split them into sub-sections. 
 - Step of task:
  1. Generate a title for user's 'TEXT'。
  2. Classify the 'TEXT' into sections of a mind map.
  3. If the subject matter is really complex, split them into sub-sections and sub-subsections. 
  4. Add a shot content summary of the bottom level section.
 - Output requirement:
  - Always try to maximize the number of sub-sections. 
  - In language of 'Text'
  - MUST IN FORMAT OF MARKDOWN
 - Output requirement:
   - In language of 
   - MUST IN FORMAT OF MARKDOWN
 Output:
 ## <Title>
  <Section Name>
  <Section Name>
    <Subsection Name>
    <Subsection Name>
  <Section Name>
    <Subsection Name>
 -TEXT-
 {input_text}
 Output:
 """