
knowledge_graph.py (1.9 KB)

#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import re

from graphrag.index import build_knowledge_graph_chunks
from rag.app import naive
from rag.nlp import rag_tokenizer, tokenize_chunks


def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    parser_config = kwargs.get(
        "parser_config", {
            "chunk_token_num": 512, "delimiter": "\n!?;。;!?", "layout_recognize": True})
    eng = lang.lower() == "english"

    # Layout recognition is always enabled for knowledge-graph parsing.
    parser_config["layout_recognize"] = True

    # Reuse the naive parser to split the document into raw sections only
    # (no tokenization into final chunks yet).
    sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True,
                           parser_config=parser_config, callback=callback)

    # Build knowledge-graph chunks from the sections for the configured entity types.
    chunks = build_knowledge_graph_chunks(tenant_id, sections, callback,
                                          parser_config.get("entity_types",
                                                            ["organization", "person", "location", "event", "time"]))
    # Tag every graph chunk with the source document name.
    for c in chunks:
        c["docnm_kwd"] = filename

    # Base document fields shared by the plain-text chunks.
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
        "knowledge_graph_kwd": "text"
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])

    # Append tokenized plain-text chunks for the original sections alongside the graph chunks.
    chunks.extend(tokenize_chunks(sections, doc, eng))

    return chunks
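
A minimal usage sketch follows, assuming the RAGFlow package layout shown above is importable. The module path, file name, tenant id, and progress callback are hypothetical placeholders; in the real pipeline these are supplied by the task executor rather than called by hand.

# Hypothetical caller of chunk(); the module path, tenant id and progress hook are assumptions.
from rag.app.knowledge_graph import chunk  # assumed import path for the file above


def progress(prog=None, msg=""):
    # Minimal progress hook: the callback receives progress values and status messages.
    print(prog, msg)


with open("annual_report.pdf", "rb") as f:
    chunks = chunk(
        "annual_report.pdf",
        f.read(),
        tenant_id="tenant-0001",   # placeholder tenant id
        lang="English",
        callback=progress,
        parser_config={
            "chunk_token_num": 512,
            "delimiter": "\n!?;。;!?",
            "layout_recognize": True,
            # Optional override of the default entity types.
            "entity_types": ["organization", "person", "location", "event", "time"],
        },
    )
# The result mixes knowledge-graph chunks and tokenized plain-text chunks,
# all tagged with docnm_kwd == "annual_report.pdf".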