You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

knowledge_graph.py 1.3KB

1234567891011121314151617181920212223242526272829303132
  1. import re
  2. from graphrag.index import build_knowledge_graph_chunks
  3. from rag.app import naive
  4. from rag.nlp import rag_tokenizer, tokenize_chunks
  5. def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
  6. lang="Chinese", callback=None, **kwargs):
  7. parser_config = kwargs.get(
  8. "parser_config", {
  9. "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": True})
  10. eng = lang.lower() == "english"
  11. parser_config["layout_recognize"] = True
  12. sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True,
  13. parser_config=parser_config, callback=callback)
  14. chunks = build_knowledge_graph_chunks(tenant_id, sections, callback,
  15. parser_config.get("entity_types", ["organization", "person", "location", "event", "time"])
  16. )
  17. for c in chunks:
  18. c["docnm_kwd"] = filename
  19. doc = {
  20. "docnm_kwd": filename,
  21. "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
  22. "knowledge_graph_kwd": "text"
  23. }
  24. doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
  25. chunks.extend(tokenize_chunks(sections, doc, eng))
  26. return chunks