您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

html_parser.py 815B

123456789101112131415161718192021222324252627
  1. # -*- coding: utf-8 -*-
  2. from rag.nlp import find_codec
  3. import readability
  4. import html_text
  5. import chardet
  6. def get_encoding(file):
  7. with open(file,'rb') as f:
  8. tmp = chardet.detect(f.read())
  9. return tmp['encoding']
  10. class RAGFlowHtmlParser:
  11. def __call__(self, fnm, binary=None):
  12. txt = ""
  13. if binary:
  14. encoding = find_codec(binary)
  15. txt = binary.decode(encoding, errors="ignore")
  16. else:
  17. with open(fnm, "r",encoding=get_encoding(fnm)) as f:
  18. txt = f.read()
  19. html_doc = readability.Document(txt)
  20. title = html_doc.title()
  21. content = html_text.extract_text(html_doc.summary(html_partial=True))
  22. txt = f'{title}\n{content}'
  23. sections = txt.split("\n")
  24. return sections