Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

html_parser.py 1.3KB

123456789101112131415161718192021222324252627282930313233343536373839
  1. # -*- coding: utf-8 -*-
  2. # Licensed under the Apache License, Version 2.0 (the "License");
  3. # you may not use this file except in compliance with the License.
  4. # You may obtain a copy of the License at
  5. #
  6. # http://www.apache.org/licenses/LICENSE-2.0
  7. #
  8. # Unless required by applicable law or agreed to in writing, software
  9. # distributed under the License is distributed on an "AS IS" BASIS,
  10. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11. # See the License for the specific language governing permissions and
  12. # limitations under the License.
  13. #
  14. from rag.nlp import find_codec
  15. import readability
  16. import html_text
  17. import chardet
  18. def get_encoding(file):
  19. with open(file,'rb') as f:
  20. tmp = chardet.detect(f.read())
  21. return tmp['encoding']
  22. class RAGFlowHtmlParser:
  23. def __call__(self, fnm, binary=None):
  24. txt = ""
  25. if binary:
  26. encoding = find_codec(binary)
  27. txt = binary.decode(encoding, errors="ignore")
  28. else:
  29. with open(fnm, "r",encoding=get_encoding(fnm)) as f:
  30. txt = f.read()
  31. html_doc = readability.Document(txt)
  32. title = html_doc.title()
  33. content = html_text.extract_text(html_doc.summary(html_partial=True))
  34. txt = f'{title}\n{content}'
  35. sections = txt.split("\n")
  36. return sections