You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

html_parser.py 1.5KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546
  1. # -*- coding: utf-8 -*-
  2. # Licensed under the Apache License, Version 2.0 (the "License");
  3. # you may not use this file except in compliance with the License.
  4. # You may obtain a copy of the License at
  5. #
  6. # http://www.apache.org/licenses/LICENSE-2.0
  7. #
  8. # Unless required by applicable law or agreed to in writing, software
  9. # distributed under the License is distributed on an "AS IS" BASIS,
  10. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11. # See the License for the specific language governing permissions and
  12. # limitations under the License.
  13. #
  14. from rag.nlp import find_codec
  15. import readability
  16. import html_text
  17. import chardet
  18. def get_encoding(file):
  19. with open(file,'rb') as f:
  20. tmp = chardet.detect(f.read())
  21. return tmp['encoding']
  22. class RAGFlowHtmlParser:
  23. def __call__(self, fnm, binary=None):
  24. txt = ""
  25. if binary:
  26. encoding = find_codec(binary)
  27. txt = binary.decode(encoding, errors="ignore")
  28. else:
  29. with open(fnm, "r",encoding=get_encoding(fnm)) as f:
  30. txt = f.read()
  31. return self.parser_txt(txt)
  32. @classmethod
  33. def parser_txt(cls, txt):
  34. if not isinstance(txt, str):
  35. raise TypeError("txt type should be str!")
  36. html_doc = readability.Document(txt)
  37. title = html_doc.title()
  38. content = html_text.extract_text(html_doc.summary(html_partial=True))
  39. txt = f"{title}\n{content}"
  40. sections = txt.split("\n")
  41. return sections