| 123456789101112131415161718192021222324252627 | 
							- # -*- coding: utf-8 -*-
 - from rag.nlp import find_codec
 - import readability
 - import html_text
 - import chardet
 - 
 - def get_encoding(file):
 -     with open(file,'rb') as f:
 -         tmp = chardet.detect(f.read())
 -         return tmp['encoding']
 -     
 - class RAGFlowHtmlParser:
 -     def __call__(self, fnm, binary=None):
 -         txt = ""
 -         if binary:
 -             encoding = find_codec(binary)
 -             txt = binary.decode(encoding, errors="ignore")
 -         else:
 -             with open(fnm, "r",encoding=get_encoding(fnm)) as f:
 -                 txt = f.read()
 -             
 -         html_doc = readability.Document(txt)
 -         title = html_doc.title()
 -         content = html_text.extract_text(html_doc.summary(html_partial=True))
 -         txt = f'{title}\n{content}'
 -         sections = txt.split("\n")
 -         return sections
 
 
  |