| 1234567891011121314151617181920212223242526272829303132333435363738394041424344 | 
							- # -*- coding: utf-8 -*-
 - #  Licensed under the Apache License, Version 2.0 (the "License");
 - #  you may not use this file except in compliance with the License.
 - #  You may obtain a copy of the License at
 - #
 - #      http://www.apache.org/licenses/LICENSE-2.0
 - #
 - #  Unless required by applicable law or agreed to in writing, software
 - #  distributed under the License is distributed on an "AS IS" BASIS,
 - #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 - #  See the License for the specific language governing permissions and
 - #  limitations under the License.
 - #
 - from rag.nlp import find_codec
 - import readability
 - import html_text
 - import chardet
 - 
 - def get_encoding(file):
 -     with open(file,'rb') as f:
 -         tmp = chardet.detect(f.read())
 -         return tmp['encoding']
 - 
 - class RAGFlowHtmlParser:
 -     def __call__(self, fnm, binary=None):
 -         txt = ""
 -         if binary:
 -             encoding = find_codec(binary)
 -             txt = binary.decode(encoding, errors="ignore")
 -         else:
 -             with open(fnm, "r",encoding=get_encoding(fnm)) as f:
 -                 txt = f.read()
 -         return self.parser_txt(txt)
 - 
 -     @classmethod
 -     def parser_txt(cls, txt):
 -         if type(txt) != str:
 -             raise TypeError("txt type should be str!")
 -         html_doc = readability.Document(txt)
 -         title = html_doc.title()
 -         content = html_text.extract_text(html_doc.summary(html_partial=True))
 -         txt = f"{title}\n{content}"
 -         sections = txt.split("\n")
 -         return sections
 
 
  |