### What problem does this PR solve?

Add a `get_txt` function to reduce duplicate code.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
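The parsers below all repeated the same "decode the uploaded bytes or read the file from disk" block; after this change they delegate to one shared helper. A minimal sketch of the new calling pattern (the `parse_plain_text` wrapper is illustrative only, not part of the diff):

```python
from deepdoc.parser.utils import get_txt


def parse_plain_text(filename, binary=None):
    # Shared helper: decodes in-memory bytes (encoding detected via
    # find_codec) when `binary` is given, otherwise reads the file from disk.
    txt = get_txt(filename, binary)
    # The callers touched by this PR then split on newlines and drop blanks.
    return [line for line in txt.split("\n") if line]
```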
@@ -10,28 +10,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from deepdoc.parser.utils import get_txt
+from rag.nlp import num_tokens_from_string
-from rag.nlp import find_codec,num_tokens_from_string
 import re
 class RAGFlowTxtParser:
     def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;!?"):
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(fnm, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(fnm, binary)
         return self.parser_txt(txt, chunk_token_num, delimiter)
     @classmethod
     def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
-        if type(txt) != str:
+        if not isinstance(txt, str):
             raise TypeError("txt type should be str!")
         cks = [""]
         tk_nums = [0]
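A quick, hypothetical smoke test of the refactored parser (the module path, sample bytes, and token budget are assumptions for illustration; the return value is whatever `parser_txt` produces, which this PR does not change):

```python
from deepdoc.parser.txt_parser import RAGFlowTxtParser  # assumed module path

# Passing raw bytes exercises the get_txt() branch that decodes via find_codec,
# so no file needs to exist on disk.
sample = "第一句。第二句!\nthird sentence?\n".encode("utf-8")
chunks = RAGFlowTxtParser()("example.txt", binary=sample, chunk_token_num=32)
print(chunks)
```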
@@ -0,0 +1,29 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from rag.nlp import find_codec
+
+
+def get_txt(fnm: str, binary=None) -> str:
+    txt = ""
+    if binary:
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding, errors="ignore")
+    else:
+        with open(fnm, "r") as f:
+            while True:
+                line = f.readline()
+                if not line:
+                    break
+                txt += line
+    return txt
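For clarity, a small usage sketch of the new helper (the file name and byte string below are made up); it accepts either a path or raw bytes, mirroring how the callers in the following hunks invoke it:

```python
from deepdoc.parser.utils import get_txt

# 1) From bytes already in memory: the encoding is detected with find_codec
#    and decode errors are ignored, exactly as the old inline blocks did.
text_from_bytes = get_txt("unused-name.txt", binary="你好\nworld\n".encode("utf-8"))
print(text_from_bytes.splitlines())

# 2) From a file on disk (uncomment with a real path): read line by line.
# text_from_file = get_txt("docs/sample.txt")
```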
@@ -15,6 +15,7 @@ from tika import parser
 import re
 from io import BytesIO
+from deepdoc.parser.utils import get_txt
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
     hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
     tokenize_chunks, find_codec
@@ -88,17 +89,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         sections = txt.split("\n")
         sections = [(l, "") for l in sections if l]
         remove_contents_table(sections, eng=is_english(
@@ -17,6 +17,7 @@ from io import BytesIO
 from docx import Document
 from api.db import ParserType
+from deepdoc.parser.utils import get_txt
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
     make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level
 from rag.nlp import rag_tokenizer
@@ -165,17 +166,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         sections = txt.split("\n")
         sections = [l for l in sections if l]
         callback(0.8, "Finish parsing.")
@@ -169,7 +169,6 @@ class Markdown(MarkdownParser):
         return sections, tbls
 def chunk(filename, binary=None, from_page=0, to_page=100000,
           lang="Chinese", callback=None, **kwargs):
     """
@@ -190,7 +189,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     res = []
     pdf_parser = None
-    sections = []
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections, tbls = Docx()(filename, binary)
@@ -222,13 +220,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         callback(0.1, "Start to parse.")
         excel_parser = ExcelParser()
         if parser_config.get("html4excel"):
-            sections = [(l, "") for l in excel_parser.html(binary, 12) if l]
+            sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
         else:
-            sections = [(l, "") for l in excel_parser(binary) if l]
+            sections = [(_, "") for _ in excel_parser(binary) if _]
     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        sections = TxtParser()(filename,binary,
+        sections = TxtParser()(filename, binary,
                                parser_config.get("chunk_token_num", 128),
                                parser_config.get("delimiter", "\n!?;。;!?"))
         callback(0.8, "Finish parsing.")
@@ -242,13 +240,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections = HtmlParser()(filename, binary)
-        sections = [(l, "") for l in sections if l]
+        sections = [(_, "") for _ in sections if _]
         callback(0.8, "Finish parsing.")
     elif re.search(r"\.json$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections = JsonParser(int(parser_config.get("chunk_token_num", 128)))(binary)
-        sections = [(l, "") for l in sections if l]
+        sections = [(_, "") for _ in sections if _]
         callback(0.8, "Finish parsing.")
     elif re.search(r"\.doc$", filename, re.IGNORECASE):
@@ -256,7 +254,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         binary = BytesIO(binary)
         doc_parsed = parser.from_buffer(binary)
         sections = doc_parsed['content'].split('\n')
-        sections = [(l, "") for l in sections if l]
+        sections = [(_, "") for _ in sections if _]
         callback(0.8, "Finish parsing.")
     else:
@@ -13,6 +13,8 @@
 from tika import parser
 from io import BytesIO
 import re
+from deepdoc.parser.utils import get_txt
 from rag.app import laws
 from rag.nlp import rag_tokenizer, tokenize, find_codec
 from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
@@ -82,17 +84,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         sections = txt.split("\n")
         sections = [s for s in sections if s]
         callback(0.8, "Finish parsing.")
@@ -16,6 +16,8 @@ from io import BytesIO
 from timeit import default_timer as timer
 from nltk import word_tokenize
 from openpyxl import load_workbook
+from deepdoc.parser.utils import get_txt
 from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
 from rag.nlp import rag_tokenizer, tokenize_table, concat_img
 from rag.settings import cron_logger
@@ -305,17 +307,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         return res
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         lines = txt.split("\n")
         comma, tab = 0, 0
         for l in lines:
@@ -358,17 +350,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         return res
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         lines = txt.split("\n")
         last_question, last_answer = "", ""
         question_stack, level_stack = [], []
@@ -20,6 +20,7 @@ from openpyxl import load_workbook
 from dateutil.parser import parse as datetime_parse
 from api.db.services.knowledgebase_service import KnowledgebaseService
+from deepdoc.parser.utils import get_txt
 from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec
 from deepdoc.parser import ExcelParser
@@ -146,17 +147,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
                       callback=callback)
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         lines = txt.split("\n")
         fails = []
         headers = lines[0].split(kwargs.get("delimiter", "\t"))