### What problem does this PR solve?

Add get_txt function to reduce duplicate code

### Type of change

- [x] Refactoring

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from rag.nlp import find_codec,num_tokens_from_string
+from deepdoc.parser.utils import get_txt
+from rag.nlp import num_tokens_from_string
 import re


 class RAGFlowTxtParser:
     def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;!?"):
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(fnm, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(fnm, binary)
         return self.parser_txt(txt, chunk_token_num, delimiter)

     @classmethod
     def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
-        if type(txt) != str:
+        if not isinstance(txt, str):
             raise TypeError("txt type should be str!")
         cks = [""]
         tk_nums = [0]
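For context, a minimal sketch of how this parser might be driven after the change. The import path and the sample input are my assumptions, not part of the diff:

```python
# Usage sketch only. The module path and the byte payload are assumptions; the
# defaults (chunk_token_num=128, delimiter="\n!?;。;!?") are the ones in the diff above.
from deepdoc.parser.txt_parser import RAGFlowTxtParser

parser = RAGFlowTxtParser()
# Either pass raw bytes via `binary` or a readable path via `fnm`;
# both now go through the shared get_txt helper.
chunks = parser("notes.txt", binary="First line.\nSecond line?\n".encode("utf-8"))
print(len(chunks))
```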
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from rag.nlp import find_codec
+
+
+def get_txt(fnm: str, binary=None) -> str:
+    txt = ""
+    if binary:
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding, errors="ignore")
+    else:
+        with open(fnm, "r") as f:
+            while True:
+                line = f.readline()
+                if not line:
+                    break
+                txt += line
+    return txt
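A quick usage sketch of the new helper (the file names and byte payload below are illustrative):

```python
# Usage sketch for get_txt as defined above; the payload and file names are made up.
from deepdoc.parser.utils import get_txt

# From in-memory bytes (e.g. an uploaded file): the codec is detected with
# find_codec and decoding errors are ignored.
text = get_txt("unused-name.txt", binary="标题\nBody line\n".encode("utf-8"))
print(text.splitlines())

# From a path on disk: the file is opened in text mode and read line by line.
# text = get_txt("sample.txt")
```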
 import re
 from io import BytesIO

+from deepdoc.parser.utils import get_text
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
     hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
     tokenize_chunks, find_codec
...
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_text(filename, binary)
         sections = txt.split("\n")
         sections = [(l, "") for l in sections if l]
         remove_contents_table(sections, eng=is_english(
 from docx import Document
 from api.db import ParserType
+from deepdoc.parser.utils import get_txt
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
     make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level
 from rag.nlp import rag_tokenizer
...
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         sections = txt.split("\n")
         sections = [l for l in sections if l]
         callback(0.8, "Finish parsing.")
         return sections, tbls


 def chunk(filename, binary=None, from_page=0, to_page=100000,
           lang="Chinese", callback=None, **kwargs):
     """
...
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     res = []
     pdf_parser = None
+    sections = []
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections, tbls = Docx()(filename, binary)
...
         callback(0.1, "Start to parse.")
         excel_parser = ExcelParser()
         if parser_config.get("html4excel"):
-            sections = [(l, "") for l in excel_parser.html(binary, 12) if l]
+            sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
         else:
-            sections = [(l, "") for l in excel_parser(binary) if l]
+            sections = [(_, "") for _ in excel_parser(binary) if _]
     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        sections = TxtParser()(filename,binary,
+        sections = TxtParser()(filename, binary,
                                parser_config.get("chunk_token_num", 128),
                                parser_config.get("delimiter", "\n!?;。;!?"))
         callback(0.8, "Finish parsing.")
     elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections = HtmlParser()(filename, binary)
-        sections = [(l, "") for l in sections if l]
+        sections = [(_, "") for _ in sections if _]
         callback(0.8, "Finish parsing.")
     elif re.search(r"\.json$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections = JsonParser(int(parser_config.get("chunk_token_num", 128)))(binary)
-        sections = [(l, "") for l in sections if l]
+        sections = [(_, "") for _ in sections if _]
         callback(0.8, "Finish parsing.")
     elif re.search(r"\.doc$", filename, re.IGNORECASE):
         binary = BytesIO(binary)
         doc_parsed = parser.from_buffer(binary)
         sections = doc_parsed['content'].split('\n')
-        sections = [(l, "") for l in sections if l]
+        sections = [(_, "") for _ in sections if _]
         callback(0.8, "Finish parsing.")
     else:
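For orientation, an illustrative driver for the dispatcher above. The file name, config values and callback are made up, and whether `parser_config` arrives via `**kwargs` is an assumption not shown in this diff:

```python
# Hypothetical driver for rag.app.naive.chunk; nothing here is taken from the PR
# except the config keys and the callback shape used by the dispatcher above.
from rag.app import naive

def progress(prob=None, msg=""):
    # chunk() reports progress as e.g. callback(0.1, "Start to parse.")
    print(prob, msg)

with open("handbook.txt", "rb") as fp:
    res = naive.chunk(
        "handbook.txt",
        binary=fp.read(),
        lang="Chinese",
        callback=progress,
        parser_config={"chunk_token_num": 128, "delimiter": "\n!?;。;!?"},
    )
print(len(res))
```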
 from tika import parser
 from io import BytesIO
 import re

+from deepdoc.parser.utils import get_txt
 from rag.app import laws
 from rag.nlp import rag_tokenizer, tokenize, find_codec
 from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
...
     elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         sections = txt.split("\n")
         sections = [s for s in sections if s]
         callback(0.8, "Finish parsing.")
 from timeit import default_timer as timer
 from nltk import word_tokenize
 from openpyxl import load_workbook

+from deepdoc.parser.utils import get_txt
 from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
 from rag.nlp import rag_tokenizer, tokenize_table, concat_img
 from rag.settings import cron_logger
...
         return res
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         lines = txt.split("\n")
         comma, tab = 0, 0
         for l in lines:
...
         return res
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         lines = txt.split("\n")
         last_question, last_answer = "", ""
         question_stack, level_stack = [], []
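The `comma, tab` counters at the end of the first hunk above feed a simple delimiter sniff (CSV vs. TSV). A toy illustration of that idea; the sample text and the comparison rule are assumptions, not code from this PR:

```python
# Toy delimiter sniffing in the spirit of the comma/tab counters above.
txt = "q1,a1\nq2,a2\nq3,a3\n"
lines = [l for l in txt.split("\n") if l]
comma = sum(l.count(",") for l in lines)
tab = sum(l.count("\t") for l in lines)
delimiter = "," if comma > tab else "\t"
print(delimiter)  # -> ","
```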
 from dateutil.parser import parse as datetime_parse
 from api.db.services.knowledgebase_service import KnowledgebaseService
+from deepdoc.parser.utils import get_text
 from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec
 from deepdoc.parser import ExcelParser
...
                               callback=callback)
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_text(filename, binary)
         lines = txt.split("\n")
         fails = []
         headers = lines[0].split(kwargs.get("delimiter", "\t"))
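To make the tail of this hunk concrete, a toy example of how the header row is derived from the decoded text (the sample TSV and the row handling are invented):

```python
# Toy illustration of splitting a decoded txt/csv payload into headers and rows.
txt = "name\tage\nalice\t30\nbob\t25\n"
lines = txt.split("\n")
headers = lines[0].split("\t")              # ['name', 'age']
rows = [l.split("\t") for l in lines[1:] if l]
print(headers, rows)
```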