### What problem does this PR solve?

Renames the helper `get_txt` in `deepdoc/parser/utils.py` to `get_text` and updates every call site, removing a few imports that are no longer used (`find_codec` in several chunkers, `copy` in `rag/app/laws.py`).

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
`deepdoc/parser/utils.py`

```diff
@@ -14,7 +14,7 @@
 from rag.nlp import find_codec
-def get_txt(fnm: str, binary=None) -> str:
+def get_text(fnm: str, binary=None) -> str:
     txt = ""
     if binary:
         encoding = find_codec(binary)
```
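The hunk cuts the helper off mid-body. For readers following along, here is a plausible reconstruction of the whole function: the lines above come from the diff, while the decode call and the file-reading fallback are assumptions about what the elided remainder does.

```python
from rag.nlp import find_codec


def get_text(fnm: str, binary=None) -> str:
    """Return the text of a file, decoding an in-memory blob if one is given."""
    txt = ""
    if binary:
        # find_codec() sniffs the encoding from the raw bytes (shown in the diff).
        encoding = find_codec(binary)
        # Assumed: decode with the sniffed codec, ignoring undecodable bytes.
        txt = binary.decode(encoding, errors="ignore")
    else:
        # Assumed fallback: read the file from disk when no blob is supplied.
        with open(fnm, "r", encoding="utf-8", errors="ignore") as f:
            txt = f.read()
    return txt
```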
`rag/app/laws.py`

```diff
@@ -10,7 +10,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import copy
 from tika import parser
 import re
 from io import BytesIO
```
```diff
@@ -17,7 +17,7 @@ from io import BytesIO
 from docx import Document
 from api.db import ParserType
-from deepdoc.parser.utils import get_txt
+from deepdoc.parser.utils import get_text
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
     make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level
 from rag.nlp import rag_tokenizer
```
```diff
@@ -166,7 +166,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = get_txt(filename, binary)
+        txt = get_text(filename, binary)
         sections = txt.split("\n")
         sections = [l for l in sections if l]
         callback(0.8, "Finish parsing.")
```
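Every call site in this PR follows the same two-line pattern: report progress, then hand the filename (or raw bytes) to the renamed helper. A minimal sketch of driving one of these chunkers, assuming a local `sample.txt`; the real chunkers accept further keyword arguments that are omitted here:

```python
from rag.app import laws


def progress(prog=None, msg=""):
    # chunk() reports progress as a float in [0, 1] plus a short message,
    # e.g. (0.1, "Start to parse.") as seen in the hunks.
    print(f"[{prog}] {msg}")


with open("sample.txt", "rb") as f:
    chunks = laws.chunk("sample.txt", binary=f.read(), callback=progress)
```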
`rag/app/book.py`

```diff
@@ -14,9 +14,9 @@ from tika import parser
 from io import BytesIO
 import re
-from deepdoc.parser.utils import get_txt
+from deepdoc.parser.utils import get_text
 from rag.app import laws
-from rag.nlp import rag_tokenizer, tokenize, find_codec
+from rag.nlp import rag_tokenizer, tokenize
 from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
```
```diff
@@ -84,7 +84,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = get_txt(filename, binary)
+        txt = get_text(filename, binary)
         sections = txt.split("\n")
         sections = [s for s in sections if s]
         callback(0.8, "Finish parsing.")
```
`rag/app/qa.py`

```diff
@@ -17,14 +17,16 @@ from timeit import default_timer as timer
 from nltk import word_tokenize
 from openpyxl import load_workbook
-from deepdoc.parser.utils import get_txt
-from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
+from deepdoc.parser.utils import get_text
+from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level
 from rag.nlp import rag_tokenizer, tokenize_table, concat_img
 from rag.settings import cron_logger
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 from docx import Document
 from PIL import Image
 from markdown import markdown
 class Excel(ExcelParser):
     def __call__(self, fnm, binary=None, callback=None):
         if not binary:
```
```diff
@@ -307,7 +309,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         return res
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = get_txt(filename, binary)
+        txt = get_text(filename, binary)
         lines = txt.split("\n")
         comma, tab = 0, 0
         for l in lines:
```
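The csv branch is cut off just as it starts counting delimiters. A self-contained sketch of what the sniffing plausibly does next (a reconstruction, not the project's verbatim code): count how many lines split into exactly two fields on `,` versus `\t`, then keep the majority delimiter.

```python
lines = ["q1,a1", "q2\ta2", "q3,a3"]  # stand-in for txt.split("\n")
comma, tab = 0, 0
for l in lines:
    if len(l.split(",")) == 2:
        comma += 1
    if len(l.split("\t")) == 2:
        tab += 1
# Assumed tie-break: prefer tab when the counts are equal.
delimiter = "\t" if tab >= comma else ","
print(delimiter)  # "," — comma-split rows dominate this sample
```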
```diff
@@ -350,7 +352,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         return res
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = get_txt(filename, binary)
+        txt = get_text(filename, binary)
         lines = txt.split("\n")
         last_question, last_answer = "", ""
         question_stack, level_stack = [], []
```
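`question_stack` and `level_stack` suggest the markdown branch walks headings as a nested outline: a deeper heading pushes, a shallower one pops back first, so the current stack is the full question path. A standalone sketch of that pattern (illustrative only; `heading_level` is a stand-in, not a project function):

```python
def heading_level(line: str) -> int:
    # "## Foo" -> 2; non-heading lines -> 0.
    return len(line) - len(line.lstrip("#")) if line.startswith("#") else 0


question_stack, level_stack = [], []
for line in ["# A", "## B", "some answer text", "## C"]:
    lvl = heading_level(line)
    if lvl == 0:
        continue  # body text would attach to the current question path
    # Pop until the stack top is strictly shallower than the new heading.
    while level_stack and level_stack[-1] >= lvl:
        question_stack.pop()
        level_stack.pop()
    question_stack.append(line.lstrip("# "))
    level_stack.append(lvl)
    print(" > ".join(question_stack))  # A, then A > B, then A > C
```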
`rag/app/table.py`

```diff
@@ -21,7 +21,7 @@ from dateutil.parser import parse as datetime_parse
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from deepdoc.parser.utils import get_text
-from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec
+from rag.nlp import rag_tokenizer, tokenize
 from deepdoc.parser import ExcelParser
```
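A quick way to convince yourself the rename is complete is a smoke test against the helper itself. This is a hypothetical check, not part of the PR:

```python
from deepdoc.parser.utils import get_text

# The filename is ignored when binary bytes are passed (see the first hunk).
blob = "line one\nline two\n".encode("utf-8")
lines = [l for l in get_text("dummy.txt", binary=blob).split("\n") if l]
assert lines == ["line one", "line two"]
```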