### What problem does this PR solve?

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

tags/v0.12.0
| from rag.nlp import find_codec | from rag.nlp import find_codec | ||||
| def get_txt(fnm: str, binary=None) -> str: | |||||
| def get_text(fnm: str, binary=None) -> str: | |||||
| txt = "" | txt = "" | ||||
| if binary: | if binary: | ||||
| encoding = find_codec(binary) | encoding = find_codec(binary) |
| # See the License for the specific language governing permissions and | # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | # limitations under the License. | ||||
| # | # | ||||
| import copy | |||||
| from tika import parser | from tika import parser | ||||
| import re | import re | ||||
| from io import BytesIO | from io import BytesIO |
| from docx import Document | from docx import Document | ||||
| from api.db import ParserType | from api.db import ParserType | ||||
| from deepdoc.parser.utils import get_txt | |||||
| from deepdoc.parser.utils import get_text | |||||
| from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \ | from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \ | ||||
| make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level | make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level | ||||
| from rag.nlp import rag_tokenizer | from rag.nlp import rag_tokenizer | ||||
| elif re.search(r"\.txt$", filename, re.IGNORECASE): | elif re.search(r"\.txt$", filename, re.IGNORECASE): | ||||
| callback(0.1, "Start to parse.") | callback(0.1, "Start to parse.") | ||||
| txt = get_txt(filename, binary) | |||||
| txt = get_text(filename, binary) | |||||
| sections = txt.split("\n") | sections = txt.split("\n") | ||||
| sections = [l for l in sections if l] | sections = [l for l in sections if l] | ||||
| callback(0.8, "Finish parsing.") | callback(0.8, "Finish parsing.") |
| from io import BytesIO | from io import BytesIO | ||||
| import re | import re | ||||
| from deepdoc.parser.utils import get_txt | |||||
| from deepdoc.parser.utils import get_text | |||||
| from rag.app import laws | from rag.app import laws | ||||
| from rag.nlp import rag_tokenizer, tokenize, find_codec | |||||
| from rag.nlp import rag_tokenizer, tokenize | |||||
| from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser | from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser | ||||
| elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE): | elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE): | ||||
| callback(0.1, "Start to parse.") | callback(0.1, "Start to parse.") | ||||
| txt = get_txt(filename, binary) | |||||
| txt = get_text(filename, binary) | |||||
| sections = txt.split("\n") | sections = txt.split("\n") | ||||
| sections = [s for s in sections if s] | sections = [s for s in sections if s] | ||||
| callback(0.8, "Finish parsing.") | callback(0.8, "Finish parsing.") |
| from nltk import word_tokenize | from nltk import word_tokenize | ||||
| from openpyxl import load_workbook | from openpyxl import load_workbook | ||||
| from deepdoc.parser.utils import get_txt | |||||
| from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level | |||||
| from deepdoc.parser.utils import get_text | |||||
| from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level | |||||
| from rag.nlp import rag_tokenizer, tokenize_table, concat_img | from rag.nlp import rag_tokenizer, tokenize_table, concat_img | ||||
| from rag.settings import cron_logger | from rag.settings import cron_logger | ||||
| from deepdoc.parser import PdfParser, ExcelParser, DocxParser | from deepdoc.parser import PdfParser, ExcelParser, DocxParser | ||||
| from docx import Document | from docx import Document | ||||
| from PIL import Image | from PIL import Image | ||||
| from markdown import markdown | from markdown import markdown | ||||
| class Excel(ExcelParser): | class Excel(ExcelParser): | ||||
| def __call__(self, fnm, binary=None, callback=None): | def __call__(self, fnm, binary=None, callback=None): | ||||
| if not binary: | if not binary: | ||||
| return res | return res | ||||
| elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE): | elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE): | ||||
| callback(0.1, "Start to parse.") | callback(0.1, "Start to parse.") | ||||
| txt = get_txt(filename, binary) | |||||
| txt = get_text(filename, binary) | |||||
| lines = txt.split("\n") | lines = txt.split("\n") | ||||
| comma, tab = 0, 0 | comma, tab = 0, 0 | ||||
| for l in lines: | for l in lines: | ||||
| return res | return res | ||||
| elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): | elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): | ||||
| callback(0.1, "Start to parse.") | callback(0.1, "Start to parse.") | ||||
| txt = get_txt(filename, binary) | |||||
| txt = get_text(filename, binary) | |||||
| lines = txt.split("\n") | lines = txt.split("\n") | ||||
| last_question, last_answer = "", "" | last_question, last_answer = "", "" | ||||
| question_stack, level_stack = [], [] | question_stack, level_stack = [], [] |
| from api.db.services.knowledgebase_service import KnowledgebaseService | from api.db.services.knowledgebase_service import KnowledgebaseService | ||||
| from deepdoc.parser.utils import get_text | from deepdoc.parser.utils import get_text | ||||
| from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec | |||||
| from rag.nlp import rag_tokenizer, tokenize | |||||
| from deepdoc.parser import ExcelParser | from deepdoc.parser import ExcelParser | ||||