### What problem does this PR solve?

### Type of change

- [x] Refactoring

tags/v0.13.0
| @@ -10,9 +10,7 @@ | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import io | |||
| import re | |||
| import numpy as np | |||
| from api.db import LLMType | |||
| from rag.nlp import rag_tokenizer | |||
| @@ -15,9 +15,9 @@ import re | |||
| from io import BytesIO | |||
| from deepdoc.parser.utils import get_text | |||
| from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \ | |||
| hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \ | |||
| tokenize_chunks, find_codec | |||
| from rag.nlp import bullets_category, is_english,remove_contents_table, \ | |||
| hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \ | |||
| tokenize_chunks | |||
| from rag.nlp import rag_tokenizer | |||
| from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser | |||
| @@ -10,7 +10,6 @@ | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # | |||
| import copy | |||
| from tika import parser | |||
| import re | |||
| from io import BytesIO | |||
| @@ -18,8 +17,8 @@ from docx import Document | |||
| from api.db import ParserType | |||
| from deepdoc.parser.utils import get_text | |||
| from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \ | |||
| make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level | |||
| from rag.nlp import bullets_category, remove_contents_table, hierarchical_merge, \ | |||
| make_colon_as_title, tokenize_chunks, docx_question_level | |||
| from rag.nlp import rag_tokenizer | |||
| from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser | |||
| from rag.settings import cron_logger | |||
| @@ -19,13 +19,13 @@ import re | |||
| from api.db import ParserType | |||
| from io import BytesIO | |||
| from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, docx_question_level | |||
| from deepdoc.parser import PdfParser, PlainParser | |||
| from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level | |||
| from rag.utils import num_tokens_from_string | |||
| from deepdoc.parser import PdfParser, ExcelParser, DocxParser | |||
| from deepdoc.parser import PdfParser, PlainParser, DocxParser | |||
| from docx import Document | |||
| from PIL import Image | |||
| class Pdf(PdfParser): | |||
| def __init__(self): | |||
| self.model_speciess = ParserType.MANUAL.value | |||
| @@ -25,6 +25,7 @@ from functools import reduce | |||
| from markdown import markdown | |||
| from docx.image.exceptions import UnrecognizedImageError | |||
| class Docx(DocxParser): | |||
| def __init__(self): | |||
| pass | |||
| @@ -93,7 +94,7 @@ class Docx(DocxParser): | |||
| tbls = [] | |||
| for tb in self.doc.tables: | |||
| html= "<table>" | |||
| html = "<table>" | |||
| for r in tb.rows: | |||
| html += "<tr>" | |||
| i = 0 | |||
| @@ -146,8 +147,6 @@ class Pdf(PdfParser): | |||
| class Markdown(MarkdownParser): | |||
| def __call__(self, filename, binary=None): | |||
| txt = "" | |||
| tbls = [] | |||
| if binary: | |||
| encoding = find_codec(binary) | |||
| txt = binary.decode(encoding, errors="ignore") | |||
| @@ -12,13 +12,11 @@ | |||
| # | |||
| import copy | |||
| import re | |||
| from collections import Counter | |||
| from api.db import ParserType | |||
| from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks | |||
| from deepdoc.parser import PdfParser, PlainParser | |||
| import numpy as np | |||
| from rag.utils import num_tokens_from_string | |||
| class Pdf(PdfParser): | |||
| @@ -135,7 +133,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| Only pdf is supported. | |||
| The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly. | |||
| """ | |||
| pdf_parser = None | |||
| if re.search(r"\.pdf$", filename, re.IGNORECASE): | |||
| if not kwargs.get("parser_config", {}).get("layout_recognize", True): | |||
| pdf_parser = PlainParser() | |||
| @@ -14,7 +14,6 @@ import re | |||
| from copy import deepcopy | |||
| from io import BytesIO | |||
| from timeit import default_timer as timer | |||
| from nltk import word_tokenize | |||
| from openpyxl import load_workbook | |||
| from deepdoc.parser.utils import get_text | |||