### What problem does this PR solve?

Removes imports that are no longer used by the chunking modules, merges duplicated import lines, and initializes a few variables ahead of the conditional branches that previously provided their only assignment. Refactoring only; no behavior change intended. (Included in tag v0.13.0.)

### Type of change

- [x] Refactoring
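Every import change below is the kind of thing a linter reports as F401 (unused import), a check both Ruff and flake8 ship. For context, here is a minimal standard-library sketch of how such a check can work; this is illustrative only, not the tooling this PR used (the PR doesn't say):

```python
import ast

def find_unused_imports(source: str) -> list[str]:
    """Report names that are imported but never referenced in `source`."""
    tree = ast.parse(source)
    imported = {}  # local binding -> line where it was imported
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                # `import numpy as np` binds `np`; `import a.b` binds `a`
                imported[(alias.asname or alias.name).split(".")[0]] = node.lineno
        elif isinstance(node, ast.ImportFrom):
            for alias in node.names:
                imported[alias.asname or alias.name] = node.lineno
    used = {node.id for node in ast.walk(tree) if isinstance(node, ast.Name)}
    return [f"line {line}: {name}"
            for name, line in imported.items() if name not in used]

print(find_unused_imports("import io\nimport re\nre.compile('x')"))
# ['line 1: io']
```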
The first hunk prunes imports in one of the chunkers (judging by `LLMType` and `naive_merge`, likely the book chunker): `io` and `numpy` go away, and `tokenize`, `add_positions`, and `find_codec` are dropped from the `rag.nlp` import:

```diff
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import io
 import re
-import numpy as np
 from api.db import LLMType
 from rag.nlp import rag_tokenizer
 from io import BytesIO
 from deepdoc.parser.utils import get_text
-from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
-    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
-    tokenize_chunks, find_codec
+from rag.nlp import bullets_category, is_english, remove_contents_table, \
+    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
+    tokenize_chunks
 from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
```
The second hunk, in the chunker built on tika and `cron_logger` (likely the laws chunker), drops `copy` and prunes `is_english`, `tokenize`, `add_positions`, and `find_codec` from the `rag.nlp` import:

```diff
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import copy
 from tika import parser
 import re
 from io import BytesIO
 from api.db import ParserType
 from deepdoc.parser.utils import get_text
-from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
-    make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level
+from rag.nlp import bullets_category, remove_contents_table, hierarchical_merge, \
+    make_colon_as_title, tokenize_chunks, docx_question_level
 from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
 from rag.settings import cron_logger
```
The third hunk, where `Pdf.__init__` sets `ParserType.MANUAL` (so presumably the manual chunker), drops `add_positions`, removes the unused `ExcelParser`, and collapses two `deepdoc.parser` imports into one:

```diff
 from api.db import ParserType
 from io import BytesIO
-from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, docx_question_level
-from deepdoc.parser import PdfParser, PlainParser
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
 from rag.utils import num_tokens_from_string
-from deepdoc.parser import PdfParser, ExcelParser, DocxParser
+from deepdoc.parser import PdfParser, PlainParser, DocxParser
 from docx import Document
 from PIL import Image


 class Pdf(PdfParser):
     def __init__(self):
         self.model_speciess = ParserType.MANUAL.value
```
The fourth hunk sits in the chunker that defines both a `Docx(DocxParser)` and a `Markdown(MarkdownParser)` class (presumably the general-purpose chunker); its `markdown` and `UnrecognizedImageError` imports and the empty `Docx.__init__` are unchanged context. The substantive edits are a spacing fix in the table-to-HTML loop:

```diff
         tbls = []
         for tb in self.doc.tables:
-            html= "<table>"
+            html = "<table>"
             for r in tb.rows:
                 html += "<tr>"
                 i = 0
```

and default values for `txt` and `tbls` before the `if binary:` branch in `Markdown.__call__`:

```diff
 class Markdown(MarkdownParser):
     def __call__(self, filename, binary=None):
+        txt = ""
+        tbls = []
         if binary:
             encoding = find_codec(binary)
             txt = binary.decode(encoding, errors="ignore")
```
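Those defaults matter because Python scopes assignments to the whole function: if no branch assigns the name, the first later read raises. A minimal repro of the failure mode the initialization guards against (a hypothetical `parse` function, not ragflow code):

```python
def parse(binary=None):
    if binary:
        txt = binary.decode("utf-8", errors="ignore")
    # With no default assigned above, `txt` is unbound when binary is falsy.
    return txt

try:
    parse(None)
except UnboundLocalError as e:
    print(e)  # e.g. "local variable 'txt' referenced before assignment"
```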
The fifth hunk is the paper chunker (its docstring says only PDF is supported). It drops the unused `Counter` and `num_tokens_from_string` imports:

```diff
 #
 import copy
 import re
-from collections import Counter
 from api.db import ParserType
 from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
 from deepdoc.parser import PdfParser, PlainParser
 import numpy as np
-from rag.utils import num_tokens_from_string


 class Pdf(PdfParser):
```

and, in the module's `chunk` entry point, initializes `pdf_parser` ahead of the file-type dispatch so the name is bound even when no branch selects a parser:

```diff
     """
         Only pdf is supported.
         The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
     """
+    pdf_parser = None
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
         if not kwargs.get("parser_config", {}).get("layout_recognize", True):
             pdf_parser = PlainParser()
```
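The `pdf_parser = None` default follows the same pattern as the `Markdown` fix above. For orientation, a hedged sketch of how this dispatch is typically driven: the `parser_config` dict and its `"layout_recognize"` key come straight from the `kwargs.get(...)` line in the diff, while the `chunk(filename, binary, callback)` call shape and the filename are assumptions based on ragflow's chunker convention:

```python
# Assumed call shape; `parser_config` and "layout_recognize" are taken
# from the diff, everything else here is illustrative.
from rag.app.paper import chunk

def progress(prog=None, msg=""):
    print(prog, msg)

with open("some_paper.pdf", "rb") as f:
    # layout_recognize=False makes the kwargs.get(...) check fall through
    # to PlainParser() instead of the layout-recognition pipeline.
    results = chunk("some_paper.pdf", binary=f.read(),
                    parser_config={"layout_recognize": False},
                    callback=progress)
```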
The final hunk, in the chunker that loads workbooks with `openpyxl` (likely the Q&A chunker), removes the unused NLTK import:

```diff
 from copy import deepcopy
 from io import BytesIO
 from timeit import default_timer as timer
-from nltk import word_tokenize
 from openpyxl import load_workbook
 from deepdoc.parser.utils import get_text
```