Browse Source

remove unused import (#2679)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
tags/v0.13.0
yqkcn 1 year ago
parent
commit
570ad420a8
No account linked to committer's email address
7 changed files with 10 additions and 18 deletions
  1. rag/app/audio.py (+0, -2)
  2. rag/app/book.py (+3, -3)
  3. rag/app/laws.py (+2, -3)
  4. rag/app/manual.py (+3, -3)
  5. rag/app/naive.py (+2, -3)
  6. rag/app/paper.py (+0, -3)
  7. rag/app/qa.py (+0, -1)

+ 0
- 2
rag/app/audio.py View File

# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
import io
import re import re
import numpy as np


from api.db import LLMType from api.db import LLMType
from rag.nlp import rag_tokenizer from rag.nlp import rag_tokenizer

+ 3
- 3
rag/app/book.py View File

from io import BytesIO from io import BytesIO


from deepdoc.parser.utils import get_text from deepdoc.parser.utils import get_text
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
tokenize_chunks, find_codec
from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks
from rag.nlp import rag_tokenizer from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser



+ 2
- 3
rag/app/laws.py View File

# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
import copy
from tika import parser from tika import parser
import re import re
from io import BytesIO from io import BytesIO


from api.db import ParserType from api.db import ParserType
from deepdoc.parser.utils import get_text from deepdoc.parser.utils import get_text
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level
from rag.nlp import bullets_category, remove_contents_table, hierarchical_merge, \
make_colon_as_title, tokenize_chunks, docx_question_level
from rag.nlp import rag_tokenizer from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
from rag.settings import cron_logger from rag.settings import cron_logger

+ 3
- 3
rag/app/manual.py View File



from api.db import ParserType from api.db import ParserType
from io import BytesIO from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, docx_question_level
from deepdoc.parser import PdfParser, PlainParser
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
from rag.utils import num_tokens_from_string from rag.utils import num_tokens_from_string
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
from deepdoc.parser import PdfParser, PlainParser, DocxParser
from docx import Document from docx import Document
from PIL import Image from PIL import Image



class Pdf(PdfParser): class Pdf(PdfParser):
def __init__(self): def __init__(self):
self.model_speciess = ParserType.MANUAL.value self.model_speciess = ParserType.MANUAL.value

+ 2
- 3
rag/app/naive.py View File

from markdown import markdown from markdown import markdown
from docx.image.exceptions import UnrecognizedImageError from docx.image.exceptions import UnrecognizedImageError



class Docx(DocxParser): class Docx(DocxParser):
def __init__(self): def __init__(self):
pass pass


tbls = [] tbls = []
for tb in self.doc.tables: for tb in self.doc.tables:
html= "<table>"
html = "<table>"
for r in tb.rows: for r in tb.rows:
html += "<tr>" html += "<tr>"
i = 0 i = 0


class Markdown(MarkdownParser): class Markdown(MarkdownParser):
def __call__(self, filename, binary=None): def __call__(self, filename, binary=None):
txt = ""
tbls = []
if binary: if binary:
encoding = find_codec(binary) encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore") txt = binary.decode(encoding, errors="ignore")

+ 0
- 3
rag/app/paper.py View File

# #
import copy import copy
import re import re
from collections import Counter


from api.db import ParserType from api.db import ParserType
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from deepdoc.parser import PdfParser, PlainParser from deepdoc.parser import PdfParser, PlainParser
import numpy as np import numpy as np
from rag.utils import num_tokens_from_string




class Pdf(PdfParser): class Pdf(PdfParser):
Only pdf is supported. Only pdf is supported.
The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly. The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
""" """
pdf_parser = None
if re.search(r"\.pdf$", filename, re.IGNORECASE): if re.search(r"\.pdf$", filename, re.IGNORECASE):
if not kwargs.get("parser_config", {}).get("layout_recognize", True): if not kwargs.get("parser_config", {}).get("layout_recognize", True):
pdf_parser = PlainParser() pdf_parser = PlainParser()

+ 0
- 1
rag/app/qa.py View File

from copy import deepcopy from copy import deepcopy
from io import BytesIO from io import BytesIO
from timeit import default_timer as timer from timeit import default_timer as timer
from nltk import word_tokenize
from openpyxl import load_workbook from openpyxl import load_workbook


from deepdoc.parser.utils import get_text from deepdoc.parser.utils import get_text

Loading…
Cancel
Save