
rename get_txt to get_text (#2649)

### What problem does this PR solve?

Renames the helper `get_txt` in `deepdoc/parser/utils.py` to `get_text` and updates its call sites. Imports that are no longer used are removed along the way (`import copy` in `rag/app/book.py`; `find_codec` in `rag/app/one.py`, `rag/app/qa.py`, and `rag/app/table.py`).
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
tags/v0.12.0
Kevin Hu, 1 year ago
Commit fc867cb959
6 changed files, with 13 additions and 12 deletions
1. deepdoc/parser/utils.py (+1 / -1)
2. rag/app/book.py (+0 / -1)
3. rag/app/laws.py (+2 / -2)
4. rag/app/one.py (+3 / -3)
5. rag/app/qa.py (+6 / -4)
6. rag/app/table.py (+1 / -1)
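
After a rename that touches this many call sites, a quick repository scan confirms no stale references to the old name remain. This is a minimal sketch, not part of the commit; it assumes it is run from the repository root:

```python
from pathlib import Path

# Scan all Python sources for lingering references to the old helper name.
# Assumes the current working directory is the repository root.
stale = []
for path in Path(".").rglob("*.py"):
    text = path.read_text(encoding="utf-8", errors="ignore")
    for lineno, line in enumerate(text.splitlines(), start=1):
        if "get_txt" in line:
            stale.append(f"{path}:{lineno}: {line.strip()}")

print("\n".join(stale) if stale else "No references to get_txt remain.")
```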

deepdoc/parser/utils.py (+1 / -1)

```diff
 from rag.nlp import find_codec


-def get_txt(fnm: str, binary=None) -> str:
+def get_text(fnm: str, binary=None) -> str:
     txt = ""
     if binary:
         encoding = find_codec(binary)
```
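
For context, the renamed helper reads text either from an in-memory binary blob or from a file on disk. The diff shows only the first lines of its body; a plausible sketch of the rest follows, where everything after the `find_codec` call is an assumed completion rather than the committed code:

```python
from rag.nlp import find_codec

def get_text(fnm: str, binary=None) -> str:
    # Sketch of the renamed helper; only the first three lines of the
    # body are visible in the diff, the rest is an assumed completion.
    txt = ""
    if binary:
        encoding = find_codec(binary)   # detect the codec of the raw bytes
        txt = binary.decode(encoding, errors="ignore")
    else:
        with open(fnm, "r") as f:       # fall back to reading from disk
            txt = f.read()
    return txt
```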

rag/app/book.py (+0 / -1)

```diff
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import copy
 from tika import parser
 import re
 from io import BytesIO
```

rag/app/laws.py (+2 / -2)

```diff
 from docx import Document

 from api.db import ParserType
-from deepdoc.parser.utils import get_txt
+from deepdoc.parser.utils import get_text
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
     make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level
 from rag.nlp import rag_tokenizer
```

```diff
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = get_txt(filename, binary)
+        txt = get_text(filename, binary)
         sections = txt.split("\n")
         sections = [l for l in sections if l]
         callback(0.8, "Finish parsing.")
```

rag/app/one.py (+3 / -3)

```diff
 from io import BytesIO
 import re

-from deepdoc.parser.utils import get_txt
+from deepdoc.parser.utils import get_text
 from rag.app import laws
-from rag.nlp import rag_tokenizer, tokenize, find_codec
+from rag.nlp import rag_tokenizer, tokenize
 from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
```

```diff
     elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = get_txt(filename, binary)
+        txt = get_text(filename, binary)
         sections = txt.split("\n")
         sections = [s for s in sections if s]
         callback(0.8, "Finish parsing.")
```

rag/app/qa.py (+6 / -4)

```diff
 from nltk import word_tokenize
 from openpyxl import load_workbook

-from deepdoc.parser.utils import get_txt
-from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
+from deepdoc.parser.utils import get_text
+from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level
 from rag.nlp import rag_tokenizer, tokenize_table, concat_img
 from rag.settings import cron_logger
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 from docx import Document
 from PIL import Image
 from markdown import markdown


 class Excel(ExcelParser):
     def __call__(self, fnm, binary=None, callback=None):
         if not binary:
```
```diff
         return res
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = get_txt(filename, binary)
+        txt = get_text(filename, binary)
         lines = txt.split("\n")
         comma, tab = 0, 0
         for l in lines:
```
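
The trailing context of this hunk (`comma, tab = 0, 0` and the loop over lines) suggests the txt/csv branch guesses its delimiter by counting commas versus tabs. A minimal sketch of such a heuristic; the exact counting and tie-breaking rules in qa.py are not visible here, so this helper is a hypothetical illustration:

```python
def guess_delimiter(txt: str) -> str:
    # Count comma and tab occurrences across all lines and pick the
    # character that appears more often. Hypothetical helper sketching
    # the heuristic the visible context hints at.
    comma, tab = 0, 0
    for line in txt.split("\n"):
        comma += line.count(",")
        tab += line.count("\t")
    return "\t" if tab > comma else ","

print(guess_delimiter("a,b,c\nd,e,f"))  # ","
```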
```diff
         return res
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = get_txt(filename, binary)
+        txt = get_text(filename, binary)
         lines = txt.split("\n")
         last_question, last_answer = "", ""
         question_stack, level_stack = [], []
```

rag/app/table.py (+1 / -1)

```diff
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from deepdoc.parser.utils import get_text
-from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec
+from rag.nlp import rag_tokenizer, tokenize
 from deepdoc.parser import ExcelParser
```




