瀏覽代碼

fix gb2312 encoding issue (#394)

### What problem does this PR solve?

Issue link: #384
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
tags/v0.3.0
KevinHuSh 1 年之前
父節點
當前提交
d4e0bfc8a5
沒有連結到貢獻者的電子郵件帳戶。
共有 2 個檔案被更改,包括 6 行新增、4 行刪除
  1. 5
    3
      rag/app/naive.py
  2. 1
    1
      rag/nlp/search.py

+ 5
- 3
rag/app/naive.py 查看文件

@@ -14,8 +14,7 @@ from io import BytesIO
from docx import Document
import re
from deepdoc.parser.pdf_parser import PlainParser
from rag.app import laws
- from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks
+ from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
from rag.settings import cron_logger
@@ -140,7 +139,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.1, "Start to parse.")
txt = ""
if binary:
- txt = binary.decode("utf-8")
+ try:
+     txt = binary.decode("utf-8")
+ except Exception as e:
+     txt = binary.decode("gb2312")
else:
with open(filename, "r") as f:
while True:

+ 1
- 1
rag/nlp/search.py 查看文件

@@ -237,7 +237,7 @@ class Dealer:
pieces_.append(t)
es_logger.info("{} => {}".format(answer, pieces_))
if not pieces_:
- return answer
+ return answer, set([])

ans_v, _ = embd_mdl.encode(pieces_)
assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(

Loading…
取消
儲存