Просмотр исходного кода

Fix docx table handling bug (#510)

### What problem does this PR solve?
#509 
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
tags/v0.3.1
KevinHuSh 1 год назад
Родитель
Коммит
369400c483
Аккаунт пользователя с таким Email не найден
2 изменённых файла: 5 добавлений и 4 удаления
  1. 1
    0
      rag/app/book.py
  2. 4
    4
      rag/app/naive.py

+ 1
- 0
rag/app/book.py Просмотреть файл

binary if binary else filename, from_page=from_page, to_page=to_page) binary if binary else filename, from_page=from_page, to_page=to_page)
remove_contents_table(sections, eng=is_english( remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200))) random_choices([t for t, _ in sections], k=200)))
tbls = [((None, lns), None) for lns in tbls]
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE): elif re.search(r"\.pdf$", filename, re.IGNORECASE):

+ 4
- 4
rag/app/naive.py Просмотреть файл

from tika import parser from tika import parser
from io import BytesIO from io import BytesIO
from docx import Document from docx import Document
from timeit import default_timer as timer
import re import re
from deepdoc.parser.pdf_parser import PlainParser from deepdoc.parser.pdf_parser import PlainParser
from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec
class Pdf(PdfParser): class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0, def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None): to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer() start = timer()
callback(msg="OCR is running...") callback(msg="OCR is running...")
self.__images__( self.__images__(
start = timer() start = timer()
self._layouts_rec(zoomin) self._layouts_rec(zoomin)
callback(0.63, "Layout analysis finished.") callback(0.63, "Layout analysis finished.")
print("layouts:", timer() - start)
self._table_transformer_job(zoomin) self._table_transformer_job(zoomin)
callback(0.65, "Table analysis finished.") callback(0.65, "Table analysis finished.")
self._text_merge() self._text_merge()
self._concat_downward() self._concat_downward()
#self._filter_forpages() #self._filter_forpages()
cron_logger.info("layouts: {}".format(
(timer() - start) / (self.total_page + 0.1)))
cron_logger.info("layouts: {}".format(timer() - start))
return [(b["text"], self._line_tag(b, zoomin)) return [(b["text"], self._line_tag(b, zoomin))
for b in self.boxes], tbls for b in self.boxes], tbls
raise NotImplementedError( raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)") "file type not supported yet(doc, docx, pdf, txt supported)")
st = timer()
chunks = naive_merge( chunks = naive_merge(
sections, parser_config.get( sections, parser_config.get(
"chunk_token_num", 128), parser_config.get( "chunk_token_num", 128), parser_config.get(
"delimiter", "\n!?。;!?")) "delimiter", "\n!?。;!?"))
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
return res return res

Загрузка…
Отмена
Сохранить