Browse Source

Support displaying tables in the chunks of PDF files when using the QA parser (#1263)

### What problem does this PR solve?

Support displaying tables in the chunks of PDF files when using the QA parser

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
tags/v0.8.0
Zhedong Cen 1 year ago
parent
commit
b75bb1d8d3
No account linked to committer's email address
4 changed files with 9 additions and 7 deletions
  1. 5
    6
      rag/app/qa.py
  2. 1
    0
      requirements.txt
  3. 2
    1
      requirements_arm.txt
  4. 1
    0
      requirements_dev.txt

+ 5
- 6
rag/app/qa.py View File

@@ -22,6 +22,7 @@ from rag.settings import cron_logger
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
from docx import Document
from PIL import Image
+ from markdown import markdown
class Excel(ExcelParser):
def __call__(self, fnm, binary=None, callback=None):
if not binary:
@@ -374,8 +375,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
code_block = False
level_index = [-1] * 7
for index, l in enumerate(lines):
- if not l.strip():
- continue
if l.strip().startswith('```'):
code_block = not code_block
question_level, question = 0, ''
@@ -385,10 +384,10 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{l}'
else: # is a question
- if last_answer:
+ if last_answer.strip():
sum_question = '\n'.join(question_stack)
if sum_question:
- res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
+ res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
last_answer = ''
i = question_level
@@ -397,10 +396,10 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
level_stack.pop()
question_stack.append(question)
level_stack.append(question_level)
- if last_answer:
+ if last_answer.strip():
sum_question = '\n'.join(question_stack)
if sum_question:
- res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
+ res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
return res
elif re.search(r"\.docx$", filename, re.IGNORECASE):
docx_parser = Docx()

+ 1
- 0
requirements.txt View File

@@ -143,3 +143,4 @@ webdriver-manager==4.0.1
cn2an==0.5.22
roman-numbers==1.0.2
word2number==1.1
+ markdown==3.6

+ 2
- 1
requirements_arm.txt View File

@@ -143,4 +143,5 @@ selenium==4.21.0
webdriver-manager==4.0.1
cn2an==0.5.22
roman-numbers==1.0.2
- word2number==1.1
+ word2number==1.1
+ markdown==3.6

+ 1
- 0
requirements_dev.txt View File

@@ -129,3 +129,4 @@ html_text==0.6.2
cn2an==0.5.22
roman-numbers==1.0.2
word2number==1.1
+ markdown==3.6

Loading…
Cancel
Save