Przeglądaj źródła

Let presentation files use RAPTOR (#2838)

### What problem does this PR solve?

#2837

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
tags/v0.13.0
Kevin Hu 1 rok temu
rodzic
commit
b540d41cdc
No account linked to committer's email address
2 zmienionych plików z 13 dodań i 3 usunięć
  1. 3
    2
      api/apps/document_app.py
  2. 10
    1
      rag/app/qa.py

+ 3
- 2
api/apps/document_app.py Wyświetl plik

else: else:
return get_json_result(data=True) return get_json_result(data=True)


if doc.type == FileType.VISUAL or re.search(
r"\.(ppt|pptx|pages)$", doc.name):
if ((doc.type == FileType.VISUAL and req["parser_id"] != "picture")
or (re.search(
r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation")):
return get_data_error_result(retmsg="Not supported yet!") return get_data_error_result(retmsg="Not supported yet!")


e = DocumentService.update_by_id(doc.id, e = DocumentService.update_by_id(doc.id,

+ 10
- 1
rag/app/qa.py Wyświetl plik

[rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1]) [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
return res return res



class Pdf(PdfParser): class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0, def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None): to_page=100000, zoomin=3, callback=None):
if last_q: if last_q:
qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True))) qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
return qai_list, tbls return qai_list, tbls

def get_tbls_info(self, tbls, tbl_index): def get_tbls_info(self, tbls, tbl_index):
if tbl_index >= len(tbls): if tbl_index >= len(tbls):
return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', '' return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \ tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
.format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom) .format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
tbl_text = ''.join(tbls[tbl_index][0][1]) tbl_text = ''.join(tbls[tbl_index][0][1])
return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text
return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag,


class Docx(DocxParser): class Docx(DocxParser):
def __init__(self): def __init__(self):
pass pass

def get_picture(self, document, paragraph): def get_picture(self, document, paragraph):
img = paragraph._element.xpath('.//pic:pic') img = paragraph._element.xpath('.//pic:pic')
if not img: if not img:
tbls.append(((None, html), "")) tbls.append(((None, html), ""))
return qai_list, tbls return qai_list, tbls



def rmPrefix(txt): def rmPrefix(txt):
return re.sub( return re.sub(
r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE) r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)
add_positions(d, poss) add_positions(d, poss)
return d return d



def beAdocDocx(d, q, a, eng, image): def beAdocDocx(d, q, a, eng, image):
qprefix = "Question: " if eng else "问题:" qprefix = "Question: " if eng else "问题:"
aprefix = "Answer: " if eng else "回答:" aprefix = "Answer: " if eng else "回答:"
d["image"] = image d["image"] = image
return d return d



def beAdoc(d, q, a, eng): def beAdoc(d, q, a, eng):
qprefix = "Question: " if eng else "问题:" qprefix = "Question: " if eng else "问题:"
aprefix = "Answer: " if eng else "回答:" aprefix = "Answer: " if eng else "回答:"
match = re.match(r'#*', s) match = re.match(r'#*', s)
return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s) return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)



def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
""" """
Excel and csv(txt) format files are supported. Excel and csv(txt) format files are supported.

Ładowanie…
Anuluj
Zapisz