
Some document API refined. (#53)

Add naive chunking method to RAG
tags/v0.1.0
KevinHuSh committed 1 year ago
commit 51482f3e2a

api/apps/document_app.py  (+8, -10)

     orderby = request.args.get("orderby", "create_time")
     desc = request.args.get("desc", True)
     try:
-        docs = DocumentService.get_by_kb_id(
+        docs, tol = DocumentService.get_by_kb_id(
             kb_id, page_number, items_per_page, orderby, desc, keywords)
-        return get_json_result(data=docs)
+        return get_json_result(data={"total":tol, "docs": docs})
     except Exception as e:
         return server_error_response(e)

 @manager.route('/rename', methods=['POST'])
 @login_required
-@validate_request("doc_id", "name", "old_name")
+@validate_request("doc_id", "name")
 def rename():
     req = request.json
-    if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
-            req["old_name"].lower()).suffix:
-        get_json_result(
-            data=False,
-            retmsg="The extension of file can't be changed",
-            retcode=RetCode.ARGUMENT_ERROR)
     try:
         e, doc = DocumentService.get_by_id(req["doc_id"])
         if not e:
             return get_data_error_result(retmsg="Document not found!")
+        if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(doc.name.lower()).suffix:
+            return get_json_result(
+                data=False,
+                retmsg="The extension of file can't be changed",
+                retcode=RetCode.ARGUMENT_ERROR)
         if DocumentService.query(name=req["name"], kb_id=doc.kb_id):
             return get_data_error_result(
                 retmsg="Duplicated document name in the same knowledgebase.")

api/db/services/document_service.py  (+2, -1)

                 cls.model.name.like(f"%%{keywords}%%"))
         else:
             docs = cls.model.select().where(cls.model.kb_id == kb_id)
+        count = docs.count()
         if desc:
             docs = docs.order_by(cls.model.getter_by(orderby).desc())
         else:

         docs = docs.paginate(page_number, items_per_page)
-        return list(docs.dicts())
+        return list(docs.dicts()), count

     @classmethod
     @DB.connection_context()
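
For reference, a hedged sketch of calling the updated service method. The positional argument order mirrors the call in document_app.py above; the placeholder values are illustrative, but the (docs, total) return shape matches this diff.

    # Hedged sketch: DocumentService.get_by_kb_id now returns the page AND the total count.
    from api.db.services.document_service import DocumentService

    docs, total = DocumentService.get_by_kb_id(
        "<kb_id>", 1, 15, "create_time", True, "")   # kb_id, page, page size, orderby, desc, keywords
    print(f"{total} documents in total, {len(docs)} on this page")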

rag/app/__init__.py  (+0, -91, file removed)

import re
from nltk import word_tokenize
from rag.nlp import stemmer, huqie

BULLET_PATTERN = [[
    r"第[零一二三四五六七八九十百]+(编|部分)",
    r"第[零一二三四五六七八九十百]+章",
    r"第[零一二三四五六七八九十百]+节",
    r"第[零一二三四五六七八九十百]+条",
    r"[\((][零一二三四五六七八九十百]+[\))]",
], [
    r"[0-9]{,3}[\. 、]",
    r"[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
    r"第[零一二三四五六七八九十百]+章",
    r"第[零一二三四五六七八九十百]+节",
    r"[零一二三四五六七八九十百]+[ 、]",
    r"[\((][零一二三四五六七八九十百]+[\))]",
    r"[\((][0-9]{,2}[\))]",
], [
    r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
    r"Chapter (I+V?|VI*|XI|IX|X)",
    r"Section [0-9]+",
    r"Article [0-9]+"
]
]


def bullets_category(sections):
    global BULLET_PATTERN
    hits = [0] * len(BULLET_PATTERN)
    for i, pro in enumerate(BULLET_PATTERN):
        for sec in sections:
            for p in pro:
                if re.match(p, sec):
                    hits[i] += 1
                    break
    maxium = 0
    res = -1
    for i,h in enumerate(hits):
        if h <= maxium:continue
        res = i
        maxium = h
    return res


def is_english(texts):
    eng = 0
    for t in texts:
        if re.match(r"[a-zA-Z]{2,}", t.strip()):
            eng += 1
    if eng / len(texts) > 0.8:
        return True
    return False


def tokenize(d, t, eng):
    d["content_with_weight"] = t
    if eng:
        t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
        d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
    else:
        d["content_ltks"] = huqie.qie(t)
    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])


def remove_contents_table(sections, eng=False):
    i = 0
    while i < len(sections):
        def get(i):
            nonlocal sections
            return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()
        if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
            i += 1
            continue
        sections.pop(i)
        if i >= len(sections): break
        prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
        while not prefix:
            sections.pop(i)
            if i >= len(sections): break
            prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
        sections.pop(i)
        if i >= len(sections) or not prefix: break
        for j in range(i, min(i+128, len(sections))):
            if not re.match(prefix, get(j)):
                continue
            for _ in range(i, j):sections.pop(i)
            break

rag/app/book.py  (+16, -58)

 import copy
 import random
 import re
-from io import BytesIO
-from docx import Document
 import numpy as np
-from rag.app import bullets_category, BULLET_PATTERN, is_english, tokenize, remove_contents_table
+from rag.parser import bullets_category, BULLET_PATTERN, is_english, tokenize, remove_contents_table, \
+    hierarchical_merge, make_colon_as_title, naive_merge
 from rag.nlp import huqie
 from rag.parser.docx_parser import HuDocxParser
 from rag.parser.pdf_parser import HuParser

         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
         self._text_merge()
-        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         self._merge_with_same_bullet()
         callback(0.8, "Text extraction finished")
-        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes]
+        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls

-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     doc = {
         "docnm_kwd": filename,
         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))

         callback(0.1, "Start to parse.")
         doc_parser = HuDocxParser()
         # TODO: table of contents need to be removed
-        sections, tbls = doc_parser(binary if binary else filename)
-        remove_contents_table(sections, eng = is_english(random.choices([t for t,_ in sections], k=200)))
+        sections, tbls = doc_parser(binary if binary else filename, from_page=from_page, to_page=to_page)
+        remove_contents_table(sections, eng=is_english(random.choices([t for t,_ in sections], k=200)))
         callback(0.8, "Finish parsing.")
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf()

         callback(0.8, "Finish parsing.")
     else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
-    bull = bullets_category([b["text"] for b in random.choices([t for t,_ in sections], k=100)])
-    projs = [len(BULLET_PATTERN[bull]) + 1] * len(sections)
-    levels = [[]] * len(BULLET_PATTERN[bull]) + 2
-    for i, (txt, layout) in enumerate(sections):
-        for j, p in enumerate(BULLET_PATTERN[bull]):
-            if re.match(p, txt.strip()):
-                projs[i] = j
-                levels[j].append(i)
-                break
-        else:
-            if re.search(r"(title|head)", layout):
-                projs[i] = BULLET_PATTERN[bull]
-                levels[BULLET_PATTERN[bull]].append(i)
-            else:
-                levels[BULLET_PATTERN[bull] + 1].append(i)
-    sections = [t for t,_ in sections]
-
-    def binary_search(arr, target):
-        if target > arr[-1]: return len(arr) - 1
-        if target > arr[0]: return -1
-        s, e = 0, len(arr)
-        while e - s > 1:
-            i = (e + s) // 2
-            if target > arr[i]:
-                s = i
-                continue
-            elif target < arr[i]:
-                e = i
-                continue
-            else:
-                assert False
-        return s
-
-    cks = []
-    readed = [False] * len(sections)
-    levels = levels[::-1]
-    for i, arr in enumerate(levels):
-        for j in arr:
-            if readed[j]: continue
-            readed[j] = True
-            cks.append([j])
-            if i + 1 == len(levels) - 1: continue
-            for ii in range(i + 1, len(levels)):
-                jj = binary_search(levels[ii], j)
-                if jj < 0: break
-                if jj > cks[-1][-1]: cks[-1].pop(-1)
-                cks[-1].append(levels[ii][jj])
+    make_colon_as_title(sections)
+    bull = bullets_category([t for t in random.choices([t for t,_ in sections], k=100)])
+    if bull >= 0: cks = hierarchical_merge(bull, sections, 3)
+    else: cks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;!?"))
+    sections = [t for t, _ in sections]
     # is it English
     eng = is_english(random.choices(sections, k=218))

             tokenize(d, r, eng)
             d["image"] = img
             res.append(d)
+        print("TABLE", d["content_with_weight"])
     # wrap up to es documents
     for ck in cks:
-        print("\n-".join(ck[::-1]))
-        ck = "\n".join(ck[::-1])
         d = copy.deepcopy(doc)
+        ck = "\n".join(ck)
         if pdf_parser:
             d["image"] = pdf_parser.crop(ck)
             ck = pdf_parser.remove_tag(ck)

 if __name__ == "__main__":
     import sys
-    chunk(sys.argv[1])
+    def dummy(a, b):
+        pass
+    chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)
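
The book chunker now picks its strategy at runtime: if bullets_category recognizes a heading/bullet style it builds hierarchical chunks, otherwise it falls back to the new naive_merge. A hedged sketch of driving it with the new keyword options (chunk_token_num and "delimer" are the names used in the diff; the two-argument progress callback mirrors the dummy above, and chunk is expected to return the list of chunk dicts as the other rag.app modules do):

    # Hedged sketch -- requires the full RAGFlow parsing stack to actually run.
    from rag.app import book

    def progress(prob, msg=""):
        print(f"{prob:.2f} {msg}")

    chunks = book.chunk("some_book.pdf", from_page=0, to_page=10, callback=progress,
                        chunk_token_num=256, delimer="\n。;!?")
    print(len(chunks), "chunks produced")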

rag/app/laws.py  (+26, -84)

 from io import BytesIO
 from docx import Document
 import numpy as np
-from rag.app import bullets_category, BULLET_PATTERN, is_english, tokenize
+from rag.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
+    make_colon_as_title
 from rag.nlp import huqie
 from rag.parser.docx_parser import HuDocxParser
 from rag.parser.pdf_parser import HuParser
+from rag.settings import cron_logger

 class Docx(HuDocxParser):

         line = re.sub(r"\u3000", " ", line).strip()
         return line
-    def __call__(self, filename, binary=None):
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
         self.doc = Document(
             filename) if not binary else Document(BytesIO(binary))
-        lines = [self.__clean(p.text) for p in self.doc.paragraphs]
+        pn = 0
+        lines = []
+        for p in self.doc.paragraphs:
+            if pn > to_page:break
+            if from_page <= pn < to_page and p.text.strip(): lines.append(self.__clean(p.text))
+            for run in p.runs:
+                if 'lastRenderedPageBreak' in run._element.xml:
+                    pn += 1
+                    continue
+                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                    pn += 1
         return [l for l in lines if l]

         start = timer()
         self._layouts_paddle(zoomin)
         callback(0.77, "Layout analysis finished")
-        print("paddle layouts:", timer()-start)
-        bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
-        # is it English
-        eng = is_english([b["text"] for b in bxs])
-        # Merge vertically
-        i = 0
-        while i + 1 < len(bxs):
-            b = bxs[i]
-            b_ = bxs[i + 1]
-            if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
-                bxs.pop(i)
-                continue
-            concatting_feats = [
-                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
-                len(b["text"].strip())>1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
-                b["text"].strip()[0] in "。;?!?”)),,、:",
-            ]
-            # features for not concating
-            feats = [
-                b.get("layoutno",0) != b.get("layoutno",0),
-                b["text"].strip()[-1] in "。?!?",
-                eng and b["text"].strip()[-1] in ".!?",
-                b["page_number"] == b_["page_number"] and b_["top"] - \
-                    b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
-                b["page_number"] < b_["page_number"] and abs(
-                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
-            ]
-            if any(feats) and not any(concatting_feats):
-                i += 1
-                continue
-            # merge up and down
-            b["bottom"] = b_["bottom"]
-            b["text"] += b_["text"]
-            b["x0"] = min(b["x0"], b_["x0"])
-            b["x1"] = max(b["x1"], b_["x1"])
-            bxs.pop(i + 1)
+        cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
+        self._naive_vertical_merge()
         callback(0.8, "Text extraction finished")
-        return [b["text"] + self._line_tag(b, zoomin) for b in bxs]
+        return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes]

-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     doc = {
         "docnm_kwd": filename,
         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))

     # is it English
     eng = is_english(sections)
     # Remove 'Contents' part
-    i = 0
-    while i < len(sections):
-        if not re.match(r"(contents|目录|目次|table of contents)$", re.sub(r"( | |\u3000)+", "", sections[i].split("@@")[0], re.IGNORECASE)):
-            i += 1
-            continue
-        sections.pop(i)
-        if i >= len(sections): break
-        prefix = sections[i].strip()[:3] if not eng else " ".join(sections[i].strip().split(" ")[:2])
-        while not prefix:
-            sections.pop(i)
-            if i >= len(sections): break
-            prefix = sections[i].strip()[:3] if not eng else " ".join(sections[i].strip().split(" ")[:2])
-        sections.pop(i)
-        if i >= len(sections) or not prefix: break
-        for j in range(i, min(i+128, len(sections))):
-            if not re.match(prefix, sections[j]):
-                continue
-            for _ in range(i, j):sections.pop(i)
-            break
+    remove_contents_table(sections, eng)
+    make_colon_as_title(sections)
     bull = bullets_category(sections)
-    projs = [len(BULLET_PATTERN[bull])] * len(sections)
-    for i, sec in enumerate(sections):
-        for j,p in enumerate(BULLET_PATTERN[bull]):
-            if re.match(p, sec.strip()):
-                projs[i] = j
-                break
-    readed = [0] * len(sections)
-    cks = []
-    for pr in range(len(BULLET_PATTERN[bull])-1, 1, -1):
-        for i in range(len(sections)):
-            if readed[i] or projs[i] < pr:
-                continue
-            # find father and grand-father and grand...father
-            p = projs[i]
-            readed[i] = 1
-            ck = [sections[i]]
-            for j in range(i-1, -1, -1):
-                if projs[j] >= p:continue
-                ck.append(sections[j])
-                readed[j] = 1
-                p = projs[j]
-                if p == 0: break
-            cks.append(ck[::-1])
+    cks = hierarchical_merge(bull, sections, 3)
+    if not cks: callback(0.99, "No chunk parsed out.")
     res = []
     # wrap up to es documents

 if __name__ == "__main__":
     import sys
-    chunk(sys.argv[1])
+    def dummy(a, b):
+        pass
+    chunk(sys.argv[1], callback=dummy)
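
laws.py now leans entirely on the shared helpers moved into rag.parser. A hedged sketch of that pipeline in isolation (the sample strings are invented; the helper names come straight from the imports above, and hierarchical_merge prints its own debug output):

    # Hedged sketch of the shared title/merge pipeline used by laws.chunk.
    from rag.parser import bullets_category, make_colon_as_title, hierarchical_merge

    sections = ["第一章 总则", "第一条 为了规范文档处理,制定本办法。", "第二条 本办法适用于所有上传的文档。"]
    make_colon_as_title(sections)            # no-op for plain strings; promotes "xxx:" lines to titles for (text, layout) pairs
    bull = bullets_category(sections)        # index into BULLET_PATTERN, -1 if nothing matches
    cks = hierarchical_merge(bull, sections, 3)
    for ck in cks:
        print("\n".join(ck))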

rag/app/manual.py  (+5, -4)

 import copy
 import re
-from rag.app import tokenize
+from rag.parser import tokenize
 from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
 from rag.utils import num_tokens_from_string

         return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls

-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     pdf_parser = None
     paper = {}

 if __name__ == "__main__":
     import sys
-    chunk(sys.argv[1])
+    def dummy(a, b):
+        pass
+    chunk(sys.argv[1], callback=dummy)

rag/app/naive.py  (+79, -0, new file)

import copy
import re
from rag.app import laws
from rag.parser import is_english, tokenize, naive_merge
from rag.nlp import huqie
from rag.parser.pdf_parser import HuParser
from rag.settings import cron_logger


class Pdf(HuParser):
    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page)
        callback(0.1, "OCR finished")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_paddle(zoomin)
        callback(0.77, "Layout analysis finished")
        cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
        self._naive_vertical_merge()
        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]


def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    pdf_parser = None
    sections = []
    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        for txt in laws.Docx()(filename, binary):
            sections.append((txt, ""))
        callback(0.8, "Finish parsing.")
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        sections = pdf_parser(filename if not binary else binary,
                              from_page=from_page, to_page=to_page, callback=callback)
    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
        if binary:txt = binary.decode("utf-8")
        else:
            with open(filename, "r") as f:
                while True:
                    l = f.readline()
                    if not l:break
                    txt += l
        sections = txt.split("\n")
        sections = [(l,"") for l in sections if l]
        callback(0.8, "Finish parsing.")
    else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

    cks = naive_merge(sections, kwargs.get("chunk_token_num", 128), kwargs.get("delimer", "\n。;!?"))

    eng = is_english(cks)
    res = []
    # wrap up to es documents
    for ck in cks:
        print("--", ck)
        d = copy.deepcopy(doc)
        if pdf_parser:
            d["image"] = pdf_parser.crop(ck)
            ck = pdf_parser.remove_tag(ck)
        tokenize(d, ck, eng)
        res.append(d)
    return res


if __name__ == "__main__":
    import sys

    def dummy(a, b):
        pass
    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
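
A hedged usage sketch for the new naive chunker on a plain-text file; the option names (chunk_token_num, "delimer") and the content_with_weight field come from the code above, the file name is just a placeholder.

    # Hedged sketch -- uses the same two-argument progress callback convention as above.
    from rag.app import naive

    def progress(prob, msg=""):
        print(prob, msg)

    chunks = naive.chunk("notes.txt", callback=progress, chunk_token_num=128, delimer="\n。;!?")
    for d in chunks[:3]:
        print(d["content_with_weight"][:80])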

rag/app/paper.py  (+5, -4)

 import copy
 import re
 from collections import Counter
-from rag.app import tokenize
+from rag.parser import tokenize
 from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
 import numpy as np

     }

-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     pdf_parser = None
     paper = {}

 if __name__ == "__main__":
     import sys
-    chunk(sys.argv[1])
+    def dummy(a, b):
+        pass
+    chunk(sys.argv[1], callback=dummy)

rag/app/presentation.py  (+5, -3)

 from io import BytesIO
 from pptx import Presentation
-from rag.app import tokenize, is_english
+from rag.parser import tokenize, is_english
 from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser

         return res

-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     doc = {
         "docnm_kwd": filename,
         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))

 if __name__== "__main__":
     import sys
-    print(chunk(sys.argv[1]))
+    def dummy(a, b):
+        pass
+    chunk(sys.argv[1], callback=dummy)

rag/app/qa.py  (+4, -4)

 from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
-from rag.app import is_english
+from rag.parser import is_english
 from rag.nlp import huqie, stemmer

     return d

-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+def chunk(filename, binary=None, callback=None, **kwargs):
     res = []
     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):

 if __name__== "__main__":
     import sys
-    def kk(rat, ss):
+    def dummy(a, b):
         pass
-    print(chunk(sys.argv[1], callback=kk))
+    chunk(sys.argv[1], callback=dummy)

rag/parser/__init__.py  (+217, -0)

import copy

from .pdf_parser import HuParser as PdfParser
from .docx_parser import HuDocxParser as DocxParser
from .excel_parser import HuExcelParser as ExcelParser

import re

from nltk import word_tokenize

from rag.nlp import stemmer, huqie
from ..utils import num_tokens_from_string

BULLET_PATTERN = [[
    r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
    r"第[零一二三四五六七八九十百0-9]+章",
    r"第[零一二三四五六七八九十百0-9]+节",
    r"第[零一二三四五六七八九十百0-9]+条",
    r"[\((][零一二三四五六七八九十百]+[\))]",
], [
    r"第[0-9]+章",
    r"第[0-9]+节",
    r"[0-9]{,3}[\. 、]",
    r"[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
    r"第[零一二三四五六七八九十百0-9]+章",
    r"第[零一二三四五六七八九十百0-9]+节",
    r"[零一二三四五六七八九十百]+[ 、]",
    r"[\((][零一二三四五六七八九十百]+[\))]",
    r"[\((][0-9]{,2}[\))]",
], [
    r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
    r"Chapter (I+V?|VI*|XI|IX|X)",
    r"Section [0-9]+",
    r"Article [0-9]+"
]
]


def bullets_category(sections):
    global BULLET_PATTERN
    hits = [0] * len(BULLET_PATTERN)
    for i, pro in enumerate(BULLET_PATTERN):
        for sec in sections:
            for p in pro:
                if re.match(p, sec):
                    hits[i] += 1
                    break
    maxium = 0
    res = -1
    for i, h in enumerate(hits):
        if h <= maxium: continue
        res = i
        maxium = h
    return res


def is_english(texts):
    eng = 0
    for t in texts:
        if re.match(r"[a-zA-Z]{2,}", t.strip()):
            eng += 1
    if eng / len(texts) > 0.8:
        return True
    return False


def tokenize(d, t, eng):
    d["content_with_weight"] = t
    if eng:
        t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
        d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
    else:
        d["content_ltks"] = huqie.qie(t)
    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])


def remove_contents_table(sections, eng=False):
    i = 0
    while i < len(sections):
        def get(i):
            nonlocal sections
            return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()

        if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                        re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
            i += 1
            continue
        sections.pop(i)
        if i >= len(sections): break
        prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
        while not prefix:
            sections.pop(i)
            if i >= len(sections): break
            prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
        sections.pop(i)
        if i >= len(sections) or not prefix: break
        for j in range(i, min(i + 128, len(sections))):
            if not re.match(prefix, get(j)):
                continue
            for _ in range(i, j): sections.pop(i)
            break


def make_colon_as_title(sections):
    if not sections: return []
    if type(sections[0]) == type(""): return sections
    i = 0
    while i < len(sections):
        txt, layout = sections[i]
        i += 1
        txt = txt.split("@")[0].strip()
        if not txt:
            continue
        if txt[-1] not in "::":
            continue
        txt = txt[::-1]
        arr = re.split(r"([。?!!?;;]| .)", txt)
        if len(arr) < 2 or len(arr[1]) < 32:
            continue
        sections.insert(i - 1, (arr[0][::-1], "title"))
        i += 1


def hierarchical_merge(bull, sections, depth):
    if not sections or bull < 0: return []
    if type(sections[0]) == type(""): sections = [(s, "") for s in sections]
    sections = [(t, o) for t, o in sections if t and len(t.split("@")[0].strip()) > 1 and not re.match(r"[0-9]+$", t.split("@")[0].strip())]
    bullets_size = len(BULLET_PATTERN[bull])
    levels = [[] for _ in range(bullets_size + 2)]

    def not_title(txt):
        if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt): return False
        if len(txt) >= 128: return True
        return re.search(r"[,;,。;!!]", txt)

    for i, (txt, layout) in enumerate(sections):
        for j, p in enumerate(BULLET_PATTERN[bull]):
            if re.match(p, txt.strip()) and not not_title(txt):
                levels[j].append(i)
                break
        else:
            if re.search(r"(title|head)", layout):
                levels[bullets_size].append(i)
            else:
                levels[bullets_size + 1].append(i)
    sections = [t for t, _ in sections]
    for s in sections: print("--", s)

    def binary_search(arr, target):
        if not arr: return -1
        if target > arr[-1]: return len(arr) - 1
        if target < arr[0]: return -1
        s, e = 0, len(arr)
        while e - s > 1:
            i = (e + s) // 2
            if target > arr[i]:
                s = i
                continue
            elif target < arr[i]:
                e = i
                continue
            else:
                assert False
        return s

    cks = []
    readed = [False] * len(sections)
    levels = levels[::-1]
    for i, arr in enumerate(levels[:depth]):
        for j in arr:
            if readed[j]: continue
            readed[j] = True
            cks.append([j])
            if i + 1 == len(levels) - 1: continue
            for ii in range(i + 1, len(levels)):
                jj = binary_search(levels[ii], j)
                if jj < 0: continue
                if jj > cks[-1][-1]: cks[-1].pop(-1)
                cks[-1].append(levels[ii][jj])
            for ii in cks[-1]: readed[ii] = True
    for i in range(len(cks)):
        cks[i] = [sections[j] for j in cks[i][::-1]]
        print("--------------\n", "\n* ".join(cks[i]))

    return cks


def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
    if not sections: return []
    if type(sections[0]) == type(""): sections = [(s, "") for s in sections]
    cks = [""]
    tk_nums = [0]

    def add_chunk(t, pos):
        nonlocal cks, tk_nums, delimiter
        tnum = num_tokens_from_string(t)
        if tnum < 8: pos = ""
        if tk_nums[-1] > chunk_token_num:
            cks.append(t + pos)
            tk_nums.append(tnum)
        else:
            cks[-1] += t + pos
            tk_nums[-1] += tnum

    for sec, pos in sections:
        s, e = 0, 1
        while e < len(sec):
            if sec[e] in delimiter:
                add_chunk(sec[s: e+1], pos)
                s = e + 1
                e = s + 1
            else:
                e += 1
        if s < e: add_chunk(sec[s: e], pos)

    return cks
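
To make the headline addition concrete, a hedged sketch of naive_merge on its own: sections are (text, position-tag) pairs, pieces are cut at the delimiter characters, and a new chunk is started once the running token count of the current chunk exceeds chunk_token_num. The sample sections are invented.

    # Hedged sketch of naive_merge in isolation.
    from rag.parser import naive_merge

    sections = [
        ("RAGFlow splits a document into retrievable chunks before indexing.", ""),
        ("第一条 本办法适用于所有文档。第二条 分块大小可以按需要配置。", ""),
    ]
    for ck in naive_merge(sections, chunk_token_num=16, delimiter="\n。;!?"):
        print("----", ck)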



rag/parser/docx_parser.py  (+13, -2)

             return lines
         return ["\n".join(lines)]

-    def __call__(self, fnm):
+    def __call__(self, fnm, from_page=0, to_page=100000):
         self.doc = Document(fnm) if isinstance(fnm, str) else Document(BytesIO(fnm))
-        secs = [(p.text, p.style.name) for p in self.doc.paragraphs]
+        pn = 0
+        secs = []
+        for p in self.doc.paragraphs:
+            if pn > to_page: break
+            if from_page <= pn < to_page and p.text.strip(): secs.append((p.text, p.style.name))
+            for run in p.runs:
+                if 'lastRenderedPageBreak' in run._element.xml:
+                    pn += 1
+                    continue
+                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                    pn += 1
+
         tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
         return secs, tbls
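
The docx parser can now be asked for a page range; page boundaries are inferred from rendered page breaks in the run XML, so the count is approximate. A hedged sketch of calling it (the file name is a placeholder):

    # Hedged sketch: parse only the first three (approximate) pages of a .docx.
    from rag.parser.docx_parser import HuDocxParser

    secs, tbls = HuDocxParser()("manual.docx", from_page=0, to_page=3)
    for text, style_name in secs:
        print(style_name, "|", text[:60])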

rag/parser/pdf_parser.py  (+67, -7)

             i += 1
         self.boxes = bxs

+    def _naive_vertical_merge(self):
+        bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
+        i = 0
+        while i + 1 < len(bxs):
+            b = bxs[i]
+            b_ = bxs[i + 1]
+            if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
+                bxs.pop(i)
+                continue
+            concatting_feats = [
+                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
+                len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
+                b["text"].strip()[0] in "。;?!?”)),,、:",
+            ]
+            # features for not concating
+            feats = [
+                b.get("layoutno", 0) != b.get("layoutno", 0),
+                b["text"].strip()[-1] in "。?!?",
+                self.is_english and b["text"].strip()[-1] in ".!?",
+                b["page_number"] == b_["page_number"] and b_["top"] - \
+                    b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
+                b["page_number"] < b_["page_number"] and abs(
+                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
+            ]
+            if any(feats) and not any(concatting_feats):
+                i += 1
+                continue
+            # merge up and down
+            b["bottom"] = b_["bottom"]
+            b["text"] += b_["text"]
+            b["x0"] = min(b["x0"], b_["x0"])
+            b["x1"] = max(b["x1"], b_["x1"])
+            bxs.pop(i + 1)
+        self.boxes = bxs

     def _concat_downward(self, concat_between_pages=True):
         # count boxes in the same row as a feature
         for i in range(len(self.boxes)):

     def _filter_forpages(self):
         if not self.boxes:
             return
+        findit = False
         i = 0
         while i < len(self.boxes):
             if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                 i += 1
                 continue
+            findit = True
             eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
             self.boxes.pop(i)
             if i >= len(self.boxes): break

                     continue
                 for k in range(i, j): self.boxes.pop(i)
                 break
+        if findit:return
+
+        page_dirty = [0] * len(self.page_images)
+        for b in self.boxes:
+            if re.search(r"(··|··|··)", b["text"]):
+                page_dirty[b["page_number"]-1] += 1
+        page_dirty = set([i+1 for i, t in enumerate(page_dirty) if t > 3])
+        if not page_dirty: return
+        i = 0
+        while i < len(self.boxes):
+            if self.boxes[i]["page_number"] in page_dirty:
+                self.boxes.pop(i)
+                continue
+            i += 1

     def _merge_with_same_bullet(self):
         i = 0
         while i + 1 < len(self.boxes):
             b = self.boxes[i]
             b_ = self.boxes[i + 1]
+            if not b["text"].strip():
+                self.boxes.pop(i)
+                continue
+            if not b_["text"].strip():
+                self.boxes.pop(i+1)
+                continue
+
             if b["text"].strip()[0] != b_["text"].strip()[0] \
                     or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
+                    or huqie.is_chinese(b["text"].strip()[0]) \
                     or b["top"] > b_["bottom"]:
                 i += 1
                 continue

             self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
             self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                 enumerate(self.pdf.pages[page_from:page_to])]
-            self.page_chars = [[c for c in self.pdf.pages[i].chars if self._has_color(c)] for i in
-                               range(len(self.page_images))]
+            self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
             self.total_page = len(self.pdf.pages)
         except Exception as e:
             self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
             self.page_chars = []
             mat = fitz.Matrix(zoomin, zoomin)
             self.total_page = len(self.pdf)
-            for page in self.pdf[page_from:page_to]:
-                pix = page.getPixmap(matrix=mat)
+            for i, page in enumerate(self.pdf):
+                if i < page_from:continue
+                if i >= page_to:break
+                pix = page.get_pixmap(matrix=mat)
                 img = Image.frombytes("RGB", [pix.width, pix.height],
                                       pix.samples)
                 self.page_images.append(img)
                 self.page_chars.append([])

         logging.info("Images converted.")
-        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=100))) for i in range(len(self.page_chars))]
+        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in range(len(self.page_chars))]
         if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
             self.is_english = True
         else:

             # np.max([c["bottom"] for c in chars]))
             self.__ocr_paddle(i + 1, img, chars, zoomin)

-        if not self.is_english and not all([c for c in self.page_chars]) and self.boxes:
-            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(self.boxes, k=30)]))
+        if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
+            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices([b for bxs in self.boxes for b in bxs], k=30)]))

         logging.info("Is it English:", self.is_english)


