@@ -354,6 +354,7 @@ class User(DataBaseModel, UserMixin): | |||
avatar = TextField(null=True, help_text="avatar base64 string") | |||
language = CharField(max_length=32, null=True, help_text="English|Chinese", default="Chinese") | |||
color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Dark") | |||
timezone = CharField(max_length=64, null=True, help_text="Timezone", default="UTC+8\tAsia/Shanghai") | |||
last_login_time = DateTimeField(null=True) | |||
is_authenticated = CharField(max_length=1, null=False, default="1") | |||
is_active = CharField(max_length=1, null=False, default="1") |
@@ -313,9 +313,19 @@ class HuParser: | |||
while i < len(bxs) - 1: | |||
b = bxs[i] | |||
b_ = bxs[i + 1] | |||
if b.get("layoutno", "0") != b_.get("layoutno", "1"): | |||
if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]: | |||
i += 1 | |||
continue | |||
if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3: | |||
# merge | |||
bxs[i]["x1"] = b_["x1"] | |||
bxs[i]["top"] = (b["top"] + b_["top"]) / 2 | |||
bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2 | |||
bxs[i]["text"] += b_["text"] | |||
bxs.pop(i + 1) | |||
continue | |||
i += 1 | |||
continue | |||
dis_thr = 1 | |||
dis = b["x1"] - b_["x0"] | |||
@@ -642,9 +652,9 @@ class HuParser: | |||
tk, tv = nearest(tables) | |||
fk, fv = nearest(figures) | |||
if min(tv, fv) > 2000: | |||
i += 1 | |||
continue | |||
#if min(tv, fv) > 2000: | |||
# i += 1 | |||
# continue | |||
if tv < fv: | |||
tables[tk].insert(0, c) | |||
logging.debug( | |||
@@ -711,12 +721,7 @@ class HuParser: | |||
# crop figure out and add caption | |||
for k, bxs in figures.items(): | |||
txt = "\n".join( | |||
[b["text"] for b in bxs | |||
if not re.match(r"[0-9a-z.\+%-]", b["text"].strip()) | |||
and len(b["text"].strip()) >= 4 | |||
] | |||
) | |||
txt = "\n".join([b["text"] for b in bxs]) | |||
if not txt: | |||
continue | |||
@@ -96,7 +96,7 @@ class LayoutRecognizer(Recognizer): | |||
continue | |||
bxs[i]["layoutno"] = f"{ty}-{ii}" | |||
bxs[i]["layout_type"] = lts_[ii]["type"] | |||
bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[ii]["type"]!="equation" else "figure" | |||
i += 1 | |||
for lt in ["footer", "header", "reference", "figure caption", | |||
@@ -105,7 +105,7 @@ class LayoutRecognizer(Recognizer): | |||
# add box to figure layouts which has not text box | |||
for i, lt in enumerate( | |||
[lt for lt in lts if lt["type"] == "figure"]): | |||
[lt for lt in lts if lt["type"] in ["figure","equation"]]): | |||
if lt.get("visited"): | |||
continue | |||
lt = deepcopy(lt) |
@@ -21,7 +21,6 @@ from .operators import * | |||
import numpy as np | |||
import onnxruntime as ort | |||
from api.utils.file_utils import get_project_base_directory | |||
from .postprocess import build_post_process | |||
from rag.settings import cron_logger | |||
@@ -276,18 +276,18 @@ class Recognizer(object): | |||
def find_overlapped_with_threashold(box, boxes, thr=0.3): | |||
if not boxes: | |||
return | |||
max_overlaped_i, max_overlaped, _max_overlaped = None, thr, 0 | |||
max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0 | |||
s, e = 0, len(boxes) | |||
for i in range(s, e): | |||
ov = Recognizer.overlapped_area(box, boxes[i]) | |||
_ov = Recognizer.overlapped_area(boxes[i], box) | |||
if (ov, _ov) < (max_overlaped, _max_overlaped): | |||
if (ov, _ov) < (max_overlapped, _max_overlapped): | |||
continue | |||
max_overlaped_i = i | |||
max_overlaped = ov | |||
_max_overlaped = _ov | |||
max_overlapped_i = i | |||
max_overlapped = ov | |||
_max_overlapped = _ov | |||
return max_overlaped_i | |||
return max_overlapped_i | |||
def preprocess(self, image_list): | |||
inputs = [] |
@@ -101,7 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca | |||
d = copy.deepcopy(doc) | |||
if pdf_parser: | |||
d["image"], poss = pdf_parser.crop(ck, need_position=True) | |||
add_positions(d, poss) | |||
add_positions(d, poss, from_page) | |||
ck = pdf_parser.remove_tag(ck) | |||
tokenize(d, ck, eng) | |||
res.append(d) | |||
@@ -112,7 +112,7 @@ if __name__ == "__main__": | |||
import sys | |||
def dummy(a, b): | |||
def dummy(prog=None, msg=""): | |||
pass | |||
@@ -82,8 +82,8 @@ class Dealer: | |||
) | |||
else: | |||
s = s.sort( | |||
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}}, | |||
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}}, | |||
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}}, | |||
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}}, | |||
{"create_time": {"order": "desc", "unmapped_type": "date"}}, | |||
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}} | |||
) |