| avatar = TextField(null=True, help_text="avatar base64 string") | avatar = TextField(null=True, help_text="avatar base64 string") | ||||
| language = CharField(max_length=32, null=True, help_text="English|Chinese", default="Chinese") | language = CharField(max_length=32, null=True, help_text="English|Chinese", default="Chinese") | ||||
| color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Dark") | color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Dark") | ||||
| timezone = CharField(max_length=64, null=True, help_text="Timezone", default="UTC+8\tAsia/Shanghai") | |||||
| last_login_time = DateTimeField(null=True) | last_login_time = DateTimeField(null=True) | ||||
| is_authenticated = CharField(max_length=1, null=False, default="1") | is_authenticated = CharField(max_length=1, null=False, default="1") | ||||
| is_active = CharField(max_length=1, null=False, default="1") | is_active = CharField(max_length=1, null=False, default="1") |
| while i < len(bxs) - 1: | while i < len(bxs) - 1: | ||||
| b = bxs[i] | b = bxs[i] | ||||
| b_ = bxs[i + 1] | b_ = bxs[i + 1] | ||||
| if b.get("layoutno", "0") != b_.get("layoutno", "1"): | |||||
| if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]: | |||||
| i += 1 | i += 1 | ||||
| continue | continue | ||||
| if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3: | |||||
| # merge | |||||
| bxs[i]["x1"] = b_["x1"] | |||||
| bxs[i]["top"] = (b["top"] + b_["top"]) / 2 | |||||
| bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2 | |||||
| bxs[i]["text"] += b_["text"] | |||||
| bxs.pop(i + 1) | |||||
| continue | |||||
| i += 1 | |||||
| continue | |||||
| dis_thr = 1 | dis_thr = 1 | ||||
| dis = b["x1"] - b_["x0"] | dis = b["x1"] - b_["x0"] | ||||
| tk, tv = nearest(tables) | tk, tv = nearest(tables) | ||||
| fk, fv = nearest(figures) | fk, fv = nearest(figures) | ||||
| if min(tv, fv) > 2000: | |||||
| i += 1 | |||||
| continue | |||||
| #if min(tv, fv) > 2000: | |||||
| # i += 1 | |||||
| # continue | |||||
| if tv < fv: | if tv < fv: | ||||
| tables[tk].insert(0, c) | tables[tk].insert(0, c) | ||||
| logging.debug( | logging.debug( | ||||
| # crop figure out and add caption | # crop figure out and add caption | ||||
| for k, bxs in figures.items(): | for k, bxs in figures.items(): | ||||
| txt = "\n".join( | |||||
| [b["text"] for b in bxs | |||||
| if not re.match(r"[0-9a-z.\+%-]", b["text"].strip()) | |||||
| and len(b["text"].strip()) >= 4 | |||||
| ] | |||||
| ) | |||||
| txt = "\n".join([b["text"] for b in bxs]) | |||||
| if not txt: | if not txt: | ||||
| continue | continue | ||||
| continue | continue | ||||
| bxs[i]["layoutno"] = f"{ty}-{ii}" | bxs[i]["layoutno"] = f"{ty}-{ii}" | ||||
| bxs[i]["layout_type"] = lts_[ii]["type"] | |||||
| bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[ii]["type"]!="equation" else "figure" | |||||
| i += 1 | i += 1 | ||||
| for lt in ["footer", "header", "reference", "figure caption", | for lt in ["footer", "header", "reference", "figure caption", | ||||
| # add box to figure layouts which has not text box | # add box to figure layouts which has not text box | ||||
| for i, lt in enumerate( | for i, lt in enumerate( | ||||
| [lt for lt in lts if lt["type"] == "figure"]): | |||||
| [lt for lt in lts if lt["type"] in ["figure","equation"]]): | |||||
| if lt.get("visited"): | if lt.get("visited"): | ||||
| continue | continue | ||||
| lt = deepcopy(lt) | lt = deepcopy(lt) |
| import numpy as np | import numpy as np | ||||
| import onnxruntime as ort | import onnxruntime as ort | ||||
| from api.utils.file_utils import get_project_base_directory | |||||
| from .postprocess import build_post_process | from .postprocess import build_post_process | ||||
| from rag.settings import cron_logger | from rag.settings import cron_logger | ||||
| def find_overlapped_with_threashold(box, boxes, thr=0.3): | def find_overlapped_with_threashold(box, boxes, thr=0.3): | ||||
| if not boxes: | if not boxes: | ||||
| return | return | ||||
| max_overlaped_i, max_overlaped, _max_overlaped = None, thr, 0 | |||||
| max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0 | |||||
| s, e = 0, len(boxes) | s, e = 0, len(boxes) | ||||
| for i in range(s, e): | for i in range(s, e): | ||||
| ov = Recognizer.overlapped_area(box, boxes[i]) | ov = Recognizer.overlapped_area(box, boxes[i]) | ||||
| _ov = Recognizer.overlapped_area(boxes[i], box) | _ov = Recognizer.overlapped_area(boxes[i], box) | ||||
| if (ov, _ov) < (max_overlaped, _max_overlaped): | |||||
| if (ov, _ov) < (max_overlapped, _max_overlapped): | |||||
| continue | continue | ||||
| max_overlaped_i = i | |||||
| max_overlaped = ov | |||||
| _max_overlaped = _ov | |||||
| max_overlapped_i = i | |||||
| max_overlapped = ov | |||||
| _max_overlapped = _ov | |||||
| return max_overlaped_i | |||||
| return max_overlapped_i | |||||
| def preprocess(self, image_list): | def preprocess(self, image_list): | ||||
| inputs = [] | inputs = [] |
| d = copy.deepcopy(doc) | d = copy.deepcopy(doc) | ||||
| if pdf_parser: | if pdf_parser: | ||||
| d["image"], poss = pdf_parser.crop(ck, need_position=True) | d["image"], poss = pdf_parser.crop(ck, need_position=True) | ||||
| add_positions(d, poss) | |||||
| add_positions(d, poss, from_page) | |||||
| ck = pdf_parser.remove_tag(ck) | ck = pdf_parser.remove_tag(ck) | ||||
| tokenize(d, ck, eng) | tokenize(d, ck, eng) | ||||
| res.append(d) | res.append(d) | ||||
| import sys | import sys | ||||
| def dummy(a, b): | |||||
| def dummy(prog=None, msg=""): | |||||
| pass | pass | ||||
| ) | ) | ||||
| else: | else: | ||||
| s = s.sort( | s = s.sort( | ||||
| {"page_num_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}}, | |||||
| {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}}, | |||||
| {"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}}, | |||||
| {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}}, | |||||
| {"create_time": {"order": "desc", "unmapped_type": "date"}}, | {"create_time": {"order": "desc", "unmapped_type": "date"}}, | ||||
| {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}} | {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}} | ||||
| ) | ) |