
Upgrades Document Layout Analysis model. (#4054)

### What problem does this PR solve?

#4052

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Kevin Hu · commit ce1e855328 (tag: v0.15.0) · 10 months ago

**api/db/services/task_service.py** (+2, -2)

task["progress"] = 0.0 task["progress"] = 0.0


prev_tasks = TaskService.get_tasks(doc["id"]) prev_tasks = TaskService.get_tasks(doc["id"])
ck_num = 0
if prev_tasks: if prev_tasks:
ck_num = 0
for task in tsks: for task in tsks:
ck_num += reuse_prev_task_chunks(task, prev_tasks, chunking_config) ck_num += reuse_prev_task_chunks(task, prev_tasks, chunking_config)
TaskService.filter_delete([Task.doc_id == doc["id"]]) TaskService.filter_delete([Task.doc_id == doc["id"]])
chunk_ids.extend(task["chunk_ids"].split()) chunk_ids.extend(task["chunk_ids"].split())
if chunk_ids: if chunk_ids:
settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"]) settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"])
DocumentService.update_by_id(doc["id"], {"chunk_num": ck_num})
DocumentService.update_by_id(doc["id"], {"chunk_num": ck_num})


bulk_insert_into_db(Task, tsks, True) bulk_insert_into_db(Task, tsks, True)
DocumentService.begin2parse(doc["id"]) DocumentService.begin2parse(doc["id"])

**conf/infinity_mapping.json** (+2, -0)

"content_with_weight": {"type": "varchar", "default": ""}, "content_with_weight": {"type": "varchar", "default": ""},
"content_ltks": {"type": "varchar", "default": "", "analyzer": "whitespace"}, "content_ltks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
"content_sm_ltks": {"type": "varchar", "default": "", "analyzer": "whitespace"}, "content_sm_ltks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
"authors_tks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
"authors_sm_tks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
"page_num_int": {"type": "varchar", "default": ""}, "page_num_int": {"type": "varchar", "default": ""},
"top_int": {"type": "varchar", "default": ""}, "top_int": {"type": "varchar", "default": ""},
"position_int": {"type": "varchar", "default": ""}, "position_int": {"type": "varchar", "default": ""},

**deepdoc/vision/__init__.py** (+2, -1)



  from .ocr import OCR
  from .recognizer import Recognizer
- from .layout_recognizer import LayoutRecognizer
+ from .layout_recognizer import LayoutRecognizer4YOLOv10 as LayoutRecognizer
  from .table_structure_recognizer import TableStructureRecognizer


  def init_in_out(args):
      from PIL import Image
      import os
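
Because the new class is re-exported under the old name, downstream code that does `from deepdoc.vision import LayoutRecognizer` picks up the YOLOv10-based recognizer without any changes. A minimal sketch of such a call site (the argument is illustrative; the new `__init__` overrides whatever is passed and always loads the `"layout"` domain):

```python
# Hedged sketch: the alias keeps existing imports working unchanged.
from deepdoc.vision import LayoutRecognizer  # resolves to LayoutRecognizer4YOLOv10 after this PR

recognizer = LayoutRecognizer("layout")  # any domain string is forced to "layout" in __init__
```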

**deepdoc/vision/layout_recognizer.py** (+88, -0)

  import re
  from collections import Counter
  from copy import deepcopy

+ import cv2
  import numpy as np
  from huggingface_hub import snapshot_download

  from api.utils.file_utils import get_project_base_directory
  from deepdoc.vision import Recognizer
+ from deepdoc.vision.operators import nms


  class LayoutRecognizer(Recognizer):

          ocr_res = [b for b in ocr_res if b["text"].strip() not in garbag_set]
          return ocr_res, page_layout


+ class LayoutRecognizer4YOLOv10(LayoutRecognizer):
+     labels = [
+         "title",
+         "Text",
+         "Reference",
+         "Figure",
+         "Figure caption",
+         "Table",
+         "Table caption",
+         "Table caption",
+         "Equation",
+         "Figure caption",
+     ]
+
+     def __init__(self, domain):
+         domain = "layout"
+         super().__init__(domain)
+         self.auto = False
+         self.scaleFill = False
+         self.scaleup = True
+         self.stride = 32
+         self.center = True
+
+     def preprocess(self, image_list):
+         inputs = []
+         new_shape = self.input_shape  # height, width
+         for img in image_list:
+             shape = img.shape[:2]  # current shape [height, width]
+             # Scale ratio (new / old)
+             r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+             # Compute padding
+             new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+             dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
+             dw /= 2  # divide padding into 2 sides
+             dh /= 2
+             ww, hh = new_unpad
+             img = np.array(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).astype(np.float32)
+             img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
+             top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
+             left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
+             img = cv2.copyMakeBorder(
+                 img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
+             )  # add border
+             img /= 255.0
+             img = img.transpose(2, 0, 1)
+             img = img[np.newaxis, :, :, :].astype(np.float32)
+             inputs.append({self.input_names[0]: img, "scale_factor": [shape[1]/ww, shape[0]/hh, dw, dh]})
+
+         return inputs
+
+     def postprocess(self, boxes, inputs, thr):
+         thr = 0.08
+         boxes = np.squeeze(boxes)
+         scores = boxes[:, 4]
+         boxes = boxes[scores > thr, :]
+         scores = scores[scores > thr]
+         if len(boxes) == 0:
+             return []
+         class_ids = boxes[:, -1].astype(int)
+         boxes = boxes[:, :4]
+         boxes[:, 0] -= inputs["scale_factor"][2]
+         boxes[:, 2] -= inputs["scale_factor"][2]
+         boxes[:, 1] -= inputs["scale_factor"][3]
+         boxes[:, 3] -= inputs["scale_factor"][3]
+         input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1],
+                                 inputs["scale_factor"][0], inputs["scale_factor"][1]])
+         boxes = np.multiply(boxes, input_shape, dtype=np.float32)
+
+         unique_class_ids = np.unique(class_ids)
+         indices = []
+         for class_id in unique_class_ids:
+             class_indices = np.where(class_ids == class_id)[0]
+             class_boxes = boxes[class_indices, :]
+             class_scores = scores[class_indices]
+             class_keep_boxes = nms(class_boxes, class_scores, 0.45)
+             indices.extend(class_indices[class_keep_boxes])
+
+         return [{
+             "type": self.label_list[class_ids[i]].lower(),
+             "bbox": [float(t) for t in boxes[i].tolist()],
+             "score": float(scores[i])
+         } for i in indices]
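
`preprocess` letterboxes each page image to the model's input size (scale with preserved aspect ratio, then pad with gray 114 borders) and records a `scale_factor` that `postprocess` later uses to subtract the padding, map boxes back to the original page coordinates, and run per-class NMS. A standalone sketch of the letterbox arithmetic, assuming a 1024×1024 input shape purely for illustration:

```python
# Illustrative only: the input shape and page size are assumptions, not values from this PR.
new_shape = (1024, 1024)   # model input (height, width), normally self.input_shape
shape = (2200, 1700)       # page image (height, width)

r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])        # scale ratio ~0.4655
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))   # resized (width, height) = (791, 1024)
dw = (new_shape[1] - new_unpad[0]) / 2                           # 116.5 px of padding left/right
dh = (new_shape[0] - new_unpad[1]) / 2                           # 0.0 px of padding top/bottom

print(r, new_unpad, dw, dh)
```

In `postprocess`, removing `dw`/`dh` and multiplying by `scale_factor[0]`/`scale_factor[1]` (i.e. 1/r per axis) returns the boxes to the original page's pixel space.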


**deepdoc/vision/operators.py** (+26, -0)

      for operator in preprocess_ops:
          im, im_info = operator(im, im_info)
      return im, im_info


+ def nms(bboxes, scores, iou_thresh):
+     import numpy as np
+     x1 = bboxes[:, 0]
+     y1 = bboxes[:, 1]
+     x2 = bboxes[:, 2]
+     y2 = bboxes[:, 3]
+     areas = (y2 - y1) * (x2 - x1)
+
+     indices = []
+     index = scores.argsort()[::-1]
+     while index.size > 0:
+         i = index[0]
+         indices.append(i)
+         x11 = np.maximum(x1[i], x1[index[1:]])
+         y11 = np.maximum(y1[i], y1[index[1:]])
+         x22 = np.minimum(x2[i], x2[index[1:]])
+         y22 = np.minimum(y2[i], y2[index[1:]])
+         w = np.maximum(0, x22 - x11 + 1)
+         h = np.maximum(0, y22 - y11 + 1)
+         overlaps = w * h
+         ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
+         idx = np.where(ious <= iou_thresh)[0]
+         index = index[idx + 1]
+     return indices
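
The helper is a standard greedy, score-sorted NMS: the highest-scoring box is kept and any remaining box whose IoU with it exceeds `iou_thresh` is discarded, repeating until no candidates remain. A small usage sketch (the boxes and scores below are invented for illustration):

```python
import numpy as np
from deepdoc.vision.operators import nms  # assumes the RAGFlow package is on PYTHONPATH

boxes = np.array([[0, 0, 10, 10],     # highest score, kept
                  [1, 1, 11, 11],     # IoU with the first box ~1.0, suppressed at threshold 0.45
                  [50, 50, 60, 60]])  # no overlap, kept
scores = np.array([0.9, 0.8, 0.7])

print(nms(boxes, scores, 0.45))  # kept box indices: 0 and 2
```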
