#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import re
from collections import Counter
from copy import deepcopy

import cv2
import numpy as np
from huggingface_hub import snapshot_download

from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import Recognizer
from deepdoc.vision.operators import nms


class LayoutRecognizer(Recognizer):
    labels = [
        "_background_",
        "Text",
        "Title",
        "Figure",
        "Figure caption",
        "Table",
        "Table caption",
        "Header",
        "Footer",
        "Reference",
        "Equation",
    ]

    def __init__(self, domain):
        try:
            model_dir = os.path.join(
                get_project_base_directory(),
                "rag/res/deepdoc")
            super().__init__(self.labels, domain, model_dir)
        except Exception:
            model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc",
                                          local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
                                          local_dir_use_symlinks=False)
            super().__init__(self.labels, domain, model_dir)

        self.garbage_layouts = ["footer", "header", "reference"]
        self.client = None
        if os.environ.get("TENSORRT_DLA_SVR"):
            from deepdoc.vision.dla_cli import DLAClient
            self.client = DLAClient(os.environ["TENSORRT_DLA_SVR"])
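
    # Added descriptive note: __call__ below tags every OCR text box with the layout
    # region it falls into. image_list holds the page images (PIL-style objects, given
    # the use of .size); ocr_res holds one list of OCR boxes per page, each with "text",
    # "x0"/"x1" and "top"/"bottom"; scale_factor relates detector coordinates to OCR
    # coordinates. It returns the surviving boxes (annotated with "layout_type" and
    # "layoutno") plus the per-page layout regions. With drop=True, boxes inside
    # header/footer/reference regions are removed, and text dropped from those regions
    # more than once is also filtered out wherever else it appears.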
    def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
        def __is_garbage(b):
            patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$",
                    r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
                    "\\(cid *: *[0-9]+ *\\)"
                    ]
            return any([re.search(p, b["text"]) for p in patt])

        if self.client:
            layouts = self.client.predict(image_list)
        else:
            layouts = super().__call__(image_list, thr, batch_size)
        # save_results(image_list, layouts, self.labels, output_dir='output/', threshold=0.7)
        assert len(image_list) == len(ocr_res)
        # Tag layout type
        boxes = []
        assert len(image_list) == len(layouts)
        garbages = {}
        page_layout = []
        for pn, lts in enumerate(layouts):
            bxs = ocr_res[pn]
            lts = [{"type": b["type"],
                    "score": float(b["score"]),
                    "x0": b["bbox"][0] / scale_factor, "x1": b["bbox"][2] / scale_factor,
                    "top": b["bbox"][1] / scale_factor, "bottom": b["bbox"][-1] / scale_factor,
                    "page_number": pn,
                    } for b in lts if float(b["score"]) >= 0.4 or b["type"] not in self.garbage_layouts]
            lts = self.sort_Y_firstly(lts, np.mean(
                [lt["bottom"] - lt["top"] for lt in lts]) / 2)
            lts = self.layouts_cleanup(bxs, lts)
            page_layout.append(lts)

            # Tag layout type, layouts are ready
            def findLayout(ty):
                nonlocal bxs, lts, self
                lts_ = [lt for lt in lts if lt["type"] == ty]
                i = 0
                while i < len(bxs):
                    if bxs[i].get("layout_type"):
                        i += 1
                        continue
                    if __is_garbage(bxs[i]):
                        bxs.pop(i)
                        continue

                    ii = self.find_overlapped_with_threshold(bxs[i], lts_,
                                                             thr=0.4)
                    if ii is None:  # belong to nothing
                        bxs[i]["layout_type"] = ""
                        i += 1
                        continue
                    lts_[ii]["visited"] = True
                    keep_feats = [
                        lts_[ii]["type"] == "footer" and bxs[i]["bottom"] < image_list[pn].size[1] * 0.9 / scale_factor,
                        lts_[ii]["type"] == "header" and bxs[i]["top"] > image_list[pn].size[1] * 0.1 / scale_factor,
                    ]
                    if drop and lts_[ii]["type"] in self.garbage_layouts and not any(keep_feats):
                        if lts_[ii]["type"] not in garbages:
                            garbages[lts_[ii]["type"]] = []
                        garbages[lts_[ii]["type"]].append(bxs[i]["text"])
                        bxs.pop(i)
                        continue

                    bxs[i]["layoutno"] = f"{ty}-{ii}"
                    bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[ii]["type"] != "equation" else "figure"
                    i += 1

            for lt in ["footer", "header", "reference", "figure caption",
                       "table caption", "title", "table", "text", "figure", "equation"]:
                findLayout(lt)

            # add a placeholder box for figure/equation layouts that have no text box
            for i, lt in enumerate(
                    [lt for lt in lts if lt["type"] in ["figure", "equation"]]):
                if lt.get("visited"):
                    continue
                lt = deepcopy(lt)
                del lt["type"]
                lt["text"] = ""
                lt["layout_type"] = "figure"
                lt["layoutno"] = f"figure-{i}"
                bxs.append(lt)

            boxes.extend(bxs)

        ocr_res = boxes

        garbag_set = set()
        for k in garbages.keys():
            garbages[k] = Counter(garbages[k])
            for g, c in garbages[k].items():
                if c > 1:
                    garbag_set.add(g)

        ocr_res = [b for b in ocr_res if b["text"].strip() not in garbag_set]
        return ocr_res, page_layout
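
    # Illustrative usage sketch of the recognizer above (not part of the original
    # module): page_images and ocr_boxes_per_page are hypothetical variables standing in
    # for the page images and per-page OCR output produced elsewhere in deepdoc.
    #
    #   recognizer = LayoutRecognizer("layout")
    #   boxes, page_layouts = recognizer(page_images, ocr_boxes_per_page, scale_factor=3)
    #   titles = [b["text"] for b in boxes if b["layout_type"] == "title"]
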
    def forward(self, image_list, thr=0.7, batch_size=16):
        return super().__call__(image_list, thr, batch_size)

class LayoutRecognizer4YOLOv10(LayoutRecognizer):
    labels = [
        "title",
        "Text",
        "Reference",
        "Figure",
        "Figure caption",
        "Table",
        "Table caption",
        "Table caption",
        "Equation",
        "Figure caption",
    ]
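    # Added descriptive note: entries may repeat because several detector classes are
    # collapsed onto the same deepdoc label; class ids returned by the model index this
    # list (as self.label_list in postprocess), so order and length must match the
    # model's class layout.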

    def __init__(self, domain):
        domain = "layout"
        super().__init__(domain)
        self.auto = False
        self.scaleFill = False
        self.scaleup = True
        self.stride = 32
        self.center = True
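
    # Added descriptive note: the flags above appear to mirror Ultralytics-style
    # LetterBox options (auto, scaleFill, scaleup, stride, center); only self.center is
    # actually consulted in the preprocess shown below.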

    def preprocess(self, image_list):
        inputs = []
        new_shape = self.input_shape  # height, width
        for img in image_list:
            shape = img.shape[:2]  # current shape [height, width]
            # Scale ratio (new / old)
            r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
            # Compute padding
            new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
            dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
            dw /= 2  # divide padding into 2 sides
            dh /= 2
            ww, hh = new_unpad
            img = np.array(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).astype(np.float32)
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
            top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
            left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
            img = cv2.copyMakeBorder(
                img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
            )  # add border
            img /= 255.0
            img = img.transpose(2, 0, 1)
            img = img[np.newaxis, :, :, :].astype(np.float32)
            inputs.append({self.input_names[0]: img, "scale_factor": [shape[1]/ww, shape[0]/hh, dw, dh]})
        return inputs
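
    # Worked example of the letterboxing above, assuming (for illustration only) an
    # input_shape of (1024, 1024): an 800x600 (h x w) page gives
    # r = min(1024/800, 1024/600) = 1.28 and new_unpad = (768, 1024) (w, h), so
    # dw = 256 and dh = 0; after halving, 128 px of grey (114) padding is added on the
    # left and on the right, and "scale_factor" is stored as
    # [600/768, 800/1024, 128.0, 0.0] for postprocess to undo.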

    def postprocess(self, boxes, inputs, thr):
        thr = 0.08
        boxes = np.squeeze(boxes)
        scores = boxes[:, 4]
        boxes = boxes[scores > thr, :]
        scores = scores[scores > thr]
        if len(boxes) == 0:
            return []
        class_ids = boxes[:, -1].astype(int)
        boxes = boxes[:, :4]
        boxes[:, 0] -= inputs["scale_factor"][2]
        boxes[:, 2] -= inputs["scale_factor"][2]
        boxes[:, 1] -= inputs["scale_factor"][3]
        boxes[:, 3] -= inputs["scale_factor"][3]
        input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0],
                                inputs["scale_factor"][1]])
        boxes = np.multiply(boxes, input_shape, dtype=np.float32)
        unique_class_ids = np.unique(class_ids)
        indices = []
        for class_id in unique_class_ids:
            class_indices = np.where(class_ids == class_id)[0]
            class_boxes = boxes[class_indices, :]
            class_scores = scores[class_indices]
            class_keep_boxes = nms(class_boxes, class_scores, 0.45)
            indices.extend(class_indices[class_keep_boxes])

        return [{
            "type": self.label_list[class_ids[i]].lower(),
            "bbox": [float(t) for t in boxes[i].tolist()],
            "score": float(scores[i])
        } for i in indices]
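
# Added descriptive note: postprocess applies a fixed 0.08 confidence floor, undoes the
# letterbox padding and scaling via the stored "scale_factor", and runs per-class NMS at
# IoU 0.45, so the returned dicts carry "type", "score" and "bbox" in original page
# coordinates; these match the b["type"]/b["score"]/b["bbox"] fields read in
# LayoutRecognizer.__call__ above.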