瀏覽代碼

Fix: optimize OCR garbage identification to reduce unnecessary filtering (#6027)

### What problem does this PR solve?

Optimize OCR garbage identification to reduce unnecessary filtering.
#5713

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
tags/v0.18.0
Yongteng Lei 7 月之前
父節點
當前提交
4ff609b6a8
沒有連結到貢獻者的電子郵件帳戶。
共有 2 個檔案被更改,包括 17 行新增和 17 行刪除
  1. 6
    7
      deepdoc/vision/layout_recognizer.py
  2. 11
    10
      rag/app/naive.py

+ 6
- 7
deepdoc/vision/layout_recognizer.py 查看文件

@@ -46,8 +46,8 @@ class LayoutRecognizer(Recognizer):
def __init__(self, domain):
try:
model_dir = os.path.join(
get_project_base_directory(),
"rag/res/deepdoc")
get_project_base_directory(),
"rag/res/deepdoc")
super().__init__(self.labels, domain, model_dir)
except Exception:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc",
@@ -60,9 +60,8 @@ class LayoutRecognizer(Recognizer):
def __call__(self, image_list, ocr_res, scale_factor=3,
thr=0.2, batch_size=16, drop=True):
def __is_garbage(b):
- patt = [r"^•+$", r"(版权归©|免责条款|地址[::])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$",
+ patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$",
r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
"(资料|数据)来源[::]", "[0-9a-z._-]+@[a-z0-9-]+\\.[a-z]{2,3}",
"\\(cid *: *[0-9]+ *\\)"
]
return any([re.search(p, b["text"]) for p in patt])
@@ -160,6 +159,7 @@ class LayoutRecognizer(Recognizer):
def forward(self, image_list, thr=0.7, batch_size=16):
return super().__call__(image_list, thr, batch_size)


class LayoutRecognizer4YOLOv10(LayoutRecognizer):
labels = [
"title",
@@ -185,9 +185,9 @@ class LayoutRecognizer4YOLOv10(LayoutRecognizer):

def preprocess(self, image_list):
inputs = []
new_shape = self.input_shape # height, width
new_shape = self.input_shape # height, width
for img in image_list:
shape = img.shape[:2]# current shape [height, width]
shape = img.shape[:2] # current shape [height, width]
# Scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
# Compute padding
@@ -242,4 +242,3 @@ class LayoutRecognizer4YOLOv10(LayoutRecognizer):
"bbox": [float(t) for t in boxes[i].tolist()],
"score": float(scores[i])
} for i in indices]


+ 11
- 10
rag/app/naive.py 查看文件

@@ -15,20 +15,21 @@
#

import logging
from tika import parser
import re
from functools import reduce
from io import BytesIO
from docx import Document
from timeit import default_timer as timer
import re

from docx import Document
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
from markdown import markdown
from PIL import Image
from tika import parser

from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.pdf_parser import PlainParser
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, \
naive_merge_docx, tokenize_chunks_docx
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table
from rag.utils import num_tokens_from_string
from PIL import Image
from functools import reduce
from markdown import markdown
from docx.image.exceptions import UnrecognizedImageError, UnexpectedEndOfFileError, InvalidImageStreamError


class Docx(DocxParser):

Loading…
取消
儲存