浏览代码

Fix: optimize OCR garbage identification to reduce unnecessary filtering (#6027)

### What problem does this PR solve?

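Optimize OCR garbage identification to reduce unnecessary filtering: the
garbage patterns `(版权归©|免责条款|地址[::])` and `\.{3,}` are removed from
`__is_garbage` in `deepdoc/vision/layout_recognizer.py`, so text matching only
those patterns is no longer classified as garbage.

Related issue: #5713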

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
tags/v0.18.0
Yongteng Lei 7 个月前
父节点
当前提交
4ff609b6a8
没有帐户链接到提交者的电子邮件
共有 2 个文件被更改,包括 17 次插入和 17 次删除
  1. 6
    7
      deepdoc/vision/layout_recognizer.py
  2. 11
    10
      rag/app/naive.py

+ 6
- 7
deepdoc/vision/layout_recognizer.py 查看文件

@@ -46,8 +46,8 @@ class LayoutRecognizer(Recognizer):
def __init__(self, domain):
try:
model_dir = os.path.join(
get_project_base_directory(),
"rag/res/deepdoc")
get_project_base_directory(),
"rag/res/deepdoc")
super().__init__(self.labels, domain, model_dir)
except Exception:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc",
@@ -60,9 +60,8 @@ class LayoutRecognizer(Recognizer):
def __call__(self, image_list, ocr_res, scale_factor=3,
thr=0.2, batch_size=16, drop=True):
def __is_garbage(b):
patt = [r"^•+$", r"(版权归©|免责条款|地址[::])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$",
patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$",
r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
"(资料|数据)来源[::]", "[0-9a-z._-]+@[a-z0-9-]+\\.[a-z]{2,3}",
"\\(cid *: *[0-9]+ *\\)"
]
return any([re.search(p, b["text"]) for p in patt])
@@ -160,6 +159,7 @@ class LayoutRecognizer(Recognizer):
def forward(self, image_list, thr=0.7, batch_size=16):
return super().__call__(image_list, thr, batch_size)


class LayoutRecognizer4YOLOv10(LayoutRecognizer):
labels = [
"title",
@@ -185,9 +185,9 @@ class LayoutRecognizer4YOLOv10(LayoutRecognizer):

def preprocess(self, image_list):
inputs = []
new_shape = self.input_shape # height, width
new_shape = self.input_shape # height, width
for img in image_list:
shape = img.shape[:2]# current shape [height, width]
shape = img.shape[:2] # current shape [height, width]
# Scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
# Compute padding
@@ -242,4 +242,3 @@ class LayoutRecognizer4YOLOv10(LayoutRecognizer):
"bbox": [float(t) for t in boxes[i].tolist()],
"score": float(scores[i])
} for i in indices]


+ 11
- 10
rag/app/naive.py 查看文件

@@ -15,20 +15,21 @@
#

import logging
from tika import parser
import re
from functools import reduce
from io import BytesIO
from docx import Document
from timeit import default_timer as timer
import re

from docx import Document
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
from markdown import markdown
from PIL import Image
from tika import parser

from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.pdf_parser import PlainParser
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, \
naive_merge_docx, tokenize_chunks_docx
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table
from rag.utils import num_tokens_from_string
from PIL import Image
from functools import reduce
from markdown import markdown
from docx.image.exceptions import UnrecognizedImageError, UnexpectedEndOfFileError, InvalidImageStreamError


class Docx(DocxParser):

正在加载...
取消
保存