Browse Source

bigger resolution for OCR (#2919)

### What problem does this PR solve?



### Type of change

- [x] Performance Improvement
tags/v0.13.0
Kevin Hu 1 year ago
parent
commit
bfc07fe4f9
No account linked to committer's email address
1 changed file with 5 additions and 3 deletions
  1. deepdoc/parser/pdf_parser.py (5 additions, 3 deletions)

+ 5
- 3
deepdoc/parser/pdf_parser.py View File

fnm, str) else pdfplumber.open(BytesIO(fnm)) fnm, str) else pdfplumber.open(BytesIO(fnm))
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
enumerate(self.pdf.pages[page_from:page_to])] enumerate(self.pdf.pages[page_from:page_to])]
self.page_images_x2 = [p.to_image(resolution=72 * zoomin * 2).annotated for i, p in
enumerate(self.pdf.pages[page_from:page_to])]
self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in
self.pdf.pages[page_from:page_to]] self.pdf.pages[page_from:page_to]]
self.total_page = len(self.pdf.pages) self.total_page = len(self.pdf.pages)
self.is_english = False self.is_english = False


st = timer() st = timer()
for i, img in enumerate(self.page_images):
for i, img in enumerate(self.page_images_x2):
chars = self.page_chars[i] if not self.is_english else [] chars = self.page_chars[i] if not self.is_english else []
self.mean_height.append( self.mean_height.append(
np.median(sorted([c["height"] for c in chars])) if chars else 0 np.median(sorted([c["height"] for c in chars])) if chars else 0
self.mean_width.append( self.mean_width.append(
np.median(sorted([c["width"] for c in chars])) if chars else 8 np.median(sorted([c["width"] for c in chars])) if chars else 8
) )
self.page_cum_height.append(img.size[1] / zoomin)
self.page_cum_height.append(img.size[1] / zoomin/2)
j = 0 j = 0
while j + 1 < len(chars): while j + 1 < len(chars):
if chars[j]["text"] and chars[j + 1]["text"] \ if chars[j]["text"] and chars[j + 1]["text"] \
chars[j]["text"] += " " chars[j]["text"] += " "
j += 1 j += 1


self.__ocr(i + 1, img, chars, zoomin)
self.__ocr(i + 1, img, chars, zoomin*2)
if callback and i % 6 == 5: if callback and i % 6 == 5:
callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="") callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
# print("OCR:", timer()-st) # print("OCR:", timer()-st)

Loading…
Cancel
Save