You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. import copy
  2. import re
  3. from rag.app import laws
  4. from rag.parser import is_english, tokenize, naive_merge
  5. from rag.nlp import huqie
  6. from rag.parser.pdf_parser import HuParser
  7. from rag.settings import cron_logger
  8. class Pdf(HuParser):
  9. def __call__(self, filename, binary=None, from_page=0,
  10. to_page=100000, zoomin=3, callback=None):
  11. self.__images__(
  12. filename if not binary else binary,
  13. zoomin,
  14. from_page,
  15. to_page)
  16. callback(0.1, "OCR finished")
  17. from timeit import default_timer as timer
  18. start = timer()
  19. self._layouts_paddle(zoomin)
  20. callback(0.77, "Layout analysis finished")
  21. cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
  22. self._naive_vertical_merge()
  23. return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
  24. def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
  25. doc = {
  26. "docnm_kwd": filename,
  27. "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
  28. }
  29. doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
  30. pdf_parser = None
  31. sections = []
  32. if re.search(r"\.docx?$", filename, re.IGNORECASE):
  33. callback(0.1, "Start to parse.")
  34. for txt in laws.Docx()(filename, binary):
  35. sections.append((txt, ""))
  36. callback(0.8, "Finish parsing.")
  37. elif re.search(r"\.pdf$", filename, re.IGNORECASE):
  38. pdf_parser = Pdf()
  39. sections = pdf_parser(filename if not binary else binary,
  40. from_page=from_page, to_page=to_page, callback=callback)
  41. elif re.search(r"\.txt$", filename, re.IGNORECASE):
  42. callback(0.1, "Start to parse.")
  43. txt = ""
  44. if binary:txt = binary.decode("utf-8")
  45. else:
  46. with open(filename, "r") as f:
  47. while True:
  48. l = f.readline()
  49. if not l:break
  50. txt += l
  51. sections = txt.split("\n")
  52. sections = [(l,"") for l in sections if l]
  53. callback(0.8, "Finish parsing.")
  54. else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
  55. cks = naive_merge(sections, kwargs.get("chunk_token_num", 128), kwargs.get("delimer", "\n。;!?"))
  56. eng = is_english(cks)
  57. res = []
  58. # wrap up to es documents
  59. for ck in cks:
  60. print("--", ck)
  61. d = copy.deepcopy(doc)
  62. if pdf_parser:
  63. d["image"] = pdf_parser.crop(ck)
  64. ck = pdf_parser.remove_tag(ck)
  65. tokenize(d, ck, eng)
  66. res.append(d)
  67. return res
  68. if __name__ == "__main__":
  69. import sys
  70. def dummy(a, b):
  71. pass
  72. chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)