import copy
import re

from rag.app import laws
from rag.parser import is_english, tokenize, naive_merge
from rag.nlp import huqie
from rag.parser.pdf_parser import HuParser
from rag.settings import cron_logger
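

# PDF pipeline: render pages to images, run OCR, do Paddle-based layout
# analysis, then merge text lines vertically; returns (text, position-tag)
# pairs per box.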
class Pdf(HuParser):
    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        # Render pages to images and run OCR on them.
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page)
        callback(0.1, "OCR finished")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_paddle(zoomin)
        callback(0.77, "Layout analysis finished")
        cron_logger.info("paddle layouts: {}".format(
            (timer() - start) / (self.total_page + 0.1)))

        self._naive_vertical_merge()
        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
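

# Entry point: split the given file into chunks ready to be indexed.
# Only docx, pdf and txt are supported; `callback(progress, message)`
# reports parsing progress.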
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
    doc = {
        "docnm_kwd": filename,
        # Tokenize the title, i.e. the filename without its extension.
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    pdf_parser = None
    sections = []
    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        for txt in laws.Docx()(filename, binary):
            sections.append((txt, ""))
        callback(0.8, "Finish parsing.")
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        sections = pdf_parser(filename if not binary else binary,
                              from_page=from_page, to_page=to_page, callback=callback)
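    # Plain text: read the whole file (or decode the uploaded bytes as UTF-8)
    # and keep each non-empty line as a section.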
    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
        if binary:
            txt = binary.decode("utf-8")
        else:
            with open(filename, "r") as f:
                while True:
                    line = f.readline()
                    if not line:
                        break
                    txt += line
        sections = [(line, "") for line in txt.split("\n") if line]
        callback(0.8, "Finish parsing.")
    else:
        raise NotImplementedError(
            "file type not supported yet (docx, pdf, txt supported)")
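
    # Merge sections into chunks of roughly `chunk_token_num` tokens, splitting
    # on the configured delimiters (newline and CJK sentence punctuation).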
    parser_config = kwargs.get(
        "parser_config", {"chunk_token_num": 128, "delimer": "\n。;!?"})
    cks = naive_merge(sections, parser_config["chunk_token_num"],
                      parser_config["delimer"])
    eng = is_english(cks)
    res = []
    # wrap up to es documents
    for ck in cks:
        print("--", ck)
        d = copy.deepcopy(doc)
        if pdf_parser:
            # For PDFs, crop the page region this chunk came from and strip
            # the position tags from the chunk text.
            d["image"] = pdf_parser.crop(ck)
            ck = pdf_parser.remove_tag(ck)
        tokenize(d, ck, eng)
        res.append(d)
    return res
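

# Ad-hoc test: chunk the file given on the command line, first ten pages only,
# with a no-op progress callback.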
if __name__ == "__main__":
    import sys

    def dummy(a, b):
        pass

    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)