您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

huchunk.py 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438
  1. import re
  2. import os
  3. import copy
  4. import base64
  5. import magic
  6. from dataclasses import dataclass
  7. from typing import List
  8. import numpy as np
  9. from io import BytesIO
  10. class HuChunker:
  11. def __init__(self):
  12. self.MAX_LVL = 12
  13. self.proj_patt = [
  14. (r"第[零一二三四五六七八九十百]+章", 1),
  15. (r"第[零一二三四五六七八九十百]+[条节]", 2),
  16. (r"[零一二三四五六七八九十百]+[、  ]", 3),
  17. (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
  18. (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
  19. (r"[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 6),
  20. (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
  21. (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
  22. (r".{,48}[::??]@", 9),
  23. (r"[0-9]+)", 10),
  24. (r"[\((][0-9]+[)\)]", 11),
  25. (r"[零一二三四五六七八九十百]+是", 12),
  26. (r"[⚫•➢✓ ]", 12)
  27. ]
  28. self.lines = []
  29. def _garbage(self, txt):
  30. patt = [
  31. r"(在此保证|不得以任何形式翻版|请勿传阅|仅供内部使用|未经事先书面授权)",
  32. r"(版权(归本公司)*所有|免责声明|保留一切权力|承担全部责任|特别声明|报告中涉及)",
  33. r"(不承担任何责任|投资者的通知事项:|任何机构和个人|本报告仅为|不构成投资)",
  34. r"(不构成对任何个人或机构投资建议|联系其所在国家|本报告由从事证券交易)",
  35. r"(本研究报告由|「认可投资者」|所有研究报告均以|请发邮件至)",
  36. r"(本报告仅供|市场有风险,投资需谨慎|本报告中提及的)",
  37. r"(本报告反映|此信息仅供|证券分析师承诺|具备证券投资咨询业务资格)",
  38. r"^(时间|签字|签章)[::]",
  39. r"(参考文献|目录索引|图表索引)",
  40. r"[ ]*年[ ]+月[ ]+日",
  41. r"^(中国证券业协会|[0-9]+年[0-9]+月[0-9]+日)$",
  42. r"\.{10,}",
  43. r"(———————END|帮我转发|欢迎收藏|快来关注我吧)"
  44. ]
  45. return any([re.search(p, txt) for p in patt])
  46. def _proj_match(self, line):
  47. for p, j in self.proj_patt:
  48. if re.match(p, line):
  49. return j
  50. return
  51. def _does_proj_match(self):
  52. mat = [None for _ in range(len(self.lines))]
  53. for i in range(len(self.lines)):
  54. mat[i] = self._proj_match(self.lines[i])
  55. return mat
  56. def naive_text_chunk(self, text, ti="", MAX_LEN=612):
  57. if text:
  58. self.lines = [l.strip().replace(u'\u3000', u' ')
  59. .replace(u'\xa0', u'')
  60. for l in text.split("\n\n")]
  61. self.lines = [l for l in self.lines if not self._garbage(l)]
  62. self.lines = [re.sub(r"([ ]+| )", " ", l)
  63. for l in self.lines if l]
  64. if not self.lines:
  65. return []
  66. arr = self.lines
  67. res = [""]
  68. i = 0
  69. while i < len(arr):
  70. a = arr[i]
  71. if not a:
  72. i += 1
  73. continue
  74. if len(a) > MAX_LEN:
  75. a_ = a.split("\n")
  76. if len(a_) >= 2:
  77. arr.pop(i)
  78. for j in range(2, len(a_) + 1):
  79. if len("\n".join(a_[:j])) >= MAX_LEN:
  80. arr.insert(i, "\n".join(a_[:j - 1]))
  81. arr.insert(i + 1, "\n".join(a_[j - 1:]))
  82. break
  83. else:
  84. assert False, f"Can't split: {a}"
  85. continue
  86. if len(res[-1]) < MAX_LEN / 3:
  87. res[-1] += "\n" + a
  88. else:
  89. res.append(a)
  90. i += 1
  91. if ti:
  92. for i in range(len(res)):
  93. if res[i].find("——来自") >= 0:
  94. continue
  95. res[i] += f"\t——来自“{ti}”"
  96. return res
  97. def _merge(self):
  98. # merge continuous same level text
  99. lines = [self.lines[0]] if self.lines else []
  100. for i in range(1, len(self.lines)):
  101. if self.mat[i] == self.mat[i - 1] \
  102. and len(lines[-1]) < 256 \
  103. and len(self.lines[i]) < 256:
  104. lines[-1] += "\n" + self.lines[i]
  105. continue
  106. lines.append(self.lines[i])
  107. self.lines = lines
  108. self.mat = self._does_proj_match()
  109. return self.mat
  110. def text_chunks(self, text):
  111. if text:
  112. self.lines = [l.strip().replace(u'\u3000', u' ')
  113. .replace(u'\xa0', u'')
  114. for l in re.split(r"[\r\n]", text)]
  115. self.lines = [l for l in self.lines if not self._garbage(l)]
  116. self.lines = [l for l in self.lines if l]
  117. self.mat = self._does_proj_match()
  118. mat = self._merge()
  119. tree = []
  120. for i in range(len(self.lines)):
  121. tree.append({"proj": mat[i],
  122. "children": [],
  123. "read": False})
  124. # find all children
  125. for i in range(len(self.lines) - 1):
  126. if tree[i]["proj"] is None:
  127. continue
  128. ed = i + 1
  129. while ed < len(tree) and (tree[ed]["proj"] is None or
  130. tree[ed]["proj"] > tree[i]["proj"]):
  131. ed += 1
  132. nxt = tree[i]["proj"] + 1
  133. st = set([p["proj"] for p in tree[i + 1: ed] if p["proj"]])
  134. while nxt not in st:
  135. nxt += 1
  136. if nxt > self.MAX_LVL:
  137. break
  138. if nxt <= self.MAX_LVL:
  139. for j in range(i + 1, ed):
  140. if tree[j]["proj"] is not None:
  141. break
  142. tree[i]["children"].append(j)
  143. for j in range(i + 1, ed):
  144. if tree[j]["proj"] != nxt:
  145. continue
  146. tree[i]["children"].append(j)
  147. else:
  148. for j in range(i + 1, ed):
  149. tree[i]["children"].append(j)
  150. # get DFS combinations, find all the paths to leaf
  151. paths = []
  152. def dfs(i, path):
  153. nonlocal tree, paths
  154. path.append(i)
  155. tree[i]["read"] = True
  156. if len(self.lines[i]) > 256:
  157. paths.append(path)
  158. return
  159. if not tree[i]["children"]:
  160. if len(path) > 1 or len(self.lines[i]) >= 32:
  161. paths.append(path)
  162. return
  163. for j in tree[i]["children"]:
  164. dfs(j, copy.deepcopy(path))
  165. for i, t in enumerate(tree):
  166. if t["read"]:
  167. continue
  168. dfs(i, [])
  169. # concat txt on the path for all paths
  170. res = []
  171. lines = np.array(self.lines)
  172. for p in paths:
  173. if len(p) < 2:
  174. tree[p[0]]["read"] = False
  175. continue
  176. txt = "\n".join(lines[p[:-1]]) + "\n" + lines[p[-1]]
  177. res.append(txt)
  178. # concat continuous orphans
  179. assert len(tree) == len(lines)
  180. ii = 0
  181. while ii < len(tree):
  182. if tree[ii]["read"]:
  183. ii += 1
  184. continue
  185. txt = lines[ii]
  186. e = ii + 1
  187. while e < len(tree) and not tree[e]["read"] and len(txt) < 256:
  188. txt += "\n" + lines[e]
  189. e += 1
  190. res.append(txt)
  191. ii = e
  192. # if the node has not been read, find its daddy
  193. def find_daddy(st):
  194. nonlocal lines, tree
  195. proj = tree[st]["proj"]
  196. if len(self.lines[st]) > 512:
  197. return [st]
  198. if proj is None:
  199. proj = self.MAX_LVL + 1
  200. for i in range(st - 1, -1, -1):
  201. if tree[i]["proj"] and tree[i]["proj"] < proj:
  202. a = [st] + find_daddy(i)
  203. return a
  204. return []
  205. return res
  206. class PdfChunker(HuChunker):
  207. @dataclass
  208. class Fields:
  209. text_chunks: List = None
  210. table_chunks: List = None
  211. def __init__(self, pdf_parser):
  212. self.pdf = pdf_parser
  213. super().__init__()
  214. def tableHtmls(self, pdfnm):
  215. _, tbls = self.pdf(pdfnm, return_html=True)
  216. res = []
  217. for img, arr in tbls:
  218. if arr[0].find("<table>") < 0:
  219. continue
  220. buffered = BytesIO()
  221. if img:
  222. img.save(buffered, format="JPEG")
  223. img_str = base64.b64encode(
  224. buffered.getvalue()).decode('utf-8') if img else ""
  225. res.append({"table": arr[0], "image": img_str})
  226. return res
  227. def html(self, pdfnm):
  228. txts, tbls = self.pdf(pdfnm, return_html=True)
  229. res = []
  230. txt_cks = self.text_chunks(txts)
  231. for txt, img in [(self.pdf.remove_tag(c), self.pdf.crop(c))
  232. for c in txt_cks]:
  233. buffered = BytesIO()
  234. if img:
  235. img.save(buffered, format="JPEG")
  236. img_str = base64.b64encode(
  237. buffered.getvalue()).decode('utf-8') if img else ""
  238. res.append({"table": "<p>%s</p>" % txt.replace("\n", "<br/>"),
  239. "image": img_str})
  240. for img, arr in tbls:
  241. if not arr:
  242. continue
  243. buffered = BytesIO()
  244. if img:
  245. img.save(buffered, format="JPEG")
  246. img_str = base64.b64encode(
  247. buffered.getvalue()).decode('utf-8') if img else ""
  248. res.append({"table": arr[0], "image": img_str})
  249. return res
  250. def __call__(self, pdfnm, return_image=True, naive_chunk=False):
  251. flds = self.Fields()
  252. text, tbls = self.pdf(pdfnm)
  253. fnm = pdfnm
  254. txt_cks = self.text_chunks(text) if not naive_chunk else \
  255. self.naive_text_chunk(text, ti=fnm if isinstance(fnm, str) else "")
  256. flds.text_chunks = [(self.pdf.remove_tag(c),
  257. self.pdf.crop(c) if return_image else None) for c in txt_cks]
  258. flds.table_chunks = [(arr, img if return_image else None)
  259. for img, arr in tbls]
  260. return flds
  261. class DocxChunker(HuChunker):
  262. @dataclass
  263. class Fields:
  264. text_chunks: List = None
  265. table_chunks: List = None
  266. def __init__(self, doc_parser):
  267. self.doc = doc_parser
  268. super().__init__()
  269. def _does_proj_match(self):
  270. mat = []
  271. for s in self.styles:
  272. s = s.split(" ")[-1]
  273. try:
  274. mat.append(int(s))
  275. except Exception as e:
  276. mat.append(None)
  277. return mat
  278. def _merge(self):
  279. i = 1
  280. while i < len(self.lines):
  281. if self.mat[i] == self.mat[i - 1] \
  282. and len(self.lines[i - 1]) < 256 \
  283. and len(self.lines[i]) < 256:
  284. self.lines[i - 1] += "\n" + self.lines[i]
  285. self.styles.pop(i)
  286. self.lines.pop(i)
  287. self.mat.pop(i)
  288. continue
  289. i += 1
  290. self.mat = self._does_proj_match()
  291. return self.mat
  292. def __call__(self, fnm):
  293. flds = self.Fields()
  294. flds.title = os.path.splitext(
  295. os.path.basename(fnm))[0] if isinstance(
  296. fnm, type("")) else ""
  297. secs, tbls = self.doc(fnm)
  298. self.lines = [l for l, s in secs]
  299. self.styles = [s for l, s in secs]
  300. txt_cks = self.text_chunks("")
  301. flds.text_chunks = [(t, None) for t in txt_cks if not self._garbage(t)]
  302. flds.table_chunks = [(tb, None) for tb in tbls for t in tb if t]
  303. return flds
  304. class ExcelChunker(HuChunker):
  305. @dataclass
  306. class Fields:
  307. text_chunks: List = None
  308. table_chunks: List = None
  309. def __init__(self, excel_parser):
  310. self.excel = excel_parser
  311. super().__init__()
  312. def __call__(self, fnm):
  313. flds = self.Fields()
  314. flds.text_chunks = [(t, None) for t in self.excel(fnm)]
  315. flds.table_chunks = []
  316. return flds
  317. class PptChunker(HuChunker):
  318. @dataclass
  319. class Fields:
  320. text_chunks: List = None
  321. table_chunks: List = None
  322. def __init__(self):
  323. super().__init__()
  324. def __call__(self, fnm):
  325. from pptx import Presentation
  326. ppt = Presentation(fnm) if isinstance(
  327. fnm, str) else Presentation(
  328. BytesIO(fnm))
  329. flds = self.Fields()
  330. flds.text_chunks = []
  331. for slide in ppt.slides:
  332. for shape in slide.shapes:
  333. if hasattr(shape, "text"):
  334. flds.text_chunks.append((shape.text, None))
  335. flds.table_chunks = []
  336. return flds
  337. class TextChunker(HuChunker):
  338. @dataclass
  339. class Fields:
  340. text_chunks: List = None
  341. table_chunks: List = None
  342. def __init__(self):
  343. super().__init__()
  344. @staticmethod
  345. def is_binary_file(file_path):
  346. mime = magic.Magic(mime=True)
  347. if isinstance(file_path, str):
  348. file_type = mime.from_file(file_path)
  349. else:
  350. file_type = mime.from_buffer(file_path)
  351. if 'text' in file_type:
  352. return False
  353. else:
  354. return True
  355. def __call__(self, fnm):
  356. flds = self.Fields()
  357. if self.is_binary_file(fnm):
  358. return flds
  359. txt = ""
  360. if isinstance(fnm, str):
  361. with open(fnm, "r") as f:
  362. txt = f.read()
  363. else: txt = fnm.decode("utf-8")
  364. flds.text_chunks = [(c, None) for c in self.naive_text_chunk(txt)]
  365. flds.table_chunks = []
  366. return flds
  367. if __name__ == "__main__":
  368. import sys
  369. sys.path.append(os.path.dirname(__file__) + "/../")
  370. if sys.argv[1].split(".")[-1].lower() == "pdf":
  371. from parser import PdfParser
  372. ckr = PdfChunker(PdfParser())
  373. if sys.argv[1].split(".")[-1].lower().find("doc") >= 0:
  374. from parser import DocxParser
  375. ckr = DocxChunker(DocxParser())
  376. if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0:
  377. from parser import ExcelParser
  378. ckr = ExcelChunker(ExcelParser())
  379. # ckr.html(sys.argv[1])
  380. print(ckr(sys.argv[1]))