You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406
  1. # -*- coding: utf-8 -*-
  2. import copy
  3. import datrie
  4. import math
  5. import os
  6. import re
  7. import string
  8. import sys
  9. from hanziconv import HanziConv
  10. from api.utils.file_utils import get_project_base_directory
  11. class Huqie:
  12. def key_(self, line):
  13. return str(line.lower().encode("utf-8"))[2:-1]
  14. def rkey_(self, line):
  15. return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
  16. def loadDict_(self, fnm):
  17. print("[HUQIE]:Build trie", fnm, file=sys.stderr)
  18. try:
  19. of = open(fnm, "r")
  20. while True:
  21. line = of.readline()
  22. if not line:
  23. break
  24. line = re.sub(r"[\r\n]+", "", line)
  25. line = re.split(r"[ \t]", line)
  26. k = self.key_(line[0])
  27. F = int(math.log(float(line[1]) / self.DENOMINATOR) + .5)
  28. if k not in self.trie_ or self.trie_[k][0] < F:
  29. self.trie_[self.key_(line[0])] = (F, line[2])
  30. self.trie_[self.rkey_(line[0])] = 1
  31. self.trie_.save(fnm + ".trie")
  32. of.close()
  33. except Exception as e:
  34. print("[HUQIE]:Faild to build trie, ", fnm, e, file=sys.stderr)
  35. def __init__(self, debug=False):
  36. self.DEBUG = debug
  37. self.DENOMINATOR = 1000000
  38. self.trie_ = datrie.Trie(string.printable)
  39. self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")
  40. self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
  41. try:
  42. self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
  43. return
  44. except Exception as e:
  45. print("[HUQIE]:Build default trie", file=sys.stderr)
  46. self.trie_ = datrie.Trie(string.printable)
  47. self.loadDict_(self.DIR_ + ".txt")
  48. def loadUserDict(self, fnm):
  49. try:
  50. self.trie_ = datrie.Trie.load(fnm + ".trie")
  51. return
  52. except Exception as e:
  53. self.trie_ = datrie.Trie(string.printable)
  54. self.loadDict_(fnm)
  55. def addUserDict(self, fnm):
  56. self.loadDict_(fnm)
  57. def _strQ2B(self, ustring):
  58. """把字符串全角转半角"""
  59. rstring = ""
  60. for uchar in ustring:
  61. inside_code = ord(uchar)
  62. if inside_code == 0x3000:
  63. inside_code = 0x0020
  64. else:
  65. inside_code -= 0xfee0
  66. if inside_code < 0x0020 or inside_code > 0x7e: # 转完之后不是半角字符返回原来的字符
  67. rstring += uchar
  68. else:
  69. rstring += chr(inside_code)
  70. return rstring
  71. def _tradi2simp(self, line):
  72. return HanziConv.toSimplified(line)
  73. def dfs_(self, chars, s, preTks, tkslist):
  74. MAX_L = 10
  75. res = s
  76. # if s > MAX_L or s>= len(chars):
  77. if s >= len(chars):
  78. tkslist.append(preTks)
  79. return res
  80. # pruning
  81. S = s + 1
  82. if s + 2 <= len(chars):
  83. t1, t2 = "".join(chars[s:s + 1]), "".join(chars[s:s + 2])
  84. if self.trie_.has_keys_with_prefix(self.key_(t1)) and not self.trie_.has_keys_with_prefix(
  85. self.key_(t2)):
  86. S = s + 2
  87. if len(preTks) > 2 and len(
  88. preTks[-1][0]) == 1 and len(preTks[-2][0]) == 1 and len(preTks[-3][0]) == 1:
  89. t1 = preTks[-1][0] + "".join(chars[s:s + 1])
  90. if self.trie_.has_keys_with_prefix(self.key_(t1)):
  91. S = s + 2
  92. ################
  93. for e in range(S, len(chars) + 1):
  94. t = "".join(chars[s:e])
  95. k = self.key_(t)
  96. if e > s + 1 and not self.trie_.has_keys_with_prefix(k):
  97. break
  98. if k in self.trie_:
  99. pretks = copy.deepcopy(preTks)
  100. if k in self.trie_:
  101. pretks.append((t, self.trie_[k]))
  102. else:
  103. pretks.append((t, (-12, '')))
  104. res = max(res, self.dfs_(chars, e, pretks, tkslist))
  105. if res > s:
  106. return res
  107. t = "".join(chars[s:s + 1])
  108. k = self.key_(t)
  109. if k in self.trie_:
  110. preTks.append((t, self.trie_[k]))
  111. else:
  112. preTks.append((t, (-12, '')))
  113. return self.dfs_(chars, s + 1, preTks, tkslist)
  114. def freq(self, tk):
  115. k = self.key_(tk)
  116. if k not in self.trie_:
  117. return 0
  118. return int(math.exp(self.trie_[k][0]) * self.DENOMINATOR + 0.5)
  119. def tag(self, tk):
  120. k = self.key_(tk)
  121. if k not in self.trie_:
  122. return ""
  123. return self.trie_[k][1]
  124. def score_(self, tfts):
  125. B = 30
  126. F, L, tks = 0, 0, []
  127. for tk, (freq, tag) in tfts:
  128. F += freq
  129. L += 0 if len(tk) < 2 else 1
  130. tks.append(tk)
  131. F /= len(tks)
  132. L /= len(tks)
  133. if self.DEBUG:
  134. print("[SC]", tks, len(tks), L, F, B / len(tks) + L + F)
  135. return tks, B / len(tks) + L + F
  136. def sortTks_(self, tkslist):
  137. res = []
  138. for tfts in tkslist:
  139. tks, s = self.score_(tfts)
  140. res.append((tks, s))
  141. return sorted(res, key=lambda x: x[1], reverse=True)
  142. def merge_(self, tks):
  143. patts = [
  144. (r"[ ]+", " "),
  145. (r"([0-9\+\.,%\*=-]) ([0-9\+\.,%\*=-])", r"\1\2"),
  146. ]
  147. # for p,s in patts: tks = re.sub(p, s, tks)
  148. # if split chars is part of token
  149. res = []
  150. tks = re.sub(r"[ ]+", " ", tks).split(" ")
  151. s = 0
  152. while True:
  153. if s >= len(tks):
  154. break
  155. E = s + 1
  156. for e in range(s + 2, min(len(tks) + 2, s + 6)):
  157. tk = "".join(tks[s:e])
  158. if re.search(self.SPLIT_CHAR, tk) and self.freq(tk):
  159. E = e
  160. res.append("".join(tks[s:E]))
  161. s = E
  162. return " ".join(res)
  163. def maxForward_(self, line):
  164. res = []
  165. s = 0
  166. while s < len(line):
  167. e = s + 1
  168. t = line[s:e]
  169. while e < len(line) and self.trie_.has_keys_with_prefix(
  170. self.key_(t)):
  171. e += 1
  172. t = line[s:e]
  173. while e - 1 > s and self.key_(t) not in self.trie_:
  174. e -= 1
  175. t = line[s:e]
  176. if self.key_(t) in self.trie_:
  177. res.append((t, self.trie_[self.key_(t)]))
  178. else:
  179. res.append((t, (0, '')))
  180. s = e
  181. return self.score_(res)
  182. def maxBackward_(self, line):
  183. res = []
  184. s = len(line) - 1
  185. while s >= 0:
  186. e = s + 1
  187. t = line[s:e]
  188. while s > 0 and self.trie_.has_keys_with_prefix(self.rkey_(t)):
  189. s -= 1
  190. t = line[s:e]
  191. while s + 1 < e and self.key_(t) not in self.trie_:
  192. s += 1
  193. t = line[s:e]
  194. if self.key_(t) in self.trie_:
  195. res.append((t, self.trie_[self.key_(t)]))
  196. else:
  197. res.append((t, (0, '')))
  198. s -= 1
  199. return self.score_(res[::-1])
  200. def qie(self, line):
  201. line = self._strQ2B(line).lower()
  202. line = self._tradi2simp(line)
  203. arr = re.split(self.SPLIT_CHAR, line)
  204. res = []
  205. for L in arr:
  206. if len(L) < 2 or re.match(
  207. r"[a-z\.-]+$", L) or re.match(r"[0-9\.-]+$", L):
  208. res.append(L)
  209. continue
  210. # print(L)
  211. # use maxforward for the first time
  212. tks, s = self.maxForward_(L)
  213. tks1, s1 = self.maxBackward_(L)
  214. if self.DEBUG:
  215. print("[FW]", tks, s)
  216. print("[BW]", tks1, s1)
  217. diff = [0 for _ in range(max(len(tks1), len(tks)))]
  218. for i in range(min(len(tks1), len(tks))):
  219. if tks[i] != tks1[i]:
  220. diff[i] = 1
  221. if s1 > s:
  222. tks = tks1
  223. i = 0
  224. while i < len(tks):
  225. s = i
  226. while s < len(tks) and diff[s] == 0:
  227. s += 1
  228. if s == len(tks):
  229. res.append(" ".join(tks[i:]))
  230. break
  231. if s > i:
  232. res.append(" ".join(tks[i:s]))
  233. e = s
  234. while e < len(tks) and e - s < 5 and diff[e] == 1:
  235. e += 1
  236. tkslist = []
  237. self.dfs_("".join(tks[s:e + 1]), 0, [], tkslist)
  238. res.append(" ".join(self.sortTks_(tkslist)[0][0]))
  239. i = e + 1
  240. res = " ".join(res)
  241. if self.DEBUG:
  242. print("[TKS]", self.merge_(res))
  243. return self.merge_(res)
  244. def qieqie(self, tks):
  245. res = []
  246. for tk in tks.split(" "):
  247. if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk):
  248. res.append(tk)
  249. continue
  250. tkslist = []
  251. if len(tk) > 10:
  252. tkslist.append(tk)
  253. else:
  254. self.dfs_(tk, 0, [], tkslist)
  255. if len(tkslist) < 2:
  256. res.append(tk)
  257. continue
  258. stk = self.sortTks_(tkslist)[1][0]
  259. if len(stk) == len(tk):
  260. stk = tk
  261. else:
  262. if re.match(r"[a-z\.-]+$", tk):
  263. for t in stk:
  264. if len(t) < 3:
  265. stk = tk
  266. break
  267. else:
  268. stk = " ".join(stk)
  269. else:
  270. stk = " ".join(stk)
  271. res.append(stk)
  272. return " ".join(res)
  273. def is_chinese(s):
  274. if s >= u'\u4e00' and s <= u'\u9fa5':
  275. return True
  276. else:
  277. return False
  278. def is_number(s):
  279. if s >= u'\u0030' and s <= u'\u0039':
  280. return True
  281. else:
  282. return False
  283. def is_alphabet(s):
  284. if (s >= u'\u0041' and s <= u'\u005a') or (
  285. s >= u'\u0061' and s <= u'\u007a'):
  286. return True
  287. else:
  288. return False
  289. def naiveQie(txt):
  290. tks = []
  291. for t in txt.split(" "):
  292. if tks and re.match(r".*[a-zA-Z]$", tks[-1]
  293. ) and re.match(r".*[a-zA-Z]$", t):
  294. tks.append(" ")
  295. tks.append(t)
  296. return tks
  297. hq = Huqie()
  298. qie = hq.qie
  299. qieqie = hq.qieqie
  300. tag = hq.tag
  301. freq = hq.freq
  302. loadUserDict = hq.loadUserDict
  303. addUserDict = hq.addUserDict
  304. tradi2simp = hq._tradi2simp
  305. strQ2B = hq._strQ2B
  306. if __name__ == '__main__':
  307. huqie = Huqie(debug=True)
  308. # huqie.addUserDict("/tmp/tmp.new.tks.dict")
  309. tks = huqie.qie(
  310. "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
  311. print(huqie.qieqie(tks))
  312. tks = huqie.qie(
  313. "公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
  314. print(huqie.qieqie(tks))
  315. tks = huqie.qie(
  316. "多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
  317. print(huqie.qieqie(tks))
  318. tks = huqie.qie(
  319. "实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
  320. print(huqie.qieqie(tks))
  321. tks = huqie.qie("虽然我不怎么玩")
  322. print(huqie.qieqie(tks))
  323. tks = huqie.qie("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
  324. print(huqie.qieqie(tks))
  325. tks = huqie.qie(
  326. "涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
  327. print(huqie.qieqie(tks))
  328. tks = huqie.qie("这周日你去吗?这周日你有空吗?")
  329. print(huqie.qieqie(tks))
  330. tks = huqie.qie("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
  331. print(huqie.qieqie(tks))
  332. tks = huqie.qie(
  333. "数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
  334. print(huqie.qieqie(tks))
  335. if len(sys.argv) < 2:
  336. sys.exit()
  337. huqie.DEBUG = False
  338. huqie.loadUserDict(sys.argv[1])
  339. of = open(sys.argv[2], "r")
  340. while True:
  341. line = of.readline()
  342. if not line:
  343. break
  344. print(huqie.qie(line))
  345. of.close()