You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. import re
  2. from nltk import word_tokenize
  3. from rag.nlp import stemmer, huqie
# Families of heading/bullet numbering styles. Each inner list is one family;
# bullets_category() tries every regex of a family against the start of each
# section (re.match) and reports the index of the family with the most hits.
BULLET_PATTERN = [[
    # Family 0: "第<Chinese numeral>" followed by 编/部分 (part), 章 (chapter),
    # 节 (section) or 条 (article), plus parenthesised Chinese numerals.
    r"第[零一二三四五六七八九十百]+(编|部分)",
    r"第[零一二三四五六七八九十百]+章",
    r"第[零一二三四五六七八九十百]+节",
    r"第[零一二三四五六七八九十百]+条",
    r"[\((][零一二三四五六七八九十百]+[\))]",
], [
    # Family 1: Arabic-digit outlines with one to four dotted levels.
    # NOTE: "{,3}" / "{,2}" mean "zero up to N digits", so a level may be empty.
    r"[0-9]{,3}[\. 、]",
    r"[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
    # Family 2: 章/节 headings mixed with bare Chinese numerals followed by a
    # space or "、", and parenthesised Chinese or Arabic numerals.
    r"第[零一二三四五六七八九十百]+章",
    r"第[零一二三四五六七八九十百]+节",
    r"[零一二三四五六七八九十百]+[ 、]",
    r"[\((][零一二三四五六七八九十百]+[\))]",
    r"[\((][0-9]{,2}[\))]",
] ,[
    # Family 3: English headings ("PART ONE", "Chapter IV", "Section 3",
    # "Article 12"). These are matched case-sensitively.
    r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
    r"Chapter (I+V?|VI*|XI|IX|X)",
    r"Section [0-9]+",
    r"Article [0-9]+"
]
]
  28. def bullets_category(sections):
  29. global BULLET_PATTERN
  30. hits = [0] * len(BULLET_PATTERN)
  31. for i, pro in enumerate(BULLET_PATTERN):
  32. for sec in sections:
  33. for p in pro:
  34. if re.match(p, sec):
  35. hits[i] += 1
  36. break
  37. maxium = 0
  38. res = -1
  39. for i,h in enumerate(hits):
  40. if h <= maxium:continue
  41. res = i
  42. maxium = h
  43. return res
  44. def is_english(texts):
  45. eng = 0
  46. for t in texts:
  47. if re.match(r"[a-zA-Z]{2,}", t.strip()):
  48. eng += 1
  49. if eng / len(texts) > 0.8:
  50. return True
  51. return False
def tokenize(d, t, eng):
    """Store text ``t`` and its token strings in the dict ``d`` (in place).

    Args:
        d: Mapping that receives the keys "content_with_weight",
            "content_ltks" and "content_sm_ltks".
        t: The text to tokenize; stored unmodified under
            "content_with_weight".
        eng: Truthy to use the English path (NLTK word_tokenize + stemmer),
            falsy to use huqie.

    Returns:
        None; ``d`` is mutated.
    """
    d["content_with_weight"] = t
    if eng:
        # Drop a hyphen that sits between two lowercase letters, so the two
        # sides are tokenized as one word.
        t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
        # Space-joined stems of the NLTK tokens.
        d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
    else:
        # huqie.qie / huqie.qieqie are project helpers; presumably a coarse
        # and a finer-grained segmenter -- TODO confirm against rag.nlp.
        d["content_ltks"] = huqie.qie(t)
        # NOTE(review): indentation was lost in this view of the file. This
        # line is placed inside the ``else`` branch; confirm that
        # "content_sm_ltks" is not meant to be set for English text as well.
        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
  60. def remove_contents_table(sections, eng=False):
  61. i = 0
  62. while i < len(sections):
  63. def get(i):
  64. nonlocal sections
  65. return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()
  66. if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
  67. i += 1
  68. continue
  69. sections.pop(i)
  70. if i >= len(sections): break
  71. prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
  72. while not prefix:
  73. sections.pop(i)
  74. if i >= len(sections): break
  75. prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
  76. sections.pop(i)
  77. if i >= len(sections) or not prefix: break
  78. for j in range(i, min(i+128, len(sections))):
  79. if not re.match(prefix, get(j)):
  80. continue
  81. for _ in range(i, j):sections.pop(i)
  82. break