|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091 |
- import re
-
- from nltk import word_tokenize
-
- from rag.nlp import stemmer, huqie
-
- BULLET_PATTERN = [[
- r"第[零一二三四五六七八九十百]+(编|部分)",
- r"第[零一二三四五六七八九十百]+章",
- r"第[零一二三四五六七八九十百]+节",
- r"第[零一二三四五六七八九十百]+条",
- r"[\((][零一二三四五六七八九十百]+[\))]",
- ], [
- r"[0-9]{,3}[\. 、]",
- r"[0-9]{,2}\.[0-9]{,2}",
- r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
- r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
- ], [
- r"第[零一二三四五六七八九十百]+章",
- r"第[零一二三四五六七八九十百]+节",
- r"[零一二三四五六七八九十百]+[ 、]",
- r"[\((][零一二三四五六七八九十百]+[\))]",
- r"[\((][0-9]{,2}[\))]",
- ] ,[
- r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
- r"Chapter (I+V?|VI*|XI|IX|X)",
- r"Section [0-9]+",
- r"Article [0-9]+"
- ]
- ]
-
-
- def bullets_category(sections):
- global BULLET_PATTERN
- hits = [0] * len(BULLET_PATTERN)
- for i, pro in enumerate(BULLET_PATTERN):
- for sec in sections:
- for p in pro:
- if re.match(p, sec):
- hits[i] += 1
- break
- maxium = 0
- res = -1
- for i,h in enumerate(hits):
- if h <= maxium:continue
- res = i
- maxium = h
- return res
-
- def is_english(texts):
- eng = 0
- for t in texts:
- if re.match(r"[a-zA-Z]{2,}", t.strip()):
- eng += 1
- if eng / len(texts) > 0.8:
- return True
- return False
-
- def tokenize(d, t, eng):
- d["content_with_weight"] = t
- if eng:
- t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
- d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
- else:
- d["content_ltks"] = huqie.qie(t)
- d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
-
-
- def remove_contents_table(sections, eng=False):
- i = 0
- while i < len(sections):
- def get(i):
- nonlocal sections
- return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()
- if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
- i += 1
- continue
- sections.pop(i)
- if i >= len(sections): break
- prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
- while not prefix:
- sections.pop(i)
- if i >= len(sections): break
- prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
- sections.pop(i)
- if i >= len(sections) or not prefix: break
- for j in range(i, min(i+128, len(sections))):
- if not re.match(prefix, get(j)):
- continue
- for _ in range(i, j):sections.pop(i)
- break
|