|
|
|
@@ -33,14 +33,30 @@ class RAGFlowTxtParser: |
|
|
|
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
    """Split ``txt`` at delimiter characters and pack the pieces into chunks.

    The text is scanned left to right and cut right after each delimiter
    character found, so the delimiter stays at the end of its piece. Each
    piece is appended to the current chunk; a new chunk is opened only when
    the current chunk's accumulated token count (as reported by
    ``num_tokens_from_string``) already exceeds ``chunk_token_num``. A chunk
    may therefore exceed the budget by up to one piece.

    Args:
        txt: The text to split. Must be a ``str``.
        chunk_token_num: Token budget after which a new chunk is started.
        delimiter: String whose individual characters are treated as
            split points.

    Returns:
        A list of ``[chunk_text, ""]`` pairs, in document order. For empty
        input this is ``[["", ""]]``.

    Raises:
        TypeError: If ``txt`` is not a ``str``.
    """
    if not isinstance(txt, str):
        raise TypeError("txt type should be str!")

    # Coerce once so that a numeric-string budget compares cleanly against
    # the integer token counts below.
    chunk_token_num = int(chunk_token_num)

    cks = [""]      # chunk texts; the last entry is the one being filled
    tk_nums = [0]   # running token count for each entry of ``cks``

    def add_chunk(t):
        """Append piece ``t`` to the open chunk, or start a new chunk."""
        tnum = num_tokens_from_string(t)
        if tk_nums[-1] > chunk_token_num:
            # The open chunk is already over budget: start a fresh one.
            cks.append(t)
            tk_nums.append(tnum)
        else:
            cks[-1] += t
            tk_nums[-1] += tnum

    # ``s`` is the start of the pending piece, ``e`` the scan position.
    # NOTE(review): the scan starts at index 1 and resumes at ``s + 1`` after
    # a cut, so the character immediately following a delimiter (and the very
    # first character) is never tested as a delimiter itself. This is kept
    # as-is to preserve existing chunk boundaries - confirm it is intended.
    s, e = 0, 1
    length = len(txt)
    while e < length:
        if txt[e] in delimiter:
            add_chunk(txt[s: e + 1])
            s = e + 1
            e = s + 1
        else:
            e += 1

    # Flush whatever follows the last delimiter. Guard on the text length so
    # that a text ending on a delimiter does not emit an empty trailing chunk.
    if s < length:
        add_chunk(txt[s:])

    return [[c, ""] for c in cks]