|
|
|
@@ -536,8 +536,13 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"): |
|
|
|
cks[-1] += t |
|
|
|
tk_nums[-1] += tnum |
|
|
|
|
|
|
|
dels = get_delimiters(delimiter) |
|
|
|
for sec, pos in sections: |
|
|
|
add_chunk(sec, pos) |
|
|
|
splited_sec = re.split(r"(%s)" % dels, sec) |
|
|
|
for sub_sec in splited_sec: |
|
|
|
if re.match(f"^{dels}$", sub_sec): |
|
|
|
continue |
|
|
|
add_chunk(sub_sec, pos) |
|
|
|
|
|
|
|
return cks |
|
|
|
|
|
|
|
@@ -576,8 +581,13 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。 |
|
|
|
result_images[-1] = concat_img(result_images[-1], image) |
|
|
|
tk_nums[-1] += tnum |
|
|
|
|
|
|
|
dels = get_delimiters(delimiter) |
|
|
|
for text, image in zip(texts, images): |
|
|
|
add_chunk(text, image) |
|
|
|
splited_sec = re.split(r"(%s)" % dels, text) |
|
|
|
for sub_sec in splited_sec: |
|
|
|
if re.match(f"^{dels}$", sub_sec): |
|
|
|
continue |
|
|
|
add_chunk(text, image) |
|
|
|
|
|
|
|
return cks, result_images |
|
|
|
|
|
|
|
@@ -640,8 +650,13 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"): |
|
|
|
images[-1] = concat_img(images[-1], image) |
|
|
|
tk_nums[-1] += tnum |
|
|
|
|
|
|
|
dels = get_delimiters(delimiter) |
|
|
|
for sec, image in sections: |
|
|
|
add_chunk(sec, image, '') |
|
|
|
splited_sec = re.split(r"(%s)" % dels, sec) |
|
|
|
for sub_sec in splited_sec: |
|
|
|
if re.match(f"^{dels}$", sub_sec): |
|
|
|
continue |
|
|
|
add_chunk(sub_sec, image,"") |
|
|
|
|
|
|
|
return cks, images |
|
|
|
|
|
|
|
@@ -649,3 +664,20 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"): |
|
|
|
def extract_between(text: str, start_tag: str, end_tag: str) -> list[str]:
    """Collect every substring of ``text`` enclosed by the two tags.

    Both tags are treated as literal text (regex metacharacters in them are
    escaped). Matching is non-greedy, so each ``start_tag`` pairs with the
    nearest following ``end_tag``, and the enclosed content may span several
    lines.

    Args:
        text: The string to search.
        start_tag: Literal opening marker.
        end_tag: Literal closing marker.

    Returns:
        The enclosed substrings, in order of appearance, without the tags.
        An empty list when no start/end pair is found.
    """
    opening = re.escape(start_tag)
    closing = re.escape(end_tag)
    # DOTALL lets "." cross newlines so multi-line content is captured.
    matcher = re.compile("".join([opening, r"(.*?)", closing]), flags=re.DOTALL)
    return [hit.group(1) for hit in matcher.finditer(text)]
|
|
|
|
|
|
|
|
|
|
|
def get_delimiters(delimiters: str) -> str:
    """Build a regex alternation pattern from a delimiter specification.

    The specification mixes two kinds of delimiters:

    * text wrapped in backticks (e.g. ``"`##`"``) is one multi-character
      delimiter;
    * every other character (including an unpaired backtick) is a
      single-character delimiter of its own.

    Each delimiter is regex-escaped and the results are joined with ``|``.
    Alternatives are ordered longest first, because regex alternation takes
    the first alternative that matches: without this, a short delimiter would
    shadow a longer one that starts with it (``ab|abc`` never matches
    ``abc`` in full). Delimiters of equal length keep their original relative
    order, and duplicates are dropped.

    Args:
        delimiters: The delimiter specification string.

    Returns:
        A regex pattern string matching any one delimiter. It is the empty
        string when ``delimiters`` is empty; callers that feed the result to
        ``re.split`` should be aware an empty pattern matches everywhere.
    """
    tokens = []
    cursor = 0
    for quoted in re.finditer(r"`([^`]+)`", delimiters):
        tokens.append(quoted.group(1))
        # Characters between the previous quoted run and this one are
        # individual single-character delimiters.
        tokens.extend(delimiters[cursor:quoted.start()])
        cursor = quoted.end()
    # Trailing characters after the last quoted run.
    tokens.extend(delimiters[cursor:])

    # dict.fromkeys de-duplicates while preserving first-seen order; sorted()
    # is stable, so equal-length delimiters stay in that order.
    ordered = sorted(dict.fromkeys(tokens), key=len, reverse=True)
    return "|".join(re.escape(token) for token in ordered)