Преглед изворни кода

Fix: add advanced delimiter detection for naive merge (#7941)

### What problem does this PR solve?

Add advanced delimiter detection for naive merge. #7824

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
tags/v0.19.1
Yongteng Lei пре 5 месеци
родитељ
комит
46963ab1ca
No account linked to committer's email address
1 измењених фајлова са 35 додато и 3 уклоњено
  1. 35
    3
      rag/nlp/__init__.py

+ 35
- 3
rag/nlp/__init__.py Прегледај датотеку

@@ -536,8 +536,13 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
cks[-1] += t
tk_nums[-1] += tnum

dels = get_delimiters(delimiter)
for sec, pos in sections:
add_chunk(sec, pos)
splited_sec = re.split(r"(%s)" % dels, sec)
for sub_sec in splited_sec:
if re.match(f"^{dels}$", sub_sec):
continue
add_chunk(sub_sec, pos)

return cks
@@ -576,8 +581,13 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
result_images[-1] = concat_img(result_images[-1], image)
tk_nums[-1] += tnum

dels = get_delimiters(delimiter)
for text, image in zip(texts, images):
add_chunk(text, image)
splited_sec = re.split(r"(%s)" % dels, text)
for sub_sec in splited_sec:
if re.match(f"^{dels}$", sub_sec):
continue
add_chunk(text, image)

return cks, result_images

@@ -640,8 +650,13 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
images[-1] = concat_img(images[-1], image)
tk_nums[-1] += tnum

dels = get_delimiters(delimiter)
for sec, image in sections:
add_chunk(sec, image, '')
splited_sec = re.split(r"(%s)" % dels, sec)
for sub_sec in splited_sec:
if re.match(f"^{dels}$", sub_sec):
continue
add_chunk(sub_sec, image,"")

return cks, images

@@ -649,3 +664,20 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
def extract_between(text: str, start_tag: str, end_tag: str) -> list[str]:
    """Return every substring of *text* enclosed by *start_tag* and *end_tag*.

    Both tags are matched literally (they are regex-escaped). The enclosed
    part is matched non-greedily, so each start tag pairs with the nearest
    following end tag, and it may span multiple lines.

    Args:
        text: The string to search.
        start_tag: Literal opening marker.
        end_tag: Literal closing marker.

    Returns:
        The enclosed substrings (tags excluded), in order of appearance.
    """
    opener = re.escape(start_tag)
    closer = re.escape(end_tag)
    # DOTALL lets "." cross newlines so multi-line payloads are captured.
    matcher = re.compile("".join((opener, r"(.*?)", closer)), re.DOTALL)
    return matcher.findall(text)


def get_delimiters(delimiters: str):
    """Build a regex alternation pattern from a delimiter specification.

    Text wrapped in a pair of backticks (for example "`##`") is taken as one
    multi-character delimiter. Every other character of the specification is
    a single-character delimiter on its own; an unpaired backtick is treated
    as such a single character.

    Args:
        delimiters: The delimiter specification, e.g. "\\n。;!?" or
            "\\n`##`;".

    Returns:
        A string of regex-escaped delimiters joined with "|", longest
        delimiter first. An empty specification yields an empty string.
    """
    dels = []
    s = 0
    for m in re.finditer(r"`([^`]+)`", delimiters):
        f, t = m.span()
        dels.append(m.group(1))
        # Characters between the previous backtick group and this one are
        # individual single-character delimiters.
        dels.extend(delimiters[s:f])
        s = t
    # Whatever follows the last backtick group (or the whole string when
    # there is none) is also a run of single-character delimiters.
    dels.extend(delimiters[s:])

    # Drop repeats while keeping first-seen order; they add nothing to the
    # alternation.
    dels = list(dict.fromkeys(d for d in dels if d))
    # Regex alternation picks the leftmost alternative that matches, so a
    # short delimiter listed before a longer one it prefixes (e.g. "a" before
    # "ab") would make the longer one unreachable. Put longest first; the
    # sort is stable, so equal-length delimiters keep their original order.
    dels.sort(key=len, reverse=True)

    return "|".join(re.escape(d) for d in dels)

Loading…
Откажи
Сачувај