Browse Source

Fix: use the same logic to handle pos in tokenize_chunks_with_images (#8732)

### What problem does this PR solve?

https://github.com/infiniflow/ragflow/issues/8719

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
tags/v0.20.0
Stephen Hu 3 months ago
parent
commit
00c954755e
No account linked to committer's email address
1 changed file with 6 additions and 3 deletions
  1. 6
    3
      rag/nlp/__init__.py

+ 6
- 3
rag/nlp/__init__.py View File

@@ -559,9 +559,6 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?"):
if not texts or len(texts) != len(images):
return [], []
# Ensure texts are str, not tuple; if they are tuples, convert to str (take the first item)
if isinstance(texts[0], tuple):
texts = [t[0] for t in texts]
cks = [""]
result_images = [None]
tk_nums = [0]
@@ -596,6 +593,12 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
for sub_sec in splited_sec:
if re.match(f"^{dels}$", sub_sec):
continue
# if text is tuple, unpack it
if isinstance(text, tuple):
text_str = text[0]
text_pos = text[1] if len(text) > 1 else ""
add_chunk(text_str, image, text_pos)
else:
add_chunk(text, image)

return cks, result_images

Loading…
Cancel
Save