Browse Source

Fix: order chunks from docx by positions. (#7979)

### What problem does this PR solve?

#7934

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
tags/v0.19.1
Kevin Hu 5 months ago
parent
commit
93f5df716f
No account linked to committer's email address
1 changed file with 2 additions and 1 deletion
  1. 2
    1
      rag/nlp/__init__.py

+ 2
- 1
rag/nlp/__init__.py View File

 def tokenize_chunks_with_images(chunks, doc, eng, images):
     res = []
     # wrap up as es documents
-    for ck, image in zip(chunks, images):
+    for ii, (ck, image) in enumerate(zip(chunks, images)):
         if len(ck.strip()) == 0:
             continue
         logging.debug("-- {}".format(ck))
         d = copy.deepcopy(doc)
         d["image"] = image
+        add_positions(d, [[ii]*5])
         tokenize(d, ck, eng)
         res.append(d)
     return res

Loading…
Cancel
Save