Browse Source

Feat: support position retaining for text files. (#6231)

### What problem does this PR solve?

#5832

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
tags/v0.18.0
Kevin Hu 7 months ago
parent
commit
a087d13ccb
No account linked to committer's email address
1 changed file with 3 additions and 1 deletion
  1. 3
    1
      rag/nlp/__init__.py

+ 3
- 1
rag/nlp/__init__.py View File

def tokenize_chunks(chunks, doc, eng, pdf_parser=None): def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
res = [] res = []
# wrap up as es documents # wrap up as es documents
for ck in chunks:
for ii, ck in enumerate(chunks):
if len(ck.strip()) == 0: if len(ck.strip()) == 0:
continue continue
logging.debug("-- {}".format(ck)) logging.debug("-- {}".format(ck))
ck = pdf_parser.remove_tag(ck) ck = pdf_parser.remove_tag(ck)
except NotImplementedError: except NotImplementedError:
pass pass
else:
add_positions(d, [[ii]*5])
tokenize(d, ck, eng) tokenize(d, ck, eng)
res.append(d) res.append(d)
return res return res

Loading…
Cancel
Save