import copy
import logging

# Note: `tokenize` and `add_positions` are assumed to be helpers defined
# elsewhere in this module.


def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
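    """Wrap pre-split text chunks into per-chunk "es documents".

    Each non-empty chunk gets a copy of the shared `doc` metadata, page
    positions (real ones from `pdf_parser` when available, otherwise the
    chunk index), and full-text tokens for indexing.
    """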
    res = []
    # wrap up as es documents
    for ii, ck in enumerate(chunks):
        if len(ck.strip()) == 0:
            continue
        logging.debug("-- {}".format(ck))
        d = copy.deepcopy(doc)
        if pdf_parser:
            try:
                # Crop the chunk's region out of the source PDF and keep
                # its page positions alongside the document.
                d["image"], poss = pdf_parser.crop(ck, need_position=True)
                add_positions(d, poss)
                ck = pdf_parser.remove_tag(ck)
            except NotImplementedError:
                # Some parsers do not implement cropping/tag removal;
                # index the chunk as-is.
                pass
        else:
            # No layout information without a PDF parser: record the
            # chunk index as a placeholder position.
            add_positions(d, [[ii]*5])
        tokenize(d, ck, eng)
        res.append(d)
    return res
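

# Minimal usage sketch (hypothetical values: the `doc` fields below are
# illustrative, not required names; in practice `doc` carries whatever
# shared metadata every chunk of a file should inherit):
#
#   doc = {"docnm_kwd": "sample.txt"}
#   docs = tokenize_chunks(["first chunk", "second chunk"], doc, eng=True)
#   # -> one dict per non-empty chunk, with positions and tokens added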