|
|
|
|
|
|
|
|
def tokenize_chunks_with_images(chunks, doc, eng, images):
    """Wrap text chunks and their paired images into ES-ready documents.

    Each non-empty chunk is copied from the template ``doc``, given its
    image, a position record, and tokenized text fields.

    Args:
        chunks: sequence of text chunks (strings).
        doc: template dict; deep-copied per chunk so entries are independent.
        eng: language flag forwarded to ``tokenize`` (presumably
            English-vs-CJK tokenization — confirm against callee).
        images: sequence of images aligned 1:1 with ``chunks``; extra
            elements on either side are dropped by ``zip``.

    Returns:
        list of per-chunk document dicts.
    """
    res = []
    # wrap up as es documents
    for ii, (ck, image) in enumerate(zip(chunks, images)):
        # Skip whitespace-only chunks; note ii still advances, so position
        # indices reflect the original chunk order, not the output order.
        if not ck.strip():
            continue
        # Lazy %-style args: the message is only formatted if DEBUG is enabled.
        logging.debug("-- %s", ck)
        # Deep copy so tokenize/add_positions mutations don't leak between
        # chunks or back into the shared template.
        d = copy.deepcopy(doc)
        d["image"] = image
        # add_positions expects 5-tuples (page, x0, x1, top, bottom);
        # here only the chunk index is known, so it is repeated.
        add_positions(d, [[ii] * 5])
        tokenize(d, ck, eng)
        res.append(d)
    return res