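FIX: If chunk["content_with_weight"] contains one or more unpaired
surrogate characters (such as an incomplete emoji or other special
characters), calling .encode("utf-8") directly raises a
UnicodeEncodeError. This change passes the "surrogatepass" error handler
to .encode() so the chunk ID hash can still be computed for such content.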
### What problem does this PR solve?
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
tags/v0.20.1
| @@ -284,7 +284,7 @@ async def build_chunks(task, progress_callback): | |||
| try: | |||
| d = copy.deepcopy(document) | |||
| d.update(chunk) | |||
| - d["id"] = xxhash.xxh64((chunk["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest() | |||
| + d["id"] = xxhash.xxh64((chunk["content_with_weight"] + str(d["doc_id"])).encode("utf-8", "surrogatepass")).hexdigest() | |||
| d["create_time"] = str(datetime.now()).replace("T", " ")[:19] | |||
| d["create_timestamp_flt"] = datetime.now().timestamp() | |||
| if not d.get("image"): | |||