### What problem does this PR solve? Fix tokenizer resulting in low recall ![37743d3a49](https://github.com/user-attachments/assets/1394757e-8fcb-4f87-96af-a92716144884) ![4aba633a17](https://github.com/user-attachments/assets/a1828e32-3e17-4394-a633-ba3f09bd506d) ![image](https://github.com/user-attachments/assets/61308f32-2a4f-44d5-a034-d65bbec554ef) ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] Refactoring --------- Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>

il y a 5 mois · 0d7cfce6e1
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@@ -71,7 +71,19 @@ class FulltextQueryer:
            txt = otxt
        return txt

    @staticmethod
    def add_space_between_eng_zh(txt):
        # (ENG/ENG+NUM) + ZH
        txt = re.sub(r'([A-Za-z]+[0-9]+)([\u4e00-\u9fa5]+)', r'\1 \2', txt)
        # ENG + ZH
        txt = re.sub(r'([A-Za-z])([\u4e00-\u9fa5]+)', r'\1 \2', txt)
        # ZH + (ENG/ENG+NUM)
        txt = re.sub(r'([\u4e00-\u9fa5]+)([A-Za-z]+[0-9]+)', r'\1 \2', txt)
        txt = re.sub(r'([\u4e00-\u9fa5]+)([A-Za-z])', r'\1 \2', txt)
        return txt

    def question(self, txt, tbl="qa", min_match: float = 0.6):
        txt = FulltextQueryer.add_space_between_eng_zh(txt)
        txt = re.sub(
            r"[ :|\r\n\t,，。？?/`!！&^%%()\[\]{}<>]+",
            " ",