### What problem does this PR solve?

Improve GraphRAG similarity sensitivity to numeric differences. #8444.

### Type of change

- [x] Refactoring

tags/v0.20.0
| return ans_list | return ans_list | ||||
| def _has_digit_in_2gram_diff(self, a, b): | |||||
| def to_2gram_set(s): | |||||
| return {s[i:i+2] for i in range(len(s) - 1)} | |||||
| set_a = to_2gram_set(a) | |||||
| set_b = to_2gram_set(b) | |||||
| diff = set_a ^ set_b | |||||
| return any(any(c.isdigit() for c in pair) for pair in diff) | |||||
| def is_similarity(self, a, b): | def is_similarity(self, a, b): | ||||
| if self._has_digit_in_2gram_diff(a, b): | |||||
| return False | |||||
| if is_english(a) and is_english(b): | if is_english(a) and is_english(b): | ||||
| if editdistance.eval(a, b) <= min(len(a), len(b)) // 2: | if editdistance.eval(a, b) <= min(len(a), len(b)) // 2: | ||||
| return True | return True |
| def is_english(texts): | def is_english(texts): | ||||
| eng = 0 | |||||
| if not texts: | if not texts: | ||||
| return False | return False | ||||
| for t in texts: | |||||
| if re.match(r"[ `a-zA-Z.,':;/\"?<>!\(\)-]", t.strip()): | |||||
| eng += 1 | |||||
| if eng / len(texts) > 0.8: | |||||
| return True | |||||
| return False | |||||
| pattern = re.compile(r"[`a-zA-Z0-9\s.,':;/\"?<>!\(\)\-]") | |||||
| if isinstance(texts, str): | |||||
| texts = list(texts) | |||||
| elif isinstance(texts, list): | |||||
| texts = [t for t in texts if isinstance(t, str) and t.strip()] | |||||
| else: | |||||
| return False | |||||
| if not texts: | |||||
| return False | |||||
| eng = sum(1 for t in texts if pattern.fullmatch(t.strip())) | |||||
| return (eng / len(texts)) > 0.8 | |||||
| def is_chinese(text): | def is_chinese(text): |