Quellcode durchsuchen

Refa: improve GraphRAG similarity sensitivity to numeric differences (#8479)

### What problem does this PR solve?

Improve GraphRAG similarity sensitivity to numeric differences. #8444.

### Type of change

- [x] Refactoring
tags/v0.20.0
Yongteng Lei vor 4 Monaten
Ursprung
Commit
b705ff08fe
Es ist kein Account mit der E-Mail-Adresse des Committers verbunden
2 geänderte Dateien mit 28 neuen und 7 gelöschten Zeilen
  1. 13
    0
      graphrag/entity_resolution.py
  2. 15
    7
      rag/nlp/__init__.py

+ 13
- 0
graphrag/entity_resolution.py Datei anzeigen

@@ -218,7 +218,20 @@ class EntityResolution(Extractor):

return ans_list

def _has_digit_in_2gram_diff(self, a, b):
def to_2gram_set(s):
return {s[i:i+2] for i in range(len(s) - 1)}

set_a = to_2gram_set(a)
set_b = to_2gram_set(b)
diff = set_a ^ set_b

return any(any(c.isdigit() for c in pair) for pair in diff)

def is_similarity(self, a, b):
if self._has_digit_in_2gram_diff(a, b):
return False

if is_english(a) and is_english(b):
if editdistance.eval(a, b) <= min(len(a), len(b)) // 2:
return True

+ 15
- 7
rag/nlp/__init__.py Datei anzeigen

@@ -225,15 +225,23 @@ def bullets_category(sections):


def is_english(texts):
    """Heuristically decide whether ``texts`` is predominantly English.

    A ``str`` is judged character by character (whitespace is skipped); a
    ``list`` is judged entry by entry, ignoring entries that are not
    strings or are blank. An item counts as English when it consists only
    of ASCII letters, digits, whitespace and the common punctuation listed
    in the pattern below.

    Args:
        texts: A string or a list of strings. Any other type yields False.

    Returns:
        bool: True if more than 80% of the evaluated items count as
        English; False otherwise, including for empty input.
    """
    if not texts:
        return False

    # The trailing "+" lets fullmatch accept whole multi-character entries;
    # a bare character class can only ever match a one-character string.
    pattern = re.compile(r"[`a-zA-Z0-9\s.,':;/\"?<>!\(\)\-]+")

    if isinstance(texts, str):
        # Skip whitespace so that spaces between words do not count
        # against the English ratio.
        texts = [ch for ch in texts if not ch.isspace()]
    elif isinstance(texts, list):
        texts = [t for t in texts if isinstance(t, str) and t.strip()]
    else:
        return False

    # Filtering may have removed every item (e.g. whitespace-only input).
    if not texts:
        return False

    eng = sum(1 for t in texts if pattern.fullmatch(t.strip()))
    return (eng / len(texts)) > 0.8


def is_chinese(text):

Laden…
Abbrechen
Speichern