浏览代码

fix: keep image url (#17430)

tags/1.2.0
Panpan 7 个月前
父节点
当前提交
fc3f14c0ee
没有帐户链接到提交者的电子邮件
共有 1 个文件被更改，包括 20 次插入和 3 次删除
  1. 20
    3
      api/core/rag/cleaner/clean_processor.py

+ 20
- 3
api/core/rag/cleaner/clean_processor.py 查看文件

@@ -27,9 +27,26 @@ class CleanProcessor:
pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
text = re.sub(pattern, "", text)

# Remove URL
pattern = r"https?://[^\s]+"
text = re.sub(pattern, "", text)
# Remove URL but keep Markdown image URLs
# First, temporarily replace Markdown image URLs with a placeholder
markdown_image_pattern = r"!\[.*?\]\((https?://[^\s)]+)\)"
placeholders: list[str] = []

def replace_with_placeholder(match, placeholders=placeholders):
url = match.group(1)
placeholder = f"__MARKDOWN_IMAGE_URL_{len(placeholders)}__"
placeholders.append(url)
return f"![image]({placeholder})"

text = re.sub(markdown_image_pattern, replace_with_placeholder, text)

# Now remove all remaining URLs
url_pattern = r"https?://[^\s)]+"
text = re.sub(url_pattern, "", text)

# Finally, restore the Markdown image URLs
for i, url in enumerate(placeholders):
text = text.replace(f"__MARKDOWN_IMAGE_URL_{i}__", url)
return text

def filter_string(self, text):

正在加载...
取消
保存