Ver código fonte

Fix: correctly match http/https URLs in image upload file (#24180)

tags/1.8.0
Yongtao Huang 2 meses atrás
pai
commit
6b5c2bea4d
Nenhuma conta vinculada ao e-mail do autor do commit

+ 1
- 1
api/core/indexing_runner.py Ver arquivo

FixedRecursiveCharacterTextSplitter, FixedRecursiveCharacterTextSplitter,
) )
from core.rag.splitter.text_splitter import TextSplitter from core.rag.splitter.text_splitter import TextSplitter
from core.tools.utils.rag_web_reader import get_image_upload_file_ids
from core.tools.utils.web_reader_tool import get_image_upload_file_ids
from extensions.ext_database import db from extensions.ext_database import db
from extensions.ext_redis import redis_client from extensions.ext_redis import redis_client
from extensions.ext_storage import storage from extensions.ext_storage import storage

+ 0
- 17
api/core/tools/utils/rag_web_reader.py Ver arquivo

import re


def get_image_upload_file_ids(content: str) -> list[str]:
    """Extract upload-file IDs from markdown image links in *content*.

    A link is recognised when it has the form ``![image](<url>)`` where
    ``<url>`` starts with ``http://`` or ``https://`` and ends with
    ``file-preview`` or ``image-preview``. The ID is the path segment that
    sits between ``files/`` and the preview suffix.

    Args:
        content: Text that may contain markdown image links.

    Returns:
        The extracted IDs in order of appearance; empty if none are found.
    """
    # ``https?`` accepts both schemes. The previous ``http?`` made the second
    # "p" optional instead, so it rejected ``https://`` and accepted ``htt://``.
    pattern = r"!\[image\]\((https?://.*?(file-preview|image-preview))\)"
    image_upload_file_ids = []
    # With two groups, findall yields (url, preview_kind) tuples.
    for url, preview_kind in re.findall(pattern, content):
        # Look for the ID next to the same preview suffix the link ended with.
        content_match = re.search(rf"files/([^/]+)/{preview_kind}", url)
        if content_match:
            image_upload_file_ids.append(content_match.group(1))
    return image_upload_file_ids

+ 6
- 6
api/core/tools/utils/web_reader_tool.py Ver arquivo

else: else:
content = response.text content = response.text


article = extract_using_readabilipy(content)
article = extract_using_readability(content)


if not article.text: if not article.text:
return "" return ""


res = FULL_TEMPLATE.format( res = FULL_TEMPLATE.format(
title=article.title, title=article.title,
author=article.auther,
author=article.author,
text=article.text, text=article.text,
) )


@dataclass @dataclass
class Article: class Article:
title: str title: str
auther: str
author: str
text: Sequence[dict] text: Sequence[dict]




def extract_using_readabilipy(html: str):
def extract_using_readability(html: str):
json_article: dict[str, Any] = simple_json_from_html_string(html, use_readability=True) json_article: dict[str, Any] = simple_json_from_html_string(html, use_readability=True)
article = Article( article = Article(
title=json_article.get("title") or "", title=json_article.get("title") or "",
auther=json_article.get("byline") or "",
author=json_article.get("byline") or "",
text=json_article.get("plain_text") or [], text=json_article.get("plain_text") or [],
) )






def get_image_upload_file_ids(content): def get_image_upload_file_ids(content):
pattern = r"!\[image\]\((http?://.*?(file-preview|image-preview))\)"
pattern = r"!\[image\]\((https?://.*?(file-preview|image-preview))\)"
matches = re.findall(pattern, content) matches = re.findall(pattern, content)
image_upload_file_ids = [] image_upload_file_ids = []
for match in matches: for match in matches:

+ 1
- 1
api/tasks/clean_dataset_task.py Ver arquivo

from celery import shared_task # type: ignore from celery import shared_task # type: ignore


from core.rag.index_processor.index_processor_factory import IndexProcessorFactory from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.tools.utils.rag_web_reader import get_image_upload_file_ids
from core.tools.utils.web_reader_tool import get_image_upload_file_ids
from extensions.ext_database import db from extensions.ext_database import db
from extensions.ext_storage import storage from extensions.ext_storage import storage
from models.dataset import ( from models.dataset import (

+ 1
- 1
api/tasks/clean_document_task.py Ver arquivo

from celery import shared_task # type: ignore from celery import shared_task # type: ignore


from core.rag.index_processor.index_processor_factory import IndexProcessorFactory from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.tools.utils.rag_web_reader import get_image_upload_file_ids
from core.tools.utils.web_reader_tool import get_image_upload_file_ids
from extensions.ext_database import db from extensions.ext_database import db
from extensions.ext_storage import storage from extensions.ext_storage import storage
from models.dataset import Dataset, DatasetMetadataBinding, DocumentSegment from models.dataset import Dataset, DatasetMetadataBinding, DocumentSegment

+ 25
- 0
api/tests/unit_tests/core/tools/utils/test_web_reader_tool.py Ver arquivo

from core.tools.utils.web_reader_tool import get_image_upload_file_ids


def test_get_image_upload_file_ids():
    """Check ID extraction across schemes, preview kinds, and multiple links."""
    multi_link_content = """
some text
![image](https://h/files/id1/file-preview)
middle
![image](http://h/files/id2/image-preview)
end
"""
    cases = [
        # https + file-preview: id is extracted
        ("![image](https://example.com/a/b/files/abc123/file-preview)", ["abc123"]),
        # http + image-preview: id is extracted
        ("![image](http://host/files/xyz789/image-preview)", ["xyz789"]),
        # invalid scheme 'htt://' must not match
        ("![image](htt://host/files/bad/file-preview)", []),
        # several links: ids come back in document order
        (multi_link_content, ["id1", "id2"]),
    ]
    for content, expected in cases:
        assert get_image_upload_file_ids(content) == expected

Carregando…
Cancelar
Salvar