| @@ -30,7 +30,7 @@ from core.rag.splitter.fixed_text_splitter import ( | |||
| FixedRecursiveCharacterTextSplitter, | |||
| ) | |||
| from core.rag.splitter.text_splitter import TextSplitter | |||
| from core.tools.utils.rag_web_reader import get_image_upload_file_ids | |||
| from core.tools.utils.web_reader_tool import get_image_upload_file_ids | |||
| from extensions.ext_database import db | |||
| from extensions.ext_redis import redis_client | |||
| from extensions.ext_storage import storage | |||
| @@ -1,17 +0,0 @@ | |||
| import re | |||
| def get_image_upload_file_ids(content): | |||
| pattern = r"!\[image\]\((http?://.*?(file-preview|image-preview))\)" | |||
| matches = re.findall(pattern, content) | |||
| image_upload_file_ids = [] | |||
| for match in matches: | |||
| if match[1] == "file-preview": | |||
| content_pattern = r"files/([^/]+)/file-preview" | |||
| else: | |||
| content_pattern = r"files/([^/]+)/image-preview" | |||
| content_match = re.search(content_pattern, match[0]) | |||
| if content_match: | |||
| image_upload_file_id = content_match.group(1) | |||
| image_upload_file_ids.append(image_upload_file_id) | |||
| return image_upload_file_ids | |||
| @@ -80,14 +80,14 @@ def get_url(url: str, user_agent: Optional[str] = None) -> str: | |||
| else: | |||
| content = response.text | |||
| article = extract_using_readabilipy(content) | |||
| article = extract_using_readability(content) | |||
| if not article.text: | |||
| return "" | |||
| res = FULL_TEMPLATE.format( | |||
| title=article.title, | |||
| author=article.auther, | |||
| author=article.author, | |||
| text=article.text, | |||
| ) | |||
| @@ -97,15 +97,15 @@ def get_url(url: str, user_agent: Optional[str] = None) -> str: | |||
| @dataclass | |||
| class Article: | |||
| title: str | |||
| auther: str | |||
| author: str | |||
| text: Sequence[dict] | |||
| def extract_using_readabilipy(html: str): | |||
| def extract_using_readability(html: str): | |||
| json_article: dict[str, Any] = simple_json_from_html_string(html, use_readability=True) | |||
| article = Article( | |||
| title=json_article.get("title") or "", | |||
| auther=json_article.get("byline") or "", | |||
| author=json_article.get("byline") or "", | |||
| text=json_article.get("plain_text") or [], | |||
| ) | |||
| @@ -113,7 +113,7 @@ def extract_using_readabilipy(html: str): | |||
| def get_image_upload_file_ids(content): | |||
| pattern = r"!\[image\]\((http?://.*?(file-preview|image-preview))\)" | |||
| pattern = r"!\[image\]\((https?://.*?(file-preview|image-preview))\)" | |||
| matches = re.findall(pattern, content) | |||
| image_upload_file_ids = [] | |||
| for match in matches: | |||
| @@ -5,7 +5,7 @@ import click | |||
| from celery import shared_task # type: ignore | |||
| from core.rag.index_processor.index_processor_factory import IndexProcessorFactory | |||
| from core.tools.utils.rag_web_reader import get_image_upload_file_ids | |||
| from core.tools.utils.web_reader_tool import get_image_upload_file_ids | |||
| from extensions.ext_database import db | |||
| from extensions.ext_storage import storage | |||
| from models.dataset import ( | |||
| @@ -6,7 +6,7 @@ import click | |||
| from celery import shared_task # type: ignore | |||
| from core.rag.index_processor.index_processor_factory import IndexProcessorFactory | |||
| from core.tools.utils.rag_web_reader import get_image_upload_file_ids | |||
| from core.tools.utils.web_reader_tool import get_image_upload_file_ids | |||
| from extensions.ext_database import db | |||
| from extensions.ext_storage import storage | |||
| from models.dataset import Dataset, DatasetMetadataBinding, DocumentSegment | |||
| @@ -0,0 +1,25 @@ | |||
| from core.tools.utils.web_reader_tool import get_image_upload_file_ids | |||
| def test_get_image_upload_file_ids(): | |||
| # should extract id from https + file-preview | |||
| content = "" | |||
| assert get_image_upload_file_ids(content) == ["abc123"] | |||
| # should extract id from http + image-preview | |||
| content = "" | |||
| assert get_image_upload_file_ids(content) == ["xyz789"] | |||
| # should not match invalid scheme 'htt://' | |||
| content = "" | |||
| assert get_image_upload_file_ids(content) == [] | |||
| # should extract multiple ids in order | |||
| content = """ | |||
| some text | |||
|  | |||
| middle | |||
|  | |||
| end | |||
| """ | |||
| assert get_image_upload_file_ids(content) == ["id1", "id2"] | |||