| @@ -176,3 +176,6 @@ HTTP_REQUEST_NODE_MAX_TEXT_SIZE=1048576 # 1MB | |||
| # Log file path | |||
| LOG_FILE= | |||
| # Indexing configuration: upper bound for a custom segment's max_tokens (the lower bound is fixed at 50) | |||
| INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=1000 | |||
| @@ -79,6 +79,7 @@ DEFAULTS = { | |||
| 'KEYWORD_DATA_SOURCE_TYPE': 'database', | |||
| 'INNER_API': 'False', | |||
| 'ENTERPRISE_ENABLED': 'False', | |||
| 'INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH': 1000, | |||
| } | |||
| @@ -379,3 +380,8 @@ class Config: | |||
| self.KEYWORD_DATA_SOURCE_TYPE = get_env('KEYWORD_DATA_SOURCE_TYPE') | |||
| self.ENTERPRISE_ENABLED = get_bool_env('ENTERPRISE_ENABLED') | |||
| # ------------------------ | |||
| # Indexing Configurations. | |||
| # ------------------------ | |||
| self.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH = get_env('INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH') | |||
| @@ -411,8 +411,9 @@ class IndexingRunner: | |||
| # The user-defined segmentation rule | |||
| rules = json.loads(processing_rule.rules) | |||
| segmentation = rules["segmentation"] | |||
| if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > 1000: | |||
| raise ValueError("Custom segment length should be between 50 and 1000.") | |||
| max_segmentation_tokens_length = int(current_app.config['INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH']) | |||
| if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > max_segmentation_tokens_length: | |||
| raise ValueError(f"Custom segment length should be between 50 and {max_segmentation_tokens_length}.") | |||
| separator = segmentation["separator"] | |||
| if separator: | |||
| @@ -2,6 +2,8 @@ | |||
| from abc import ABC, abstractmethod | |||
| from typing import Optional | |||
| from flask import current_app | |||
| from core.model_manager import ModelInstance | |||
| from core.rag.extractor.entity.extract_setting import ExtractSetting | |||
| from core.rag.models.document import Document | |||
| @@ -43,8 +45,9 @@ class BaseIndexProcessor(ABC): | |||
| # The user-defined segmentation rule | |||
| rules = processing_rule['rules'] | |||
| segmentation = rules["segmentation"] | |||
| if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > 1000: | |||
| raise ValueError("Custom segment length should be between 50 and 1000.") | |||
| max_segmentation_tokens_length = int(current_app.config['INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH']) | |||
| if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > max_segmentation_tokens_length: | |||
| raise ValueError(f"Custom segment length should be between 50 and {max_segmentation_tokens_length}.") | |||
| separator = segmentation["separator"] | |||
| if separator: | |||
| @@ -167,6 +167,8 @@ services: | |||
| # SSRF Proxy server | |||
| SSRF_PROXY_HTTP_URL: 'http://ssrf_proxy:3128' | |||
| SSRF_PROXY_HTTPS_URL: 'http://ssrf_proxy:3128' | |||
| # Indexing configuration: upper bound for a custom segment's max_tokens (the lower bound is fixed at 50) | |||
| INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: 1000 | |||
| depends_on: | |||
| - db | |||
| - redis | |||
| @@ -287,6 +289,8 @@ services: | |||
| NOTION_CLIENT_SECRET: you-client-secret | |||
| NOTION_CLIENT_ID: you-client-id | |||
| NOTION_INTERNAL_SECRET: you-internal-secret | |||
| # Indexing configuration: upper bound for a custom segment's max_tokens (the lower bound is fixed at 50) | |||
| INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: 1000 | |||
| depends_on: | |||
| - db | |||
| - redis | |||