| # Log file path | # Log file path | ||||
| LOG_FILE= | LOG_FILE= | ||||
| # Indexing configuration | |||||
| INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=1000 |
| 'KEYWORD_DATA_SOURCE_TYPE': 'database', | 'KEYWORD_DATA_SOURCE_TYPE': 'database', | ||||
| 'INNER_API': 'False', | 'INNER_API': 'False', | ||||
| 'ENTERPRISE_ENABLED': 'False', | 'ENTERPRISE_ENABLED': 'False', | ||||
| 'INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH': 1000, | |||||
| } | } | ||||
| self.KEYWORD_DATA_SOURCE_TYPE = get_env('KEYWORD_DATA_SOURCE_TYPE') | self.KEYWORD_DATA_SOURCE_TYPE = get_env('KEYWORD_DATA_SOURCE_TYPE') | ||||
| self.ENTERPRISE_ENABLED = get_bool_env('ENTERPRISE_ENABLED') | self.ENTERPRISE_ENABLED = get_bool_env('ENTERPRISE_ENABLED') | ||||
| # ------------------------ | |||||
| # Indexing Configurations. | |||||
| # ------------------------ | |||||
| self.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH = get_env('INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH') |
| # The user-defined segmentation rule | # The user-defined segmentation rule | ||||
| rules = json.loads(processing_rule.rules) | rules = json.loads(processing_rule.rules) | ||||
| segmentation = rules["segmentation"] | segmentation = rules["segmentation"] | ||||
| if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > 1000: | |||||
| raise ValueError("Custom segment length should be between 50 and 1000.") | |||||
| max_segmentation_tokens_length = int(current_app.config['INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH']) | |||||
| if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > max_segmentation_tokens_length: | |||||
| raise ValueError(f"Custom segment length should be between 50 and {max_segmentation_tokens_length}.") | |||||
| separator = segmentation["separator"] | separator = segmentation["separator"] | ||||
| if separator: | if separator: |
| from abc import ABC, abstractmethod | from abc import ABC, abstractmethod | ||||
| from typing import Optional | from typing import Optional | ||||
| from flask import current_app | |||||
| from core.model_manager import ModelInstance | from core.model_manager import ModelInstance | ||||
| from core.rag.extractor.entity.extract_setting import ExtractSetting | from core.rag.extractor.entity.extract_setting import ExtractSetting | ||||
| from core.rag.models.document import Document | from core.rag.models.document import Document | ||||
| # The user-defined segmentation rule | # The user-defined segmentation rule | ||||
| rules = processing_rule['rules'] | rules = processing_rule['rules'] | ||||
| segmentation = rules["segmentation"] | segmentation = rules["segmentation"] | ||||
| if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > 1000: | |||||
| raise ValueError("Custom segment length should be between 50 and 1000.") | |||||
| max_segmentation_tokens_length = int(current_app.config['INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH']) | |||||
| if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > max_segmentation_tokens_length: | |||||
| raise ValueError(f"Custom segment length should be between 50 and {max_segmentation_tokens_length}.") | |||||
| separator = segmentation["separator"] | separator = segmentation["separator"] | ||||
| if separator: | if separator: |
| # SSRF Proxy server | # SSRF Proxy server | ||||
| SSRF_PROXY_HTTP_URL: 'http://ssrf_proxy:3128' | SSRF_PROXY_HTTP_URL: 'http://ssrf_proxy:3128' | ||||
| SSRF_PROXY_HTTPS_URL: 'http://ssrf_proxy:3128' | SSRF_PROXY_HTTPS_URL: 'http://ssrf_proxy:3128' | ||||
| # Indexing configuration | |||||
| INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: 1000 | |||||
| depends_on: | depends_on: | ||||
| - db | - db | ||||
| - redis | - redis | ||||
| NOTION_CLIENT_SECRET: you-client-secret | NOTION_CLIENT_SECRET: you-client-secret | ||||
| NOTION_CLIENT_ID: you-client-id | NOTION_CLIENT_ID: you-client-id | ||||
| NOTION_INTERNAL_SECRET: you-internal-secret | NOTION_INTERNAL_SECRET: you-internal-secret | ||||
| # Indexing configuration | |||||
| INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: 1000 | |||||
| depends_on: | depends_on: | ||||
| - db | - db | ||||
| - redis | - redis |