Przeglądaj źródła

Allow configuring the maximum segmentation token length for RAG documents via an environment variable (#4375)

tags/0.6.9
Rain Chen 1 rok temu
rodzic
commit
c255a20d7c
No account linked to committer's email address

+ 3
- 0
api/.env.example Wyświetl plik



# Log file path # Log file path
LOG_FILE= LOG_FILE=

# Indexing configuration
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=1000

+ 6
- 0
api/config.py Wyświetl plik

'KEYWORD_DATA_SOURCE_TYPE': 'database', 'KEYWORD_DATA_SOURCE_TYPE': 'database',
'INNER_API': 'False', 'INNER_API': 'False',
'ENTERPRISE_ENABLED': 'False', 'ENTERPRISE_ENABLED': 'False',
'INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH': 1000,
} }






self.KEYWORD_DATA_SOURCE_TYPE = get_env('KEYWORD_DATA_SOURCE_TYPE') self.KEYWORD_DATA_SOURCE_TYPE = get_env('KEYWORD_DATA_SOURCE_TYPE')
self.ENTERPRISE_ENABLED = get_bool_env('ENTERPRISE_ENABLED') self.ENTERPRISE_ENABLED = get_bool_env('ENTERPRISE_ENABLED')

# ------------------------
# Indexing Configurations.
# ------------------------
self.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH = get_env('INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH')

+ 3
- 2
api/core/indexing_runner.py Wyświetl plik

# The user-defined segmentation rule # The user-defined segmentation rule
rules = json.loads(processing_rule.rules) rules = json.loads(processing_rule.rules)
segmentation = rules["segmentation"] segmentation = rules["segmentation"]
if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > 1000:
raise ValueError("Custom segment length should be between 50 and 1000.")
max_segmentation_tokens_length = int(current_app.config['INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH'])
if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > max_segmentation_tokens_length:
raise ValueError(f"Custom segment length should be between 50 and {max_segmentation_tokens_length}.")


separator = segmentation["separator"] separator = segmentation["separator"]
if separator: if separator:

+ 5
- 2
api/core/rag/index_processor/index_processor_base.py Wyświetl plik

from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Optional from typing import Optional


from flask import current_app

from core.model_manager import ModelInstance from core.model_manager import ModelInstance
from core.rag.extractor.entity.extract_setting import ExtractSetting from core.rag.extractor.entity.extract_setting import ExtractSetting
from core.rag.models.document import Document from core.rag.models.document import Document
# The user-defined segmentation rule # The user-defined segmentation rule
rules = processing_rule['rules'] rules = processing_rule['rules']
segmentation = rules["segmentation"] segmentation = rules["segmentation"]
if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > 1000:
raise ValueError("Custom segment length should be between 50 and 1000.")
max_segmentation_tokens_length = int(current_app.config['INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH'])
if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > max_segmentation_tokens_length:
raise ValueError(f"Custom segment length should be between 50 and {max_segmentation_tokens_length}.")


separator = segmentation["separator"] separator = segmentation["separator"]
if separator: if separator:

+ 4
- 0
docker/docker-compose.yaml Wyświetl plik

# SSRF Proxy server # SSRF Proxy server
SSRF_PROXY_HTTP_URL: 'http://ssrf_proxy:3128' SSRF_PROXY_HTTP_URL: 'http://ssrf_proxy:3128'
SSRF_PROXY_HTTPS_URL: 'http://ssrf_proxy:3128' SSRF_PROXY_HTTPS_URL: 'http://ssrf_proxy:3128'
# Indexing configuration
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: 1000
depends_on: depends_on:
- db - db
- redis - redis
NOTION_CLIENT_SECRET: you-client-secret NOTION_CLIENT_SECRET: you-client-secret
NOTION_CLIENT_ID: you-client-id NOTION_CLIENT_ID: you-client-id
NOTION_INTERNAL_SECRET: you-internal-secret NOTION_INTERNAL_SECRET: you-internal-secret
# Indexing configuration
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: 1000
depends_on: depends_on:
- db - db
- redis - redis

Ładowanie…
Anuluj
Zapisz