Co-authored-by: JzoNg <jzongcode@gmail.com>
tags/1.3.0
| @@ -39,6 +39,12 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter): | |||
| else: | |||
| return [GPT2Tokenizer.get_num_tokens(text) for text in texts] | |||
| def _character_encoder(texts: list[str]) -> list[int]: | |||
| if not texts: | |||
| return [] | |||
| return [len(text) for text in texts] | |||
| if issubclass(cls, TokenTextSplitter): | |||
| extra_kwargs = { | |||
| "model_name": embedding_model_instance.model if embedding_model_instance else "gpt2", | |||
| @@ -47,7 +53,7 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter): | |||
| } | |||
| kwargs = {**kwargs, **extra_kwargs} | |||
| return cls(length_function=_token_encoder, **kwargs) | |||
| return cls(length_function=_character_encoder, **kwargs) | |||
| class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter): | |||
| @@ -103,7 +109,7 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter) | |||
| _good_splits_lengths = [] # cache the lengths of the splits | |||
| _separator = "" if self._keep_separator else separator | |||
| s_lens = self._length_function(splits) | |||
| if _separator != "": | |||
| if separator != "": | |||
| for s, s_len in zip(splits, s_lens): | |||
| if s_len < self._chunk_size: | |||
| _good_splits.append(s) | |||
| @@ -553,7 +553,7 @@ class DocumentService: | |||
| {"id": "remove_extra_spaces", "enabled": True}, | |||
| {"id": "remove_urls_emails", "enabled": False}, | |||
| ], | |||
| "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50}, | |||
| "segmentation": {"delimiter": "\n", "max_tokens": 1024, "chunk_overlap": 50}, | |||
| }, | |||
| "limits": { | |||
| "indexing_max_segmentation_tokens_length": dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH, | |||
| @@ -97,7 +97,7 @@ export enum IndexingType { | |||
| } | |||
| const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n' | |||
| const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500 | |||
| const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024 | |||
| const DEFAULT_OVERLAP = 50 | |||
| const MAXIMUM_CHUNK_TOKEN_LENGTH = Number.parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10) | |||
| @@ -117,11 +117,11 @@ const defaultParentChildConfig: ParentChildConfig = { | |||
| chunkForContext: 'paragraph', | |||
| parent: { | |||
| delimiter: '\\n\\n', | |||
| maxLength: 500, | |||
| maxLength: 1024, | |||
| }, | |||
| child: { | |||
| delimiter: '\\n', | |||
| maxLength: 200, | |||
| maxLength: 512, | |||
| }, | |||
| } | |||
| @@ -623,12 +623,12 @@ const StepTwo = ({ | |||
| onChange={e => setSegmentIdentifier(e.target.value, true)} | |||
| /> | |||
| <MaxLengthInput | |||
| unit='tokens' | |||
| unit='characters' | |||
| value={maxChunkLength} | |||
| onChange={setMaxChunkLength} | |||
| /> | |||
| <OverlapInput | |||
| unit='tokens' | |||
| unit='characters' | |||
| value={overlap} | |||
| min={1} | |||
| onChange={setOverlap} | |||
| @@ -756,7 +756,7 @@ const StepTwo = ({ | |||
| })} | |||
| /> | |||
| <MaxLengthInput | |||
| unit='tokens' | |||
| unit='characters' | |||
| value={parentChildConfig.parent.maxLength} | |||
| onChange={value => setParentChildConfig({ | |||
| ...parentChildConfig, | |||
| @@ -803,7 +803,7 @@ const StepTwo = ({ | |||
| })} | |||
| /> | |||
| <MaxLengthInput | |||
| unit='tokens' | |||
| unit='characters' | |||
| value={parentChildConfig.child.maxLength} | |||
| onChange={value => setParentChildConfig({ | |||
| ...parentChildConfig, | |||