Co-authored-by: JzoNg <jzongcode@gmail.com>

6 months ago · 95283b4dd3
--- a/api/core/rag/splitter/fixed_text_splitter.py
+++ b/api/core/rag/splitter/fixed_text_splitter.py
@@ -39,6 +39,12 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
            else:
                return [GPT2Tokenizer.get_num_tokens(text) for text in texts]

        def _character_encoder(texts: list[str]) -> list[int]:
            if not texts:
                return []

            return [len(text) for text in texts]

        if issubclass(cls, TokenTextSplitter):
            extra_kwargs = {
                "model_name": embedding_model_instance.model if embedding_model_instance else "gpt2",
@@ -47,7 +53,7 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
            }
            kwargs = {**kwargs, **extra_kwargs}

        return cls(length_function=_token_encoder, **kwargs)
        return cls(length_function=_character_encoder, **kwargs)


 class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter):
@@ -103,7 +109,7 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
        _good_splits_lengths = []  # cache the lengths of the splits
        _separator = "" if self._keep_separator else separator
        s_lens = self._length_function(splits)
        if _separator != "":
        if separator != "":
            for s, s_len in zip(splits, s_lens):
                if s_len < self._chunk_size:
                    _good_splits.append(s)
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@@ -553,7 +553,7 @@ class DocumentService:
                {"id": "remove_extra_spaces", "enabled": True},
                {"id": "remove_urls_emails", "enabled": False},
            ],
            "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
            "segmentation": {"delimiter": "\n", "max_tokens": 1024, "chunk_overlap": 50},
        },
        "limits": {
            "indexing_max_segmentation_tokens_length": dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH,
--- a/web/app/components/datasets/create/step-two/index.tsx
+++ b/web/app/components/datasets/create/step-two/index.tsx
@@ -97,7 +97,7 @@ export enum IndexingType {
 }

 const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
 const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500
 const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024
 const DEFAULT_OVERLAP = 50
 const MAXIMUM_CHUNK_TOKEN_LENGTH = Number.parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10)

@@ -117,11 +117,11 @@ const defaultParentChildConfig: ParentChildConfig = {
  chunkForContext: 'paragraph',
  parent: {
    delimiter: '\\n\\n',
    maxLength: 500,
    maxLength: 1024,
  },
  child: {
    delimiter: '\\n',
    maxLength: 200,
    maxLength: 512,
  },
 }

@@ -623,12 +623,12 @@ const StepTwo = ({
                  onChange={e => setSegmentIdentifier(e.target.value, true)}
                />
                <MaxLengthInput
                  unit='tokens'
                  unit='characters'
                  value={maxChunkLength}
                  onChange={setMaxChunkLength}
                />
                <OverlapInput
                  unit='tokens'
                  unit='characters'
                  value={overlap}
                  min={1}
                  onChange={setOverlap}
@@ -756,7 +756,7 @@ const StepTwo = ({
                        })}
                      />
                      <MaxLengthInput
                        unit='tokens'
                        unit='characters'
                        value={parentChildConfig.parent.maxLength}
                        onChange={value => setParentChildConfig({
                          ...parentChildConfig,
@@ -803,7 +803,7 @@ const StepTwo = ({
                    })}
                  />
                  <MaxLengthInput
                    unit='tokens'
                    unit='characters'
                    value={parentChildConfig.child.maxLength}
                    onChange={value => setParentChildConfig({
                      ...parentChildConfig,