Browse Source

fix: split chunks return empty strings (#2197)

tags/0.5.1
takatost 1 year ago
parent
commit
6cf93379b3
No account linked to committer's email address

+ 3
- 1
api/core/indexing_runner.py View File

else:
page_content = page_content
document_node.page_content = page_content
split_documents.append(document_node)

if document_node.page_content:
split_documents.append(document_node)
all_documents.extend(split_documents)
# processing qa document
if document_form == 'qa_model':

+ 3
- 3
api/core/model_runtime/model_providers/azure_openai/text_embedding/text_embedding.py View File

import base64
import copy
import time
from typing import Optional, Tuple
from typing import Optional, Tuple, Union


import numpy as np
import tiktoken
embeddings_batch, embedding_used_tokens = self._embedding_invoke(
model=model,
client=client,
texts=[""],
texts="",
extra_model_kwargs=extra_model_kwargs
)


return ai_model_entity.entity


@staticmethod
def _embedding_invoke(model: str, client: AzureOpenAI, texts: list[str],
def _embedding_invoke(model: str, client: AzureOpenAI, texts: Union[list[str], str],
extra_model_kwargs: dict) -> Tuple[list[list[float]], int]:
response = client.embeddings.create(
input=texts,

+ 4
- 1
api/core/model_runtime/model_providers/cohere/text_embedding/text_embedding.py View File

embeddings_batch, embedding_used_tokens = self._embedding_invoke(
model=model,
credentials=credentials,
texts=[""]
texts=[" "]
)


used_tokens += embedding_used_tokens
:param text: text to tokenize
:return:
"""
if not text:
return Tokens([], [], {})

# initialize client # initialize client
client = cohere.Client(credentials.get('api_key'))



+ 3
- 3
api/core/model_runtime/model_providers/openai/text_embedding/text_embedding.py View File

import base64
import time
from typing import Optional, Tuple
from typing import Optional, Tuple, Union


import numpy as np
import tiktoken
embeddings_batch, embedding_used_tokens = self._embedding_invoke(
model=model,
client=client,
texts=[""],
texts="",
extra_model_kwargs=extra_model_kwargs
)


except Exception as ex:
raise CredentialsValidateFailedError(str(ex))


def _embedding_invoke(self, model: str, client: OpenAI, texts: list[str],
def _embedding_invoke(self, model: str, client: OpenAI, texts: Union[list[str], str],
extra_model_kwargs: dict) -> Tuple[list[list[float]], int]:
"""
Invoke embedding model

Loading…
Cancel
Save