@@ -0,0 +1,43 @@
from typing import Any

from core.helper import ssrf_proxy
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.tool.builtin_tool import BuiltinTool


class JinaTokenizerTool(BuiltinTool):
    _jina_tokenizer_endpoint = 'https://tokenize.jina.ai/'

    def _invoke(
        self,
        user_id: str,
        tool_parameters: dict[str, Any],
    ) -> ToolInvokeMessage:
        content = tool_parameters['content']
        body = {
            "content": content
        }

        headers = {
            'Content-Type': 'application/json'
        }

        if 'api_key' in self.runtime.credentials and self.runtime.credentials.get('api_key'):
            headers['Authorization'] = "Bearer " + self.runtime.credentials.get('api_key')

        if tool_parameters.get('return_chunks', False):
            body['return_chunks'] = True

        if tool_parameters.get('return_tokens', False):
            body['return_tokens'] = True

        if tokenizer := tool_parameters.get('tokenizer'):
            body['tokenizer'] = tokenizer

        response = ssrf_proxy.post(
            self._jina_tokenizer_endpoint,
            headers=headers,
            json=body,
        )

        return self.create_json_message(response.json())
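For reference, a minimal standalone sketch of the same request the tool assembles above, using plain `requests` in place of Dify's internal `ssrf_proxy` wrapper. The `JINA_API_KEY` environment variable and the sample content are illustrative only.

```python
# Standalone sketch of the request built by JinaTokenizerTool._invoke,
# sent with plain `requests` instead of Dify's ssrf_proxy helper.
import os
import requests

body = {
    "content": "Jina AI provides a free tokenizer API.",
    "return_chunks": True,     # ask the API to split the text into chunks
    "return_tokens": True,     # ask for the tokens and their ids as well
    "tokenizer": "cl100k_base",
}

headers = {"Content-Type": "application/json"}
api_key = os.getenv("JINA_API_KEY")  # optional, mirrors the tool's credential check
if api_key:
    headers["Authorization"] = "Bearer " + api_key

response = requests.post("https://tokenize.jina.ai/", headers=headers, json=body)
print(response.json())
```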
@@ -0,0 +1,64 @@
identity:
  name: jina_tokenizer
  author: hjlarry
  label:
    en_US: JinaTokenizer
description:
  human:
    en_US: Free API to tokenize text and segment long text into chunks.
    zh_Hans: 免费的API可以将文本tokenize,也可以将长文本分割成多个部分。
  llm: Free API to tokenize text and segment long text into chunks.
parameters:
  - name: content
    type: string
    required: true
    label:
      en_US: Content
      zh_Hans: 内容
    llm_description: the content that needs to be tokenized or segmented
    form: llm
  - name: return_tokens
    type: boolean
    required: false
    label:
      en_US: Return the tokens
      zh_Hans: 是否返回tokens
    human_description:
      en_US: Return the tokens and their corresponding ids in the response.
      zh_Hans: 返回tokens及其对应的ids。
    form: form
  - name: return_chunks
    type: boolean
    label:
      en_US: Return the chunks
      zh_Hans: 是否分块
    human_description:
      en_US: Chunking the input into semantically meaningful segments while handling a wide variety of text types and edge cases based on common structural cues.
      zh_Hans: 将输入分块为具有语义意义的片段,同时根据常见的结构线索处理各种文本类型和边缘情况。
    form: form
  - name: tokenizer
    type: select
    options:
      - value: cl100k_base
        label:
          en_US: cl100k_base
      - value: o200k_base
        label:
          en_US: o200k_base
      - value: p50k_base
        label:
          en_US: p50k_base
      - value: r50k_base
        label:
          en_US: r50k_base
      - value: p50k_edit
        label:
          en_US: p50k_edit
      - value: gpt2
        label:
          en_US: gpt2
    label:
      en_US: Tokenizer
    human_description:
      en_US: cl100k_base - gpt-4,gpt-3.5-turbo,gpt-3.5; o200k_base - gpt-4o,gpt-4o-mini; p50k_base - text-davinci-003,text-davinci-002
    form: form
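As a rough illustration of how the parameters declared in this YAML reach the tool, the dict below mirrors the schema and is what Dify would pass as `tool_parameters` to `JinaTokenizerTool._invoke`. The values are made up, and the commented-out call is only a sketch, since the tool is normally constructed and run by Dify's tool runtime rather than instantiated directly.

```python
# Hypothetical parameter dict matching the YAML schema above; in practice Dify
# builds it from the form/LLM inputs defined in jina_tokenizer's parameters.
tool_parameters = {
    "content": "Long document text to segment...",  # required string, form: llm
    "return_chunks": True,                           # optional boolean, form: form
    "return_tokens": False,                          # optional boolean, form: form
    "tokenizer": "o200k_base",                       # optional select, form: form
}

# message = JinaTokenizerTool(...)._invoke(user_id="user-123", tool_parameters=tool_parameters)
# `message` would be a JSON ToolInvokeMessage wrapping the endpoint's response.
```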