from typing import Any

from core.helper import ssrf_proxy
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.tool.builtin_tool import BuiltinTool


class JinaTokenizerTool(BuiltinTool):
    _jina_tokenizer_endpoint = 'https://tokenize.jina.ai/'

    def _invoke(
        self,
        user_id: str,
        tool_parameters: dict[str, Any],
    ) -> ToolInvokeMessage:
        content = tool_parameters['content']
        body = {
            "content": content
        }

        headers = {
            'Content-Type': 'application/json'
        }

        # The API key is optional; attach it as a Bearer token only when one
        # is configured in the tool credentials.
        if self.runtime.credentials.get('api_key'):
            headers['Authorization'] = "Bearer " + self.runtime.credentials.get('api_key')

        # Optional flags are only included in the request body when enabled.
        if tool_parameters.get('return_chunks', False):
            body['return_chunks'] = True

        if tool_parameters.get('return_tokens', False):
            body['return_tokens'] = True

        if tokenizer := tool_parameters.get('tokenizer'):
            body['tokenizer'] = tokenizer

        # Route the request through the SSRF-safe proxy rather than calling
        # the endpoint directly.
        response = ssrf_proxy.post(
            self._jina_tokenizer_endpoint,
            headers=headers,
            json=body,
        )

        return self.create_json_message(response.json())
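# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the tool file above). It
# reproduces the HTTP exchange _invoke performs, assuming the public endpoint
# accepts the same JSON body; `requests` stands in for Dify's internal
# ssrf_proxy helper, and the API key line is a placeholder.
# ---------------------------------------------------------------------------
import requests

body = {
    "content": "Hello, world! A short text to tokenize.",
    "return_tokens": True,       # include token strings and their ids
    "return_chunks": True,       # segment the text into chunks
    "tokenizer": "cl100k_base",  # one of the select options in the YAML below
}
headers = {"Content-Type": "application/json"}
# Optional, mirroring the credential handling above:
# headers["Authorization"] = "Bearer <your-jina-api-key>"

response = requests.post("https://tokenize.jina.ai/", headers=headers, json=body)
response.raise_for_status()
print(response.json())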
identity:
  name: jina_tokenizer
  author: hjlarry
  label:
    en_US: JinaTokenizer
description:
  human:
    en_US: Free API to tokenize text and segment long text into chunks.
    zh_Hans: 免费的API可以将文本tokenize,也可以将长文本分割成多个部分。
  llm: Free API to tokenize text and segment long text into chunks.
parameters:
  - name: content
    type: string
    required: true
    label:
      en_US: Content
      zh_Hans: 内容
    llm_description: the content to tokenize or segment
    form: llm
  - name: return_tokens
    type: boolean
    required: false
    label:
      en_US: Return the tokens
      zh_Hans: 是否返回tokens
    human_description:
      en_US: Return the tokens and their corresponding ids in the response.
      zh_Hans: 返回tokens及其对应的ids。
    form: form
  - name: return_chunks
    type: boolean
    label:
      en_US: Return the chunks
      zh_Hans: 是否分块
    human_description:
      en_US: Chunk the input into semantically meaningful segments, handling a wide variety of text types and edge cases based on common structural cues.
      zh_Hans: 将输入分块为具有语义意义的片段,同时根据常见的结构线索处理各种文本类型和边缘情况。
    form: form
  - name: tokenizer
    type: select
    options:
      - value: cl100k_base
        label:
          en_US: cl100k_base
      - value: o200k_base
        label:
          en_US: o200k_base
      - value: p50k_base
        label:
          en_US: p50k_base
      - value: r50k_base
        label:
          en_US: r50k_base
      - value: p50k_edit
        label:
          en_US: p50k_edit
      - value: gpt2
        label:
          en_US: gpt2
    label:
      en_US: Tokenizer
    human_description:
      en_US: cl100k_base - gpt-4, gpt-3.5-turbo, gpt-3.5; o200k_base - gpt-4o, gpt-4o-mini; p50k_base - text-davinci-003, text-davinci-002
    form: form
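# ---------------------------------------------------------------------------
# Response sketch (illustration only, not part of the YAML file above). The
# tool returns response.json() wrapped in a JSON message; the Python dict
# below is an assumption about the tokenize.jina.ai payload shape, not a
# guaranteed contract, and exact keys and values may differ.
# ---------------------------------------------------------------------------
# {
#     "num_tokens": 11,             # token count under the chosen tokenizer
#     "tokenizer": "cl100k_base",
#     "usage": {"tokens": 0},
#     "num_chunks": 1,              # chunk fields appear when return_chunks is true
#     "chunk_positions": [[0, 40]],
#     "chunks": ["Hello, world! A short text to tokenize."],
#     "tokens": [...]               # appears when return_tokens is true
# }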