|
|
|
@@ -14,15 +14,17 @@ class XinferenceModelExtraParameter(object): |
|
|
|
model_handle_type: str |
|
|
|
model_ability: List[str] |
|
|
|
max_tokens: int = 512 |
|
|
|
context_length: int = 2048 |
|
|
|
support_function_call: bool = False |
|
|
|
|
|
|
|
def __init__(self, model_format: str, model_handle_type: str, model_ability: List[str],
             support_function_call: bool, max_tokens: int, context_length: int = 2048) -> None:
    """Store the extra parameters describing an Xinference model.

    Args:
        model_format: value stored as ``self.model_format``.
        model_handle_type: value stored as ``self.model_handle_type``.
        model_ability: list of ability names stored as ``self.model_ability``.
        support_function_call: flag stored as ``self.support_function_call``.
        max_tokens: value stored as ``self.max_tokens``.
        context_length: value stored as ``self.context_length``. Defaults to
            2048, the same value as the class-level ``context_length``
            attribute, so callers written against the earlier five-argument
            signature keep working.
    """
    self.model_format = model_format
    self.model_handle_type = model_handle_type
    self.model_ability = model_ability
    self.support_function_call = support_function_call
    self.max_tokens = max_tokens
    self.context_length = context_length
|
|
|
|
|
|
|
cache = {} |
|
|
|
cache_lock = Lock() |
|
|
|
@@ -57,7 +59,7 @@ class XinferenceHelper: |
|
|
|
|
|
|
|
url = path.join(server_url, 'v1/models', model_uid) |
|
|
|
|
|
|
|
# this methid is surrounded by a lock, and default requests may hang forever, so we just set a Adapter with max_retries=3 |
|
|
|
# this method is surrounded by a lock, and default requests may hang forever, so we just mount an HTTPAdapter with max_retries=3 |
|
|
|
session = Session() |
|
|
|
session.mount('http://', HTTPAdapter(max_retries=3)) |
|
|
|
session.mount('https://', HTTPAdapter(max_retries=3)) |
|
|
|
@@ -88,11 +90,14 @@ class XinferenceHelper: |
|
|
|
|
|
|
|
support_function_call = 'tools' in model_ability |
|
|
|
max_tokens = response_json.get('max_tokens', 512) |
|
|
|
|
|
|
|
context_length = response_json.get('context_length', 2048) |
|
|
|
|
|
|
|
return XinferenceModelExtraParameter( |
|
|
|
model_format=model_format, |
|
|
|
model_handle_type=model_handle_type, |
|
|
|
model_ability=model_ability, |
|
|
|
support_function_call=support_function_call, |
|
|
|
max_tokens=max_tokens |
|
|
|
max_tokens=max_tokens, |
|
|
|
context_length=context_length |
|
|
|
) |