[feature] litellm provider (#19) · anaconda/anaconda-ai@2327dee · GitHub
Commit 2327dee

[feature] litellm provider (#19)
* litellm provider completions and streaming
* types
* cast the dict
* async all of this is openai-like
* handle kwargs
* readme
* more types
* any
* try again
* oh, the client kwarg
* mypy issues with async iterators python/mypy#12662
* ugh!
* wrong place
1 parent 0d27710 commit 2327dee

File tree

3 files changed: +225 −1 lines changed


README.md

Lines changed: 43 additions & 0 deletions
@@ -14,6 +14,9 @@ Below you will find documentation for
 * [Integration with LLM CLI](#llm)
 * [Anaconda Model Cache SDK](#sdk)
 * [Langchain](#langchain)
+* [LlamaIndex](#llamaindex)
+* [LiteLLM](#litellm)
+* [DSPy](#dspy)
 * [PandasAI](#pandasai)
 * [Panel ChatInterface](#panel)
 * [Appendix: model download path](#download-path)
@@ -303,6 +306,46 @@ The `AnacondaModel` class supports the following arguments
 * `temperature`: Optional temperature to apply to all completions and chats (default is 0.1)
 * `max_tokens`: Optional Max tokens to predict (default is to let the model decide when to finish)
 
+## LiteLLM
+
+This provides a `CustomLLM` provider for use with `litellm`. Because litellm does not currently support entrypoints to register the provider, the user must import the module first.
+
+```python
+import litellm
+import anaconda_models.litellm
+
+response = litellm.completion(
+    'anaconda/openhermes-2.5-mistral-7b/q4_k_m',
+    messages=[{'role': 'user', 'content': 'what is pi?'}]
+)
+```
+
+Supported usage:
+
+* `completion` (with and without `stream=True`)
+* `acompletion` (with and without `stream=True`)
+* Most OpenAI [inference parameters](https://docs.litellm.ai/docs/completion/input)
+* `n` (number of completions) is not supported
+* llama.cpp server options are passed as a dictionary called `llama_cpp_kwargs` (see above)
+
+## DSPy
+
+Since DSPy uses LiteLLM, Anaconda models can be used with dspy.
+Streaming and async are supported for raw LLM calls and for modules
+like Predict or ChainOfThought.
+
+```python
+import dspy
+import anaconda_models.litellm
+
+lm = dspy.LM('anaconda/openhermes-2.5-mistral-7b/q4_k_m')
+dspy.configure(lm=lm)
+
+chain = dspy.ChainOfThought("question -> answer")
+chain(question="Who are you?")
+```
+
 ## PandasAI
 
 [PandasAI](https://github.com/Sinaptik-AI/pandas-ai): chat with data
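
Not part of the diff: a minimal sketch of the streaming usage and the `llama_cpp_kwargs` pass-through described in the new README section above. The `n_ctx` server option is only an illustrative assumption, not something taken from the commit.

```python
import litellm
import anaconda_models.litellm  # importing the module registers the "anaconda" provider

# Streaming completion; llama.cpp server options ride along as a single dict.
# NOTE: n_ctx is an assumed llama.cpp option used purely for illustration.
stream = litellm.completion(
    'anaconda/openhermes-2.5-mistral-7b/q4_k_m',
    messages=[{'role': 'user', 'content': 'what is pi?'}],
    stream=True,
    llama_cpp_kwargs={'n_ctx': 4096},
)
for chunk in stream:
    print(chunk.choices[0].delta.content or '', end='')
```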

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -29,7 +29,7 @@ anaconda = "anaconda_models.llm"
 
 [project.optional-dependencies]
 all = [
-    "anaconda-models[langchain,llm,llama-index,panel]"
+    "anaconda-models[langchain,litellm,llm,llama-index,panel]"
 ]
 dev = [
     "mypy",
@@ -40,6 +40,7 @@ dev = [
     "types-requests"
 ]
 langchain = ["langchain-openai>=0.2.8"]
+litellm = ["litellm"]
 llama-index = ["llama-index-llms-openai"]
 llm = ["llm", "click"]
 pandasai = ["pandasai>=2.4"]
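
Not part of the diff: the new `litellm` extra only declares the dependency. As a small, hypothetical sketch, one way to check that the extra is installed before importing the provider module (which imports `litellm` at module level):

```python
# Illustrative only: verify the optional dependency from the "litellm" extra is
# available before importing the provider module, which needs litellm at import time.
import importlib.util

if importlib.util.find_spec("litellm") is None:
    raise SystemExit('litellm is missing; install the extra: pip install "anaconda-models[litellm]"')

import anaconda_models.litellm  # noqa: E402  # registers the custom provider on import
```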

src/anaconda_models/litellm.py

Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@

```python
from typing import Callable, Iterator, Optional, Any, Union, cast, AsyncIterator

import litellm
from httpx import Timeout
from litellm.llms.custom_httpx.http_handler import HTTPHandler, AsyncHTTPHandler
from litellm.llms.custom_llm import CustomLLM
from litellm.types.utils import ModelResponse, GenericStreamingChunk
from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper

from anaconda_models.core import (
    AnacondaQuantizedModelCache,
    AnacondaQuantizedModelService,
)


class AnacondaLLM(CustomLLM):
    _model: Optional[AnacondaQuantizedModelCache] = None
    _service: Optional[AnacondaQuantizedModelService] = None

    def _prepare_inference_kwargs(self, optional_params: dict) -> dict:
        # Drop litellm's streaming flags; each method decides whether to stream.
        inference_kwargs = optional_params.copy()
        _ = inference_kwargs.pop("stream", None)
        _ = inference_kwargs.pop("stream_options", None)
        return inference_kwargs

    def completion(
        self,
        model: str,
        messages: list,
        api_base: str,
        custom_prompt_dict: dict,
        model_response: ModelResponse,
        print_verbose: Callable,
        encoding: Any,
        api_key: Any,
        logging_obj: Any,
        optional_params: dict,
        acompletion: Optional[AsyncHTTPHandler] = None,
        litellm_params: Optional[Any] = None,
        logger_fn: Optional[Any] = None,
        headers: Optional[dict] = None,
        timeout: Optional[Union[float, Timeout]] = None,
        client: Optional[HTTPHandler] = None,
    ) -> ModelResponse:
        # Start the cached model's llama.cpp server, forwarding any llama_cpp_kwargs.
        _model = AnacondaQuantizedModelCache(name=model)
        _service = _model.start(**optional_params.pop("llama_cpp_kwargs", {}))
        _client = _service.openai_client

        inference_kwargs = self._prepare_inference_kwargs(optional_params)
        response = _client.chat.completions.create(
            messages=messages, model=model, **inference_kwargs
        )
        mresponse = ModelResponse(**response.model_dump())
        # Shut down the llama.cpp server process once the response is complete.
        _service.options["Process"].terminate()
        return mresponse

    def streaming(
        self,
        model: str,
        messages: list,
        api_base: str,
        custom_prompt_dict: dict,
        model_response: ModelResponse,
        print_verbose: Callable,
        encoding: Any,
        api_key: Any,
        logging_obj: Any,
        optional_params: dict,
        acompletion: Optional[AsyncHTTPHandler] = None,
        litellm_params: Optional[Any] = None,
        logger_fn: Optional[Any] = None,
        headers: Optional[dict] = None,
        timeout: Optional[Union[float, Timeout]] = None,
        client: Optional[HTTPHandler] = None,
    ) -> Iterator[GenericStreamingChunk]:
        _model = AnacondaQuantizedModelCache(name=model)
        _service = _model.start(**optional_params.pop("llama_cpp_kwargs", {}))
        _client = _service.openai_client

        inference_kwargs = self._prepare_inference_kwargs(optional_params)
        response = _client.chat.completions.create(
            messages=messages, model=model, stream=True, **inference_kwargs
        )
        # Wrap the OpenAI-style stream so litellm can translate each chunk.
        wrapped = CustomStreamWrapper(
            custom_llm_provider="openai",
            completion_stream=response,
            model=model,
            logging_obj=logging_obj,
        )

        for chunk in wrapped:
            handled = cast(
                GenericStreamingChunk,
                wrapped.handle_openai_chat_completion_chunk(chunk),
            )
            yield handled

        _service.options["Process"].terminate()

    async def acompletion(
        self,
        model: str,
        messages: list,
        api_base: str,
        custom_prompt_dict: dict,
        model_response: ModelResponse,
        print_verbose: Callable,
        encoding: Any,
        api_key: Any,
        logging_obj: Any,
        optional_params: dict,
        acompletion: Optional[AsyncHTTPHandler] = None,
        litellm_params: Optional[Any] = None,
        logger_fn: Optional[Any] = None,
        headers: Optional[dict] = None,
        timeout: Optional[Union[float, Timeout]] = None,
        client: Optional[AsyncHTTPHandler] = None,
    ) -> ModelResponse:
        _model = AnacondaQuantizedModelCache(name=model)
        _service = _model.start(**optional_params.pop("llama_cpp_kwargs", {}))
        _client = _service.openai_async_client

        inference_kwargs = self._prepare_inference_kwargs(optional_params)
        response = await _client.chat.completions.create(
            messages=messages, model=model, **inference_kwargs
        )
        mresponse = ModelResponse(**response.model_dump())
        _service.options["Process"].terminate()
        return mresponse

    async def astreaming(  # type: ignore
        self,
        model: str,
        messages: list,
        api_base: str,
        custom_prompt_dict: dict,
        model_response: ModelResponse,
        print_verbose: Callable,
        encoding: Any,
        api_key: Any,
        logging_obj: Any,
        optional_params: dict,
        acompletion: Optional[AsyncHTTPHandler] = None,
        litellm_params: Optional[Any] = None,
        logger_fn: Optional[Any] = None,
        headers: Optional[dict] = None,
        timeout: Optional[Union[float, Timeout]] = None,
        client: Optional[AsyncHTTPHandler] = None,
    ) -> AsyncIterator[GenericStreamingChunk]:
        _model = AnacondaQuantizedModelCache(name=model)
        _service = _model.start(**optional_params.pop("llama_cpp_kwargs", {}))
        _client = _service.openai_async_client

        inference_kwargs = self._prepare_inference_kwargs(optional_params)
        response = await _client.chat.completions.create(
            messages=messages, model=model, stream=True, **inference_kwargs
        )
        wrapped = CustomStreamWrapper(
            custom_llm_provider="openai",
            completion_stream=response,
            model=model,
            logging_obj=logging_obj,
        )

        async for chunk in wrapped:
            handled = cast(
                GenericStreamingChunk,
                wrapped.handle_openai_chat_completion_chunk(chunk),
            )
            yield handled

        _service.options["Process"].terminate()


# This should be moved to an entrypoint if implemented
# https://github.com/BerriAI/litellm/issues/7733
anaconda_llm = AnacondaLLM()
litellm.custom_provider_map.append(
    {"provider": "anaconda", "custom_handler": anaconda_llm}
)
```
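
Not part of the commit: with the provider appended to `litellm.custom_provider_map` above, the async handlers can be exercised directly through `litellm.acompletion`. A minimal sketch, assuming the locally cached `anaconda/openhermes-2.5-mistral-7b/q4_k_m` model used in the README examples:

```python
import asyncio

import litellm
import anaconda_models.litellm  # import side effect registers the "anaconda" provider


async def main() -> None:
    # Non-streaming async completion, handled by AnacondaLLM.acompletion.
    response = await litellm.acompletion(
        'anaconda/openhermes-2.5-mistral-7b/q4_k_m',
        messages=[{'role': 'user', 'content': 'what is pi?'}],
    )
    print(response.choices[0].message.content)

    # Streaming async completion, handled by AnacondaLLM.astreaming.
    stream = await litellm.acompletion(
        'anaconda/openhermes-2.5-mistral-7b/q4_k_m',
        messages=[{'role': 'user', 'content': 'what is pi?'}],
        stream=True,
    )
    async for chunk in stream:
        print(chunk.choices[0].delta.content or '', end='')


asyncio.run(main())
```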
