@@ -314,10 +314,14 @@ async def create_completion(
314
314
else:
315
315
kwargs["logits_processor"].extend(_min_tokens_logits_processor)
316
316
317
- iterator_or_completion: Union[
318
- llama_cpp.CreateCompletionResponse,
319
- Iterator[llama_cpp.CreateCompletionStreamResponse],
320
- ] = await run_in_threadpool(llama, **kwargs)
317
+ try:
318
+ iterator_or_completion: Union[
319
+ llama_cpp.CreateCompletionResponse,
320
+ Iterator[llama_cpp.CreateCompletionStreamResponse],
321
+ ] = await run_in_threadpool(llama, **kwargs)
322
+ except Exception as err:
323
+ exit_stack.close()
324
+ raise err
321
325
322
326
if isinstance(iterator_or_completion, Iterator):
323
327
# EAFP: It's easier to ask for forgiveness than permission
@@ -344,6 +348,7 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:
344
348
ping_message_factory=_ping_message_factory,
345
349
)
346
350
else:
351
+ exit_stack.close()
347
352
return iterator_or_completion
348
353
349
354
@@ -508,9 +513,13 @@ async def create_chat_completion(
508
513
else:
509
514
kwargs["logits_processor"].extend(_min_tokens_logits_processor)
510
515
511
- iterator_or_completion: Union[
512
- llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk]
513
- ] = await run_in_threadpool(llama.create_chat_completion, **kwargs)
516
+ try:
517
+ iterator_or_completion: Union[
518
+ llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk]
519
+ ] = await run_in_threadpool(llama.create_chat_completion, **kwargs)
520
+ except Exception as err:
521
+ exit_stack.close()
522
+ raise err
514
523
515
524
if isinstance(iterator_or_completion, Iterator):
516
525
# EAFP: It's easier to ask for forgiveness than permission
0 commit comments