@@ -1,24 +1,23 @@
 //! Functionality for the [`LlamaSession`] struct
 
 use std::cmp::min;
-use std::ffi::c_void;
 use std::ops::{Bound, RangeBounds};
 use std::sync::atomic::{AtomicUsize, Ordering};
-use std::sync::Arc;
+use std::sync::{Arc, Mutex, RwLock};
 use std::thread;
 
-use futures::executor::block_on;
 use thiserror::Error;
-use tokio::sync::{mpsc::unbounded_channel, Mutex, RwLock};
+use tokio::sync::mpsc::unbounded_channel;
 use tracing::{error, info, trace, warn};
 
 use llama_cpp_sys::{
-    llama_beam_search, llama_context, llama_copy_state_data, llama_decode, llama_free,
+    llama_context, llama_copy_state_data, llama_decode, llama_free,
     llama_get_logits_ith, llama_get_state_size, llama_kv_cache_seq_rm, llama_set_state_data,
     llama_token_data, llama_token_data_array,
 };
 
-use crate::{detail, LlamaModel, LlamaTokenizationError, Sampler, Token};
+use crate::standard_sampler::StandardSampler;
+use crate::{LlamaModel, LlamaTokenizationError, Sampler, Token};
 
 mod completion;
 mod params;
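// A minimal sketch of the locking change this commit applies at every call
// site below: tokio's async `Mutex`/`RwLock` (acquired via `block_on`) are
// swapped for their `std::sync` counterparts (acquired via `.unwrap()`, which
// propagates lock poisoning). `Inner` and its fields here are stand-ins, not
// the crate's real session internals.

use std::sync::{Mutex, RwLock};

struct Inner {
    ctx: Mutex<usize>,        // stand-in for the guarded context pointer
    tokens: RwLock<Vec<u32>>, // stand-in for RwLock<Vec<Token>>
}

fn lock_sites(inner: &Inner) {
    // Before: let ctx = block_on(inner.ctx.lock()); — an async acquire that
    // had to be driven by futures::executor::block_on on non-async paths.
    // After: a plain blocking acquire; unwrap() surfaces lock poisoning.
    let _ctx = inner.ctx.lock().unwrap();
    let _len = inner.tokens.read().unwrap().len();
}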
@@ -172,7 +171,7 @@ impl LlamaSession {
             let err = unsafe {
                 // SAFETY: `llama_decode` will not fail for a valid `batch`, which we correctly
                 // initialized above.
-                llama_decode(block_on(self.inner.ctx.lock()).ptr, batch.handle())
+                llama_decode(self.inner.ctx.lock().unwrap().ptr, batch.handle())
             };
             if err != 0 {
                 return Err(LlamaContextError::DecodeFailed(err));
@@ -182,7 +181,7 @@ impl LlamaSession {
             last_batch_size = sequence.len();
         }
 
-        block_on(self.inner.tokens.write()).extend_from_slice(tokens);
+        self.inner.tokens.write().unwrap().extend_from_slice(tokens);
 
         self.inner
             .last_batch_size
@@ -236,34 +235,13 @@ impl LlamaSession {
             .unwrap()
     }
 
-    /// Starts generating tokens at the end of the context using llama.cpp's built-in Beam search.
-    /// TODO fix: beam search keeps going even after it should have ended
+    /// Starts generating tokens at the end of the context using a greedy
+    /// sampler
     pub fn start_completing(&mut self) -> CompletionHandle {
-        let (tx, rx) = unbounded_channel();
-        let history_size = self.context_size();
-        let session = self.clone();
-
-        info!("Generating completions with {history_size} tokens of history");
-
-        thread::spawn(move || unsafe {
-            let state = Box::new(detail::BeamSearchState { tx });
-            // SAFETY: `state_ptr` is converted back to a [`Box`] and freed in [`detail::llama_beam_search_callback`]
-            let state_ptr = Box::into_raw(state);
-
-            llama_beam_search(
-                block_on(session.inner.ctx.lock()).ptr,
-                Some(detail::llama_beam_search_callback),
-                state_ptr as *mut _ as *mut c_void,
-                1,
-                history_size as i32,
-                32_768,
-            );
-        });
-
-        CompletionHandle {
-            rx,
-            model: self.model(),
-        }
+        self.start_completing_with(
+            StandardSampler::new_greedy(),
+            self.params().n_ctx as usize - self.context_size(),
+        )
     }
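// A hedged usage sketch of the rewritten `start_completing`: it now simply
// delegates to `start_completing_with`, pairing a greedy `StandardSampler`
// with a token budget of whatever space remains in the context window.
// Assuming a `session` built through the crate's usual entry points:
//
//     session.advance_context("The sky above the port")?;
//     let handle = session.start_completing();
//     // ...which is shorthand for:
//     // session.start_completing_with(
//     //     StandardSampler::new_greedy(),
//     //     session.params().n_ctx as usize - session.context_size(),
//     // );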
 
     /// Start completion.
@@ -282,10 +260,10 @@ impl LlamaSession {
         info!("Generating completions with {history_size} tokens of history");
 
         thread::spawn(move || {
-            let context = block_on(session.inner.ctx.lock());
+            let context = session.inner.ctx.lock().unwrap();
             let vocab = session.model().vocabulary_size();
             let end_of_stream = session.model().eos();
-            let mut token_buf = block_on(session.inner.tokens.write());
+            let mut token_buf = session.inner.tokens.write().unwrap();
             let mut count = 0;
             let mut batch = Batch::new(1, 0, 1);
             let mut i = session.inner.last_batch_size.load(Ordering::SeqCst);
@@ -366,12 +344,12 @@ impl LlamaSession {
 
     /// Returns the number of tokens currently in this session's context
     pub fn context_size(&self) -> usize {
-        block_on(self.inner.tokens.read()).len()
+        self.inner.tokens.read().unwrap().len()
     }
 
     /// Returns the list of tokens in the current context
     pub fn context(&self) -> Vec<Token> {
-        block_on(self.inner.tokens.read()).clone()
+        self.inner.tokens.read().unwrap().clone()
     }
 
     /// Removes all tokens within the given range without performing any prompt
@@ -393,12 +371,12 @@ impl LlamaSession {
             Bound::Unbounded => -1,
         };
 
-        let context = block_on(self.inner.ctx.lock());
+        let context = self.inner.ctx.lock().unwrap();
 
         // -1 here to match all sequences
         unsafe { llama_kv_cache_seq_rm(context.ptr, -1, start_bound, end_bound) }
 
-        block_on(self.inner.tokens.write()).drain(range);
+        self.inner.tokens.write().unwrap().drain(range);
     }
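// Sketch of the bound mapping used by the method above. llama.cpp's
// `llama_kv_cache_seq_rm` takes half-open `[p0, p1)` token positions as
// `i32`, with `-1` meaning "unbounded" (and `-1` as the sequence id matching
// every sequence). Only the `Unbounded` arms are visible in this hunk; the
// `Included`/`Excluded` arms below are the natural reconstruction, not code
// shown in the patch:

use std::ops::{Bound, RangeBounds};

fn to_kv_bounds(range: impl RangeBounds<usize>) -> (i32, i32) {
    let start = match range.start_bound() {
        Bound::Included(&n) => n as i32,
        Bound::Excluded(&n) => n as i32 + 1,
        Bound::Unbounded => -1,
    };
    let end = match range.end_bound() {
        Bound::Included(&n) => n as i32 + 1, // inclusive end becomes exclusive
        Bound::Excluded(&n) => n as i32,
        Bound::Unbounded => -1,
    };
    (start, end)
}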
 
     /// Removes all but the first `n_tokens` tokens from the context.
@@ -415,7 +393,7 @@ impl LlamaSession {
         new_tokens: impl AsRef<[Token]>,
     ) -> Result<(), LlamaContextError> {
         let new_tokens = new_tokens.as_ref();
-        let old_tokens = block_on(self.inner.tokens.read());
+        let old_tokens = self.inner.tokens.read().unwrap();
 
         let shared_prefix = old_tokens
             .iter()
@@ -480,7 +458,7 @@ impl LlamaSession {
     /// This differs from [`LlamaSession::clone`] in that [`LlamaSession::clone`] creates a new
     /// reference to the same underlying [`LlamaSession`].
     pub fn deep_copy(&self) -> Result<LlamaSession, LlamaContextError> {
-        let ctx = self.inner.ctx.blocking_lock();
+        let ctx = self.inner.ctx.lock().unwrap();
 
         #[allow(unused_mut)]
         let mut copy = self.model().create_session(self.inner.params.clone())?;
@@ -496,13 +474,13 @@ impl LlamaSession {
             let copy_size = llama_copy_state_data(ctx.ptr, buf.as_mut_ptr());
             assert!(copy_size <= size);
             let set_size =
-                llama_set_state_data(copy.inner.ctx.blocking_lock().ptr, buf.as_mut_ptr());
+                llama_set_state_data(copy.inner.ctx.lock().unwrap().ptr, buf.as_mut_ptr());
             assert_eq!(copy_size, set_size);
         }
 
         // NOTE: Any changes to the fields of a LlamaSession may require that
         // those changes are mirrored here
-        *block_on(copy.inner.tokens.write()) = block_on(self.inner.tokens.read()).clone();
+        *copy.inner.tokens.write().unwrap() = self.inner.tokens.read().unwrap().clone();
         copy.inner.last_batch_size.store(
             self.inner.last_batch_size.load(Ordering::SeqCst),
             Ordering::SeqCst,
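// Hedged usage sketch for `deep_copy`: the state round-trip above serializes
// the whole llama.cpp session (KV cache included) into a scratch buffer and
// restores it into a fresh context, so the copy diverges independently,
// whereas `clone` merely adds another handle to the same session. Assuming an
// existing `session`:
//
//     let fork = session.deep_copy()?;
//     assert_eq!(fork.context_size(), session.context_size());
//     // Feeding more tokens to `fork` leaves `session` untouched.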
@@ -512,16 +490,8 @@ impl LlamaSession {
     }
 
     /// Returns the maximum size in bytes this session is occupying in memory.
-    ///
-    /// This function may **NOT*** be called in async environments, for an async version see [`async_memory_size`].
     pub fn memory_size(&self) -> usize {
-        let ctx = self.inner.ctx.blocking_lock();
-        unsafe { llama_get_state_size(ctx.ptr) }
-    }
-
-    /// Asynchronously returns the maximum size in bytes this session is occupying in memory.
-    pub async fn async_memory_size(&self) -> usize {
-        let ctx = self.inner.ctx.lock().await;
+        let ctx = self.inner.ctx.lock().unwrap();
         unsafe { llama_get_state_size(ctx.ptr) }
     }
 }
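// With the tokio locks gone, the blocking and async variants of `memory_size`
// collapse into one plain method. Its result is the same worst-case figure
// `deep_copy` sizes its scratch buffer with: the actual copied state may be
// smaller, hence `assert!(copy_size <= size)` above. Sketch:
//
//     let mut buf = vec![0u8; session.memory_size()]; // worst-case capacity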