diff --git a/crates/edgen_core/src/llm.rs b/crates/edgen_core/src/llm.rs index 3be07d4..61ee192 100644 --- a/crates/edgen_core/src/llm.rs +++ b/crates/edgen_core/src/llm.rs @@ -49,6 +49,7 @@ pub struct CompletionArgs { pub one_shot: bool, pub seed: Option, pub frequency_penalty: f32, + pub context_hint: Option, } impl Default for CompletionArgs { @@ -58,6 +59,7 @@ impl Default for CompletionArgs { one_shot: false, seed: None, frequency_penalty: 0.0, + context_hint: None, } } } diff --git a/crates/edgen_rt_llama_cpp/src/lib.rs b/crates/edgen_rt_llama_cpp/src/lib.rs index 64effbd..a4ff332 100644 --- a/crates/edgen_rt_llama_cpp/src/lib.rs +++ b/crates/edgen_rt_llama_cpp/src/lib.rs @@ -247,7 +247,7 @@ impl UnloadingModel { //params.seed = args.seed; params.n_threads = threads; params.n_threads_batch = threads; - params.n_ctx = CONTEXT_SIZE; + params.n_ctx = args.context_hint.unwrap_or(CONTEXT_SIZE); let mut session = model_guard .create_session(params) @@ -306,7 +306,7 @@ impl UnloadingModel { //params.seed = args.seed; params.n_threads = threads; params.n_threads_batch = threads; - params.n_ctx = CONTEXT_SIZE; + params.n_ctx = args.context_hint.unwrap_or(CONTEXT_SIZE); let session = model_guard .create_session(params) diff --git a/crates/edgen_server/src/openai_shim.rs b/crates/edgen_server/src/openai_shim.rs index c8a6ec7..ae580ce 100644 --- a/crates/edgen_server/src/openai_shim.rs +++ b/crates/edgen_server/src/openai_shim.rs @@ -367,6 +367,13 @@ pub struct CreateChatCompletionRequest<'a> { /// Indicate if this is an isolated request, with no associated past or future context. This may allow for /// optimisations in some implementations. Default: `false` pub one_shot: Option, + + /// A hint for how big a context will be. + /// + /// # Warning + /// An unsound hint may severely drop performance and/or inference quality, and in some cases even cause Edgen + /// to crash. Do not set this value unless you know what you are doing. + pub context_hint: Option, } /// A message in a chat completion. @@ -633,6 +640,7 @@ pub async fn chat_completions( let mut args = CompletionArgs { prompt: untokenized_context, seed: req.seed, + context_hint: req.context_hint, ..Default::default() };