@@ -280,16 +280,16 @@ llama_context::llama_context(

         // simulate full KV cache
-        const auto mstate = memory->init_full();
-        if (!mstate) {
+        const auto mctx = memory->init_full();
+        if (!mctx) {
             throw std::runtime_error("failed to initialize KV cache");
         }

         cross.v_embd.clear();

         // reserve pp graph first so that buffers are only allocated once
         {
-            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get());
+            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
             if (!gf) {
                 throw std::runtime_error("failed to allocate compute pp buffers");
             }
@@ -300,7 +300,7 @@ llama_context::llama_context(

         // reserve with tg graph to get the number of splits and nodes
         {
-            auto * gf = graph_reserve(1, 1, 1, mstate.get());
+            auto * gf = graph_reserve(1, 1, 1, mctx.get());
             if (!gf) {
                 throw std::runtime_error("failed to allocate compute tg buffers");
             }
@@ -311,7 +311,7 @@ llama_context::llama_context(

         // reserve again with pp graph to avoid ggml-alloc reallocations during inference
         {
-            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get());
+            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
             if (!gf) {
                 throw std::runtime_error("failed to allocate compute pp buffers");
             }
@@ -444,8 +444,8 @@ bool llama_context::kv_self_update(bool optimize) {
         optimize |= memory_force_optimize;
         memory_force_optimize = false;

-        const auto mstate = memory->init_update(this, optimize);
-        switch (mstate->get_status()) {
+        const auto mctx = memory->init_update(this, optimize);
+        switch (mctx->get_status()) {
             case LLAMA_MEMORY_STATUS_SUCCESS:
                 {
                     // noop
@@ -463,22 +463,22 @@ bool llama_context::kv_self_update(bool optimize) {
                 }
         }

-        if (!mstate->apply()) {
+        if (!mctx->apply()) {
             LLAMA_LOG_ERROR("%s: failed to apply memory update\n", __func__);
         }
     }

     // if the memory module did any computation, we have to reserve a new worst-case graph
     {
-        const auto mstate = memory->init_full();
-        if (!mstate) {
-            throw std::runtime_error("failed to initialize memory state");
+        const auto mctx = memory->init_full();
+        if (!mctx) {
+            throw std::runtime_error("failed to initialize memory context");
         }

         const uint32_t n_seqs   = cparams.n_seq_max;
         const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

-        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get());
+        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
         if (!gf) {
             LLAMA_LOG_ERROR("%s: failed to reserve graph after the memory update\n", __func__);
         }
@@ -678,9 +678,9 @@ bool llama_context::apply_adapter_cvec(
     return cvec.apply(model, data, len, n_embd, il_start, il_end);
 }

-llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_state_i * mstate, ggml_status & ret) {
-    if (mstate && !mstate->apply()) {
-        LLAMA_LOG_ERROR("%s: failed to apply memory state\n", __func__);
+llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
+    if (mctx && !mctx->apply()) {
+        LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
         ret = GGML_STATUS_FAILED;
         return nullptr;
     }
@@ -692,7 +692,7 @@ llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch,
         return nullptr;
     }

-    auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mstate);
+    auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mctx);
     if (!res) {
         LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__);
         ret = GGML_STATUS_FAILED;
@@ -933,21 +933,21 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // handle any pending defrags/shifts
     kv_self_update(false);

-    llama_memory_state_ptr mstate;
+    llama_memory_context_ptr mctx;

     while (true) {
-        mstate = memory->init_batch(*balloc, cparams.n_ubatch, output_all);
-        if (!mstate) {
+        mctx = memory->init_batch(*balloc, cparams.n_ubatch, output_all);
+        if (!mctx) {
             return -2;
         }

-        switch (mstate->get_status()) {
+        switch (mctx->get_status()) {
             case LLAMA_MEMORY_STATUS_SUCCESS:
                 {
                 } break;
             case LLAMA_MEMORY_STATUS_NO_UPDATE:
                 {
-                    LLAMA_LOG_ERROR("%s: unexpected memory state status: %d\n", __func__, mstate->get_status());
+                    LLAMA_LOG_ERROR("%s: unexpected memory context status: %d\n", __func__, mctx->get_status());

                     return -2;
                 }
@@ -987,7 +987,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     int64_t n_outputs_prev = 0;

     do {
-        const auto & ubatch = mstate->get_ubatch();
+        const auto & ubatch = mctx->get_ubatch();

         // count the outputs in this ubatch
         {
@@ -1009,7 +1009,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
         ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

         ggml_status status;
-        const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mstate.get(), status);
+        const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);

         if (!res) {
             // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
@@ -1126,7 +1126,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
         }

         n_outputs_prev += n_outputs;
-    } while (mstate->next());
+    } while (mctx->next());

     // set to total number of outputs in the batch, for use in llama_get_logits_ith
     n_outputs = n_outputs_all;
@@ -1292,7 +1292,7 @@ ggml_cgraph * llama_context::graph_init() {
     return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false);
 }

-ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate) {
+ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
     LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);

     if (n_tokens % n_seqs != 0) {
@@ -1312,7 +1312,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
     llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);

     auto * gf = graph_init();
-    auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mstate);
+    auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx);

     this->n_outputs = save_n_outputs;

@@ -1333,11 +1333,11 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
 }

 llm_graph_result_ptr llama_context::graph_build(
-                   ggml_context * ctx,
-                    ggml_cgraph * gf,
-             const llama_ubatch & ubatch,
-                 llm_graph_type   gtype,
-     const llama_memory_state_i * mstate) {
+                     ggml_context * ctx,
+                      ggml_cgraph * gf,
+               const llama_ubatch & ubatch,
+                   llm_graph_type   gtype,
+     const llama_memory_context_i * mctx) {
     return model.build_graph(
         {
             /*.ctx         =*/ ctx,
@@ -1349,7 +1349,7 @@ llm_graph_result_ptr llama_context::graph_build(
             /*.backend_cpu =*/ backend_cpu,
             /*.cvec        =*/ &cvec,
             /*.loras       =*/ &loras,
-            /*.mstate      =*/ mstate,
+            /*.mctx        =*/ mctx,
             /*.cross       =*/ &cross,
             /*.n_outputs   =*/ n_outputs,
             /*.cb          =*/ graph_get_cb(),
@@ -2042,8 +2042,8 @@ void llama_context::opt_epoch_iter(

         uint32_t n_outputs_all = n_tokens_all;

-        auto mstate = memory->init_batch(*balloc, cparams.n_ubatch, true);
-        if (!mstate || mstate->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
+        auto mctx = memory->init_batch(*balloc, cparams.n_ubatch, true);
+        if (!mctx || mctx->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
             LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__);
             break;
         }
@@ -2056,17 +2056,17 @@ void llama_context::opt_epoch_iter(

         uint32_t pos_batch = 0;
         do {
-            const auto & ubatch = mstate->get_ubatch();
+            const auto & ubatch = mctx->get_ubatch();

             n_outputs = ubatch.n_tokens;

-            if (!mstate->apply()) {
-                LLAMA_LOG_ERROR("%s: failed to update the memory state\n", __func__);
+            if (!mctx->apply()) {
+                LLAMA_LOG_ERROR("%s: failed to update the memory context\n", __func__);
                 break;
             }

             auto * gf = graph_init();
-            auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mstate.get());
+            auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get());

             struct ggml_context * ctx_compute_opt;
             {
@@ -2101,7 +2101,7 @@ void llama_context::opt_epoch_iter(
             ggml_free(ctx_compute_opt);

             pos_batch += ubatch.n_tokens;
-        } while (mstate->next());
+        } while (mctx->next());
     }
 }
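For context on the renamed interface, here is a minimal sketch of how the memory context is driven during decode, assuming only the call pattern visible in the hunks above (init_batch, then get_status / apply / next per ubatch). The types and the helper function below are simplified stand-ins for illustration, not the actual llama.cpp declarations.

// Sketch only: simplified stand-ins mirroring the calls shown in the diff above.
#include <cstdio>
#include <memory>

enum llama_memory_status {
    LLAMA_MEMORY_STATUS_SUCCESS,
    LLAMA_MEMORY_STATUS_NO_UPDATE,   // the two statuses that appear in the diff; others omitted
};

struct llama_memory_context_i {
    virtual ~llama_memory_context_i() = default;
    virtual llama_memory_status get_status() const = 0; // outcome of preparing the batch
    virtual bool apply() = 0;                            // commit the pending memory update
    virtual bool next()  = 0;                            // advance to the next ubatch
};

using llama_memory_context_ptr = std::unique_ptr<llama_memory_context_i>;

// Shape of the loop in llama_context::decode(): prepare the batch once, then
// apply the memory context and compute one ubatch at a time until next() is false.
static int drive_memory_context(llama_memory_context_ptr mctx) {
    if (!mctx || mctx->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
        return -2;
    }
    do {
        if (!mctx->apply()) {
            std::fprintf(stderr, "failed to apply memory context\n");
            return -3;
        }
        // ... build and compute the graph for the current ubatch here ...
    } while (mctx->next());
    return 0;
}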