
Commit 5a9e13c

Merge pull request #223 from martindevans/batch_decoding
New Binaries, Improved Sampling API, Batch Decoding Prototype
2 parents f8b2c5d + db8f398 · commit 5a9e13c


42 files changed: +814 -222 lines changed

LLama.Examples/LLama.Examples.csproj

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@
     <Platforms>AnyCPU;x64</Platforms>
     <!-- Set IncludeBuiltInRuntimes to false to include your own runtime libraries and not link the defaults -->
     <IncludeBuiltInRuntimes>true</IncludeBuiltInRuntimes>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
   </PropertyGroup>

   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
LLama.Examples/NewVersion/BatchedDecoding.cs

Lines changed: 177 additions & 0 deletions
@@ -0,0 +1,177 @@
+using System.Diagnostics;
+using System.Security.Cryptography;
+using System.Text;
+using LLama.Common;
+using LLama.Native;
+
+namespace LLama.Examples.NewVersion;
+
+/// <summary>
+/// This demonstrates generating multiple replies to the same prompt, with a shared cache
+/// </summary>
+/// <remarks>Note that this is currently using the low level API directly, future work will provide a safer C# wrapper over this!</remarks>
+public class BatchedDecoding
+{
+    private const int n_parallel = 8;
+    private const int n_len = 32;
+
+    private const int top_k = 80;
+    private const float top_p = 0.8f;
+    private const float temp = 0.5f;
+
+    public static async Task Run()
+    {
+        Console.Write("Please input your model path: ");
+        var modelPath = Console.ReadLine();
+
+        Console.WriteLine("Prompt (leave blank to select automatically):");
+        var prompt = Console.ReadLine();
+        if (string.IsNullOrWhiteSpace(prompt))
+            prompt = "Not many people know that";
+
+        // Load model
+        var parameters = new ModelParams(modelPath);
+        using var model = LLamaWeights.LoadFromFile(parameters);
+
+        // Tokenize prompt
+        var prompt_tokens = model.NativeHandle.Tokenize(prompt, true, false, Encoding.UTF8);
+        var n_kv_req = prompt_tokens.Length + (n_len - prompt_tokens.Length) * n_parallel;
+
+        // Create a context
+        parameters.ContextSize = (uint)model.ContextSize;
+        parameters.BatchSize = (uint)Math.Max(n_len, n_parallel);
+        using var context = model.CreateContext(parameters);
+
+        var n_ctx = context.ContextSize;
+
+        // make sure the KV cache is big enough to hold all the prompt and generated tokens
+        if (n_kv_req > n_ctx)
+        {
+            await Console.Error.WriteLineAsync($"error: n_kv_req ({n_kv_req}) > n_ctx, the required KV cache size is not big enough\n");
+            await Console.Error.WriteLineAsync(" either reduce n_parallel or increase n_ctx\n");
+            return;
+        }
+
+        using var batch = LLamaBatchSafeHandle.Create(Math.Max(prompt_tokens.Length, n_parallel), 0, 1);
+
+        // evaluate the initial prompt
+        for (var i = 0; i < prompt_tokens.Length; i++)
+            batch.LLamaBatchAdd(prompt_tokens[i], i, new[] { (LLamaSeqId)0 }, false);
+        Debug.Assert(batch.NativeBatch.n_tokens == prompt_tokens.Length);
+
+        // llama_decode will output logits only for the last token of the prompt
+        unsafe
+        {
+            batch.NativeBatch.logits[batch.NativeBatch.n_tokens - 1] = 1;
+        }
+
+        if (context.NativeHandle.Decode(batch) != 0)
+        {
+            await Console.Error.WriteLineAsync("llama_decode failed");
+            return;
+        }
+
+        // assign the system KV cache to all parallel sequences
+        // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
+        for (var i = 1; i < n_parallel; ++i)
+        {
+            NativeApi.llama_kv_cache_seq_cp(context.NativeHandle, (LLamaSeqId)0, (LLamaSeqId)i, 0, batch.NativeBatch.n_tokens);
+        }
+
+        if (n_parallel > 1)
+        {
+            Console.WriteLine();
+            Console.WriteLine($"generating {n_parallel} sequences...");
+        }
+
+        // remember the batch index of the last token for each parallel sequence
+        // we need this to determine which logits to sample from
+        List<int> i_batch = new();
+        for (var i = 0; i < n_parallel; i++)
+            i_batch.Add(batch.NativeBatch.n_tokens - 1);
+
+        var n_cur = batch.NativeBatch.n_tokens;
+        var n_decode = 0;
+
+        var streams = new List<int>[n_parallel];
+        for (var i = 0; i < n_parallel; i++)
+            streams[i] = new();
+
+        var eos = model.EndOfSentenceToken;
+        var nl = model.NewlineToken;
+
+        var timer = new Stopwatch();
+        timer.Start();
+        while (n_cur <= n_len)
+        {
+            batch.LLamaBatchClear();
+
+            for (var i = 0; i < n_parallel; i++)
+            {
+                // Skip completed streams
+                if (i_batch[i] < 0)
+                    continue;
+
+                var n_vocab = model.VocabCount;
+                LLamaTokenDataArray candidates;
+                unsafe
+                {
+                    candidates = LLamaTokenDataArray.Create(new Span<float>(NativeApi.llama_get_logits_ith(context.NativeHandle, i_batch[i]), n_vocab));
+                }
+
+                candidates.TopK(context.NativeHandle, top_k);
+                candidates.TopP(context.NativeHandle, top_p);
+                candidates.Temperature(context.NativeHandle, temp);
+                var new_token_id = candidates.SampleToken(context.NativeHandle);
+
+                if (new_token_id == eos || new_token_id == nl)
+                {
+                    i_batch[i] = -1;
+                    Console.WriteLine($"Completed Stream {i} early");
+                    continue;
+                }
+
+                streams[i].Add(new_token_id);
+
+                i_batch[i] = batch.NativeBatch.n_tokens;
+
+                // push this new token for next evaluation
+                batch.LLamaBatchAdd(new_token_id, n_cur, new[] { (LLamaSeqId)i }, true);
+
+                n_decode++;
+            }
+
+            // all streams are finished
+            if (batch.NativeBatch.n_tokens == 0)
+            {
+                break;
+            }
+
+            n_cur++;
+
+            // evaluate the current batch with the transformer model
+            if (context.NativeHandle.Decode(batch) != 0)
+            {
+                await Console.Error.WriteLineAsync("failed to eval");
+                return;
+            }
+        }
+
+        timer.Stop();
+        Console.ForegroundColor = ConsoleColor.Yellow;
+        Console.WriteLine();
+        Console.WriteLine($"Decoded {n_decode} tokens in {timer.ElapsedMilliseconds}ms");
+        Console.WriteLine($"Rate: {n_decode / timer.Elapsed.TotalSeconds:##.000} tokens/second");
+
+        var index = 0;
+        foreach (var stream in streams)
+        {
+            var text = context.DeTokenize(stream);
+
+            Console.ForegroundColor = ConsoleColor.Green;
+            Console.Write($"{index++}. {prompt}");
+            Console.ForegroundColor = ConsoleColor.Red;
+            Console.WriteLine(text);
+        }
+    }
+}
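
Note: the TopK / TopP / Temperature / SampleToken calls in the loop above are the "Improved Sampling API" named in the commit title. The fragment below is not part of this commit; it is a minimal sketch (the helper name, signature and sampler values are illustrative) that isolates the same pipeline for a single logits position. It needs unsafe code, which the csproj change above enables.

using System;
using LLama;
using LLama.Native;

static class SamplingSketch
{
    // Hypothetical helper mirroring the loop above: wrap the logits for one batch
    // position in a LLamaTokenDataArray, apply the samplers in order, pick a token.
    public static unsafe int SampleNext(LLamaContext context, LLamaWeights model, int logitsIndex)
    {
        var logits = NativeApi.llama_get_logits_ith(context.NativeHandle, logitsIndex);
        var candidates = LLamaTokenDataArray.Create(new Span<float>(logits, model.VocabCount));

        candidates.TopK(context.NativeHandle, 40);            // keep the 40 most likely tokens
        candidates.TopP(context.NativeHandle, 0.95f);         // nucleus sampling
        candidates.Temperature(context.NativeHandle, 0.8f);   // rescale the remaining candidates
        return candidates.SampleToken(context.NativeHandle);  // draw one token id
    }
}

Each sampler call filters or rescales the candidate array in place, so the order of the calls matters.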

LLama.Examples/NewVersion/SemanticKernelChat.cs

Lines changed: 1 addition & 4 deletions
@@ -14,10 +14,7 @@ public static async Task Run()
         var modelPath = Console.ReadLine();

         // Load weights into memory
-        var parameters = new ModelParams(modelPath)
-        {
-            Seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue)),
-        };
+        var parameters = new ModelParams(modelPath);
         using var model = LLamaWeights.LoadFromFile(parameters);
         using var context = model.CreateContext(parameters);
         var ex = new InteractiveExecutor(context);

LLama.Examples/NewVersion/SemanticKernelPrompt.cs

Lines changed: 1 addition & 4 deletions
@@ -16,10 +16,7 @@ public static async Task Run()
         var modelPath = Console.ReadLine();

         // Load weights into memory
-        var parameters = new ModelParams(modelPath)
-        {
-            Seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue))
-        };
+        var parameters = new ModelParams(modelPath);
         using var model = LLamaWeights.LoadFromFile(parameters);
         var ex = new StatelessExecutor(model, parameters);


LLama.Examples/NewVersion/TalkToYourself.cs

Lines changed: 1 addition & 4 deletions
@@ -13,10 +13,7 @@ public static async Task Run()
         var modelPath = Console.ReadLine();

         // Load weights into memory
-        var @params = new ModelParams(modelPath)
-        {
-            Seed = unchecked((uint)RandomNumberGenerator.GetInt32(int.MaxValue))
-        };
+        var @params = new ModelParams(modelPath);
         using var weights = LLamaWeights.LoadFromFile(@params);

         // Create 2 contexts sharing the same weights

LLama.Examples/NewVersion/TestRunner.cs

Lines changed: 5 additions & 0 deletions
@@ -22,6 +22,7 @@ public static async Task Run()
             Console.WriteLine("12: Semantic Kernel Chat.");
             Console.WriteLine("13: Semantic Kernel Memory.");
             Console.WriteLine("14: Coding Assistant.");
+            Console.WriteLine("15: Batch Decoding.");

             while (true)
             {
@@ -88,6 +89,10 @@ public static async Task Run()
                 {
                     await CodingAssistant.Run();
                 }
+                else if (choice == 15)
+                {
+                    await BatchedDecoding.Run();
+                }
                 else
                 {
                     Console.WriteLine("Cannot parse your choice. Please select again.");

LLama.Unittest/StatelessExecutorTest.cs

Lines changed: 8 additions & 0 deletions
@@ -1,3 +1,4 @@
+using System.Diagnostics;
 using LLama.Common;
 using Xunit.Abstractions;

@@ -34,10 +35,17 @@ public async Task Stateless()
         const string question = "Question. what is a cat?\nAnswer: ";
         var @params = new InferenceParams { MaxTokens = 32, AntiPrompts = new[] { "." } };

+        var timer = new Stopwatch();
+        timer.Start();
+
         var result1 = string.Join("", await executor.InferAsync(question, @params).ToListAsync());
         var result2 = string.Join("", await executor.InferAsync(question, @params).ToListAsync());

+        timer.Stop();
+        _testOutputHelper.WriteLine($"{timer.ElapsedMilliseconds}ms");
+
         _testOutputHelper.WriteLine(result1);
+        _testOutputHelper.WriteLine(result2);

         // Check that it produced the exact same result both times
         Assert.Equal(result1, result2);

LLama.Web/Common/InferenceOptions.cs

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ public class InferenceOptions
         /// <summary>
         /// Sequences where the model will stop generating further tokens.
        /// </summary>
-        public IEnumerable<string> AntiPrompts { get; set; } = Array.Empty<string>();
+        public IReadOnlyList<string> AntiPrompts { get; set; } = Array.Empty<string>();
         /// <summary>
         /// path to file for saving/loading model eval state
         /// </summary>
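
The AntiPrompts property narrows from IEnumerable<string> to IReadOnlyList<string>. Call sites that assign an array keep compiling, because string[] implements IReadOnlyList<string>; a minimal illustration (not part of this diff, mirroring the InferenceParams usage in the test change above):

using LLama.Common;

// string[] satisfies IReadOnlyList<string>, so array initialisers still work unchanged
var inferenceParams = new InferenceParams
{
    MaxTokens = 32,
    AntiPrompts = new[] { "." }
};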

LLama.Web/Common/ModelOptions.cs

Lines changed: 2 additions & 2 deletions
@@ -111,12 +111,12 @@ public class ModelOptions
         /// <summary>
         /// RoPE base frequency
         /// </summary>
-        public float RopeFrequencyBase { get; set; } = 10000.0f;
+        public float? RopeFrequencyBase { get; set; }

         /// <summary>
         /// RoPE frequency scaling factor
         /// </summary>
-        public float RopeFrequencyScale { get; set; } = 1.0f;
+        public float? RopeFrequencyScale { get; set; }

         /// <summary>
         /// Use experimental mul_mat_q kernels

LLama/Abstractions/IContextParams.cs

Lines changed: 4 additions & 4 deletions
@@ -39,14 +39,14 @@ public interface IContextParams
     bool EmbeddingMode { get; set; }

     /// <summary>
-    /// RoPE base frequency
+    /// RoPE base frequency (null to fetch from the model)
     /// </summary>
-    float RopeFrequencyBase { get; set; }
+    float? RopeFrequencyBase { get; set; }

     /// <summary>
-    /// RoPE frequency scaling factor
+    /// RoPE frequency scaling factor (null to fetch from the model)
     /// </summary>
-    float RopeFrequencyScale { get; set; }
+    float? RopeFrequencyScale { get; set; }

     /// <summary>
     /// Use experimental mul_mat_q kernels
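
With RopeFrequencyBase and RopeFrequencyScale now nullable, leaving them unset means the values are read from the model file rather than the previous hard-coded defaults (10000.0 and 1.0). A minimal sketch, not part of this diff and assuming ModelParams implements IContextParams as elsewhere in LLamaSharp:

using LLama;
using LLama.Common;

var parameters = new ModelParams("path/to/model.gguf")
{
    // Leave RopeFrequencyBase / RopeFrequencyScale as null to use the model's own values.
    // Set them only to deliberately override what the model file specifies, e.g.:
    // RopeFrequencyBase = 10000.0f,
    // RopeFrequencyScale = 0.5f,
};

using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);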
