Skip to content

Commit 9a6e8b5

Browse files
authored
Merge pull request #712 from martindevans/may-2024-binary-update
May 2024 Binary Update (Take 2)
2 parents d8514b3 + 52e4607 commit 9a6e8b5

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

60 files changed

+1627
-1783
lines changed

LLama.Examples/Examples/BatchedExecutorGuidance.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ await AnsiConsole
7979
guidance.Prompt(g);
8080

8181
// Early exit if we reach the natural end of the guided sentence
82-
if (g == model.Tokens.EOS)
82+
if (model.Tokens.IsEndOfGeneration(g))
8383
break;
8484

8585
// Update progress bar

LLama.KernelMemory/LLamaSharp.KernelMemory.csproj

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
<TargetFrameworks>net6.0;net7.0;net8.0</TargetFrameworks>
55
<ImplicitUsings>enable</ImplicitUsings>
66
<Nullable>enable</Nullable>
7-
<Version>0.11.2</Version>
7+
<Version>0.12.0</Version>
88
<Authors>Xbotter</Authors>
99
<Company>SciSharp STACK</Company>
1010
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
@@ -17,7 +17,7 @@
1717
The integration of LLamaSharp and Microsoft kernel-memory. It could make it easy to support document search for LLamaSharp model inference.
1818
</Description>
1919
<PackageReleaseNotes>
20-
v0.11.2 followed the updating of LLamaSharp.
20+
v0.12.0 released with v0.12.0 of LLamaSharp.
2121
</PackageReleaseNotes>
2222
<PackageLicenseExpression>MIT</PackageLicenseExpression>
2323
<PackageOutputPath>packages</PackageOutputPath>

LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
<ImplicitUsings>enable</ImplicitUsings>
1111
<Nullable>enable</Nullable>
1212

13-
<Version>0.11.2</Version>
13+
<Version>0.12.0</Version>
1414
<Authors>Tim Miller, Xbotter</Authors>
1515
<Company>SciSharp STACK</Company>
1616
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
@@ -23,7 +23,7 @@
2323
The integration of LLamaSharp and Microsoft semantic-kernel.
2424
</Description>
2525
<PackageReleaseNotes>
26-
v0.11.2 followed the updating of LLamaSharp.
26+
v0.12.0 released with v0.12.0 of LLamaSharp.
2727
</PackageReleaseNotes>
2828
<PackageLicenseExpression>MIT</PackageLicenseExpression>
2929
<PackageOutputPath>packages</PackageOutputPath>

LLama.Web/Common/ModelOptions.cs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,14 @@ public class ModelOptions
2929
/// <inheritdoc />
3030
public int GpuLayerCount { get; set; } = 20;
3131

32-
public uint SeqMax { get; }
32+
/// <inheritdoc />
33+
public uint SeqMax { get; set; }
3334

3435
/// <inheritdoc />
3536
public uint? Seed { get; set; } = 1686349486;
3637

37-
public bool Embeddings { get; }
38+
/// <inheritdoc />
39+
public bool Embeddings { get; set; }
3840

3941
/// <inheritdoc />
4042
public bool UseMemorymap { get; set; } = true;
@@ -102,6 +104,9 @@ public class ModelOptions
102104
/// <inheritdoc />
103105
public bool NoKqvOffload { get; set; }
104106

107+
/// <inheritdoc />
108+
public bool FlashAttention { get; set; }
109+
105110
/// <inheritdoc />
106111
public Encoding Encoding { get; set; } = Encoding.UTF8;
107112

LLama/Abstractions/IContextParams.cs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,14 @@ public interface IContextParams
109109
bool NoKqvOffload { get; }
110110

111111
/// <summary>
112+
/// Whether to use flash attention
113+
/// </summary>
114+
bool FlashAttention { get; }
115+
116+
/// <summary>
117+
/// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt; 0 to disable (default)
112118
/// defragment the KV cache if holes/size &gt; defrag_threshold, Set to <see langword="null"/> or &lt; 0 to disable (default)
119+
113120
/// </summary>
114121
float? DefragThreshold { get; }
115122

LLama/Abstractions/IModelParams.cs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
using System.Collections.Generic;
55
using System.ComponentModel;
66
using System.Linq;
7+
using System.Text;
78
using System.Text.Json;
89
using System.Text.Json.Serialization;
910
using LLama.Native;
@@ -241,6 +242,7 @@ public sealed record MetadataOverride
241242
private readonly int _valueInt;
242243
private readonly float _valueFloat;
243244
private readonly bool _valueBool;
245+
private readonly byte[]? _valueString;
244246

245247
/// <summary>
246248
/// Create a new override for an int key
@@ -278,6 +280,21 @@ public MetadataOverride(string key, bool value)
278280
Type = LLamaModelKvOverrideType.Bool;
279281
}
280282

283+
/// <summary>
284+
/// Create a new override for a string key
285+
/// </summary>
286+
/// <param name="key"></param>
287+
/// <param name="value"></param>
288+
public MetadataOverride(string key, string value)
289+
{
290+
Key = key;
291+
_valueString = Encoding.UTF8.GetBytes(value);
292+
Type = LLamaModelKvOverrideType.String;
293+
294+
if (_valueString.Length > 128)
295+
throw new ArgumentException("Value string is too long, must be < 128 UTF8 bytes", nameof(value));
296+
}
297+
281298
internal void WriteValue(ref LLamaModelMetadataOverride dest)
282299
{
283300
switch (Type)
@@ -291,6 +308,13 @@ internal void WriteValue(ref LLamaModelMetadataOverride dest)
291308
case LLamaModelKvOverrideType.Bool:
292309
dest.BoolValue = _valueBool ? -1L : 0;
293310
break;
311+
case LLamaModelKvOverrideType.String:
312+
unsafe
313+
{
314+
fixed (byte* strValPtr = dest.StringValue)
315+
new Span<byte>(_valueString!).CopyTo(new Span<byte>(strValPtr, 128));
316+
}
317+
break;
294318
default:
295319
throw new InvalidEnumArgumentException($"Unknown {nameof(LLamaModelKvOverrideType)} value: {Type}");
296320
}

LLama/Common/ModelParams.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,10 @@ public record ModelParams
9999
/// <inheritdoc />
100100
public bool NoKqvOffload { get; set; }
101101

102+
/// <inheritdoc />
103+
104+
public bool FlashAttention { get; set; }
105+
102106
/// <inheritdoc />
103107
public float? DefragThreshold { get; set; }
104108

LLama/Extensions/IContextParamsExtensions.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
5050
result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
5151
result.type_k = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
5252
result.offload_kqv = !@params.NoKqvOffload;
53+
result.flash_attention = @params.FlashAttention;
5354
result.llama_pooling_type = @params.PoolingType;
5455

5556
result.n_threads = Threads(@params.Threads);

LLama/LLamaSharp.csproj

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
<Platforms>AnyCPU;x64;Arm64</Platforms>
88
<AllowUnsafeBlocks>True</AllowUnsafeBlocks>
99

10-
<Version>0.11.2</Version>
10+
<Version>0.12.0</Version>
1111
<Authors>Rinne, Martin Evans, jlsantiago and all the other contributors in https://github.com/SciSharp/LLamaSharp/graphs/contributors.</Authors>
1212
<Company>SciSharp STACK</Company>
1313
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
@@ -22,7 +22,7 @@
2222
With the higher-level APIs and RAG support, it's convenient to deploy LLM (Large Language Model) in your application with LLamaSharp.
2323
</Description>
2424
<PackageReleaseNotes>
25-
LLamaSharp 0.11.2 fixed the performance issue of LLaVA on GPU and improved the log suppression.
25+
Updated llama.cpp version to include better support for LLama3 tokenization.
2626
</PackageReleaseNotes>
2727
<PackageLicenseExpression>MIT</PackageLicenseExpression>
2828
<PackageOutputPath>packages</PackageOutputPath>

LLama/LLamaStatelessExecutor.cs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
using System.Collections.Generic;
55
using System.Linq;
66
using System.Runtime.CompilerServices;
7+
using System.Text;
78
using System.Threading;
89
using LLama.Exceptions;
910
using LLama.Native;
@@ -123,8 +124,8 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
123124
);
124125
}
125126

126-
// Check if this is the EOS token
127-
if (id == _weights.Tokens.EOS)
127+
// Check if this token should end generation
128+
if (_weights.Tokens.IsEndOfGeneration(id))
128129
break;
129130

130131
// Decode this token into text

LLama/Native/LLamaContextParams.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,16 @@ public bool offload_kqv
151151
}
152152
private sbyte _offload_kqv;
153153

154+
/// <summary>
155+
/// whether to use flash attention
156+
/// </summary>
157+
public bool flash_attention
158+
{
159+
readonly get => Convert.ToBoolean(_flash_attention);
160+
set => _flash_attention = Convert.ToSByte(value);
161+
}
162+
private sbyte _flash_attention;
163+
154164
//todo: implement abort callback support
155165
/// <summary>
156166
/// ggml_abort_callback

LLama/Native/LLamaFtype.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,11 @@ public enum LLamaFtype
171171
/// </summary>
172172
LLAMA_FTYPE_MOSTLY_IQ1_M = 31,
173173

174+
/// <summary>
175+
/// except 1d tensors
176+
/// </summary>
177+
LLAMA_FTYPE_MOSTLY_BF16 = 32,
178+
174179
/// <summary>
175180
/// File type was not specified
176181
/// </summary>

LLama/Native/LLamaModelMetadataOverride.cs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@ public unsafe struct LLamaModelMetadataOverride
4343
/// </summary>
4444
[FieldOffset(136)]
4545
public long BoolValue;
46+
47+
/// <summary>
48+
/// Value, **must** only be used if Tag == String
49+
/// </summary>
50+
[FieldOffset(136)]
51+
public fixed byte StringValue[128];
4652
}
4753

4854
/// <summary>
@@ -65,4 +71,9 @@ public enum LLamaModelKvOverrideType
6571
/// Overriding a bool value
6672
/// </summary>
6773
Bool = 2,
74+
75+
/// <summary>
76+
/// Overriding a string value
77+
/// </summary>
78+
String = 3,
6879
}

LLama/Native/LLamaModelParams.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,16 @@ public bool use_mlock
8181
}
8282
private sbyte _use_mlock;
8383

84+
/// <summary>
85+
/// validate model tensor data
86+
/// </summary>
87+
public bool check_tensors
88+
{
89+
readonly get => Convert.ToBoolean(_check_tensors);
90+
set => _check_tensors = Convert.ToSByte(value);
91+
}
92+
private sbyte _check_tensors;
93+
8494
/// <summary>
8595
/// Create a LLamaModelParams with default values
8696
/// </summary>

LLama/Native/LLamaModelQuantizeParams.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,16 @@ public bool pure
7070
}
7171
private sbyte _pure;
7272

73+
/// <summary>
74+
/// quantize to the same number of shards
75+
/// </summary>
76+
public bool keep_split
77+
{
78+
get => Convert.ToBoolean(_keep_split);
79+
set => _keep_split = Convert.ToSByte(value);
80+
}
81+
private sbyte _keep_split;
82+
7383
/// <summary>
7484
/// pointer to importance matrix data
7585
/// </summary>

LLama/Native/LLamaVocabPreType.cs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
namespace LLama.Native;
2+
3+
/// <summary>
4+
///
5+
/// </summary>
6+
/// <remarks>llama_vocab_pre_type</remarks>
7+
internal enum LLamaVocabPreType
8+
{
9+
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
10+
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
11+
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
12+
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
13+
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
14+
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
15+
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
16+
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
17+
}

LLama/Native/NativeApi.LLava.cs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ public static unsafe partial class NativeApi
1313
/// <param name="ctxClip">Llava Model</param>
1414
/// <returns>True if validate successfully</returns>
1515
[DllImport(llavaLibraryName, EntryPoint = "llava_validate_embed_size", CallingConvention = CallingConvention.Cdecl)]
16+
[return: MarshalAs(UnmanagedType.U1)]
1617
public static extern bool llava_validate_embed_size( SafeLLamaContextHandle ctxLlama, SafeLlavaModelHandle ctxClip);
1718

1819
/// <summary>
@@ -56,7 +57,7 @@ SafeLlavaImageEmbedHandle llava_image_embed_make_with_filename(SafeLlavaModelHan
5657
/// <param name="embed">Embedding handle</param>
5758
/// <returns>True on success</returns>
5859
[DllImport(llavaLibraryName, EntryPoint = "llava_eval_image_embed", CallingConvention = CallingConvention.Cdecl)]
59-
public static extern bool llava_eval_image_embed(SafeLLamaContextHandle ctx_llama, SafeLlavaImageEmbedHandle embed,
60-
int n_batch, ref int n_past);
60+
[return: MarshalAs(UnmanagedType.U1)]
61+
public static extern bool llava_eval_image_embed(SafeLLamaContextHandle ctx_llama, SafeLlavaImageEmbedHandle embed, int n_batch, ref int n_past);
6162

6263
}

LLama/Native/NativeApi.Sampling.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ public static void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, Span<
176176
public static extern LLamaToken llama_sample_token_greedy(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates);
177177

178178
/// <summary>
179-
/// Randomly selects a token from the candidates based on their probabilities.
179+
/// Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
180180
/// </summary>
181181
/// <param name="ctx"></param>
182182
/// <param name="candidates">Pointer to LLamaTokenDataArray</param>

0 commit comments

Comments (0)