
Commit 02eedd9

martindevans authored (co-authored by SignalRT and m0nsky)

January 2025 Update (#1036)
* code changes for december update (not working yet)
* Changes to support up to ggml-org/llama.cpp@d408bb9
* Updated to latest llama.cpp binaries, this works on Windows CPU but needs more changes for other backends
* Updated to latest deps, fixed kernel memory failing to load
* Copy missing Mac libraries libggml-base and libggml-cpu
* Removed any mention of AVX in MacOS loading
* Added file copying for some more targets (still missing macos)
* Updated to latest set of binaries
* Fixed copy path for CUDA12 DLLs
* Compatibility with llama.cpp backend split (PR #10256) on all platforms
* Restore original comment
* Update the dependency loader for ggml-metal and ggml-blas
* Update the runtime targets for ggml-metal and ggml-blas
* Add CPU backend (fallback) dependency for the GPU backends
* Fix icons for the nuget backends
* Update nuspec files for the GPU backends
* Update BinaryReleaseId
* Update nuspec for CPU & OSX
* Update CPU nuspec to use noavx folder
* Update Runtime.targets to use noavx folder
* Update BinaryReleaseId
* CUDA & Vulkan native libraries now correctly store the detected or user defined AVX level

---------

Co-authored-by: SignalRT <[email protected]>
Co-authored-by: m0nsky <[email protected]>
1 parent f55252f commit 02eedd9

40 files changed: +563 −237 lines

LLama.Examples/Program.cs

Lines changed: 3 additions & 5 deletions
@@ -1,6 +1,5 @@
-using LLama.Native;
+using LLama.Native;
 using Spectre.Console;
-using System.Runtime.InteropServices;
 
 AnsiConsole.MarkupLineInterpolated(
     $"""
@@ -18,7 +17,7 @@ __ __ ____ __
     """);
 
 // Configure logging. Change this to `true` to see log messages from llama.cpp
-var showLLamaCppLogs = false;
+var showLLamaCppLogs = true;
 NativeLibraryConfig
     .All
     .WithLogCallback((level, message) =>
@@ -31,8 +30,7 @@ __ __ ____ __
 NativeLibraryConfig
     .All
     .WithCuda()
-    //.WithAutoDownload() // An experimental feature
-    .DryRun(out var loadedllamaLibrary, out var loadedLLavaLibrary);
+    .WithVulkan();
 
 // Calling this method forces loading to occur now.
 NativeApi.llama_empty_call();
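
The updated example boils down to the following pattern, shown here as a condensed sketch using only the calls visible in the diff: register a log callback, register the desired backends (the chaining suggests CUDA is preferred with Vulkan as a fallback, though the exact selection order is the loader's business), then force the native library to load so any backend problem surfaces immediately.

    using LLama.Native;

    // Route llama.cpp log messages to the console (gated by showLLamaCppLogs above).
    NativeLibraryConfig.All.WithLogCallback((level, message) => Console.Write($"[{level}] {message}"));

    // Register both GPU backends; the loader picks one it can actually load.
    NativeLibraryConfig.All.WithCuda().WithVulkan();

    // Loading is lazy; this empty call forces it to happen now.
    NativeApi.llama_empty_call();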

LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs

Lines changed: 1 addition & 3 deletions
@@ -31,11 +31,9 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config)
 
     var @params = new ModelParams(config.ModelPath)
     {
-        ContextSize = config.ContextSize ?? 2048,
+        ContextSize = config.ContextSize,
         GpuLayerCount = config.GpuLayerCount ?? 20,
         Embeddings = true,
-        MainGpu = config.MainGpu,
-        SplitMode = config.SplitMode,
         PoolingType = LLamaPoolingType.Mean,
     };
     _weights = LLamaWeights.LoadFromFile(@params);
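
The practical effect: ContextSize is now forwarded as-is (a null value falls through to the library default rather than being pinned to 2048), and the MainGpu/SplitMode pass-throughs are gone. A hedged usage sketch, assuming only the LLamaSharpConfig properties read in the diff above (the constructor signature is an assumption):

    // Sketch only: property names come from the diff; the ctor is assumed.
    var config = new LLamaSharpConfig("path/to/model.gguf")
    {
        ContextSize = null,    // no longer coerced to 2048; the default is used
        GpuLayerCount = 20,
    };
    var embeddings = new LLamaSharpTextEmbeddingGenerator(config);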

LLama.KernelMemory/LlamaSharpTextGenerator.cs

Lines changed: 0 additions & 2 deletions
@@ -33,8 +33,6 @@ public LlamaSharpTextGenerator(LLamaSharpConfig config)
     {
         ContextSize = config.ContextSize ?? 2048,
         GpuLayerCount = config.GpuLayerCount ?? 20,
-        MainGpu = config.MainGpu,
-        SplitMode = config.SplitMode
     };
     _weights = LLamaWeights.LoadFromFile(parameters);
     _context = _weights.CreateContext(parameters);

LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs

Lines changed: 4 additions & 10 deletions
@@ -1,21 +1,15 @@
-using LLama.Common;
 using LLamaSharp.KernelMemory;
-using Microsoft.KernelMemory.AI;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Text.RegularExpressions;
-using System.Threading.Tasks;
 using Xunit.Abstractions;
 
 namespace LLama.Unittest.KernelMemory
 {
-    public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable
+    public class LLamaSharpTextEmbeddingGeneratorTests
+        : ITextTokenizerTests, IDisposable
     {
         private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator;
 
-        public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+        public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper)
+            : base(testOutputHelper)
         {
             _embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig);
 

LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs

Lines changed: 4 additions & 14 deletions
@@ -1,25 +1,15 @@
-using LLama.Common;
 using LLamaSharp.KernelMemory;
-using Microsoft.KernelMemory.AI;
-using System;
-using System.Collections.Generic;
-using System.Diagnostics;
-using System.Linq;
-using System.Reflection.Emit;
-using System.Text;
-using System.Text.RegularExpressions;
-using System.Threading.Tasks;
 using Xunit.Abstractions;
-using Xunit.Sdk;
-using static System.Net.Mime.MediaTypeNames;
 
 namespace LLama.Unittest.KernelMemory
 {
-    public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable
+    public class LlamaSharpTextGeneratorTests
+        : ITextTokenizerTests, IDisposable
     {
         private readonly LlamaSharpTextGenerator _textGenerator;
 
-        public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+        public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper)
+            : base(testOutputHelper)
        {
             _textGenerator = new LlamaSharpTextGenerator(_lsConfig);
 

LLama.Unittest/SamplingTests.cs

Lines changed: 1 addition & 5 deletions
@@ -167,11 +167,7 @@ private static SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandle co
     var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());
 
     chain.AddPenalties(
-        vocabSize: context.VocabCount,
-        eos: context.ModelHandle.Tokens.EOS,
-        newline: context.ModelHandle.Tokens.Newline ?? 0,
-        penaltyCount: 60, repeat: 1, freq: 0, presence: 0,
-        penalizeNewline: false, ignoreEOS: false
+        penaltyCount: 60, repeat: 1, freq: 0, presence: 0
     );
 
     if (logit_bias != null) { chain.AddLogitBias(context.VocabCount, logit_bias); }
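
After this change the penalty sampler no longer takes the context-specific arguments (vocab size, EOS and newline tokens); judging by the removal, those are now resolved on the native side. Building a chain reduces to the following sketch, using only the calls visible in the diff:

    // Minimal sketch of the slimmer AddPenalties signature.
    var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());
    chain.AddPenalties(
        penaltyCount: 60,  // how many recent tokens the penalties consider
        repeat: 1,         // 1 = repetition penalty disabled
        freq: 0,           // frequency penalty disabled
        presence: 0        // presence penalty disabled
    );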

LLama.Web/Common/ModelOptions.cs

Lines changed: 4 additions & 1 deletion
@@ -24,7 +24,7 @@ public class ModelOptions
     public int MainGpu { get; set; } = 0;
 
     /// <inheritdoc />
-    public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+    public GPUSplitMode? SplitMode { get; set; }
 
     /// <inheritdoc />
     public int GpuLayerCount { get; set; } = 20;
@@ -59,6 +59,9 @@ public class ModelOptions
     /// <inheritdoc />
     public TensorSplitsCollection TensorSplits { get; set; } = new();
 
+    /// <inheritdoc />
+    public bool CheckTensors { get; }
+
     /// <inheritdoc />
     public List<MetadataOverride> MetadataOverrides { get; } = new();

LLama/Abstractions/IModelParams.cs

Lines changed: 6 additions & 1 deletion
@@ -36,7 +36,7 @@ public interface IModelParams
     /// <summary>
     /// How to split the model across multiple GPUs
     /// </summary>
-    GPUSplitMode SplitMode { get; }
+    GPUSplitMode? SplitMode { get; }
 
     /// <summary>
     /// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
@@ -68,6 +68,11 @@ public interface IModelParams
     /// </summary>
     bool VocabOnly { get; }
 
+    /// <summary>
+    /// Validate model tensor data before loading
+    /// </summary>
+    bool CheckTensors { get; }
+
     /// <summary>
     /// Override specific metadata items in the model
     /// </summary>

LLama/Common/ModelParams.cs

Lines changed: 4 additions & 1 deletion
@@ -19,7 +19,7 @@ public record ModelParams
     public int MainGpu { get; set; } = 0;
 
     /// <inheritdoc />
-    public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+    public GPUSplitMode? SplitMode { get; set; }
 
     /// <inheritdoc />
     public int GpuLayerCount { get; set; } = 20;
@@ -54,6 +54,9 @@ public record ModelParams
     /// <inheritdoc />
     public TensorSplitsCollection TensorSplits { get; set; } = new();
 
+    /// <inheritdoc />
+    public bool CheckTensors { get; }
+
     /// <inheritdoc />
     public List<MetadataOverride> MetadataOverrides { get; set; } = new();
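
The switch to a nullable SplitMode changes the default semantics: an unset value now defers to llama.cpp's own default instead of forcing GPUSplitMode.None. A usage sketch (the ModelParams path constructor is the one already used elsewhere in this commit):

    // Leaving SplitMode unset (null) lets the native library pick its default;
    // set it explicitly only to override that behaviour.
    var parameters = new ModelParams("path/to/model.gguf")
    {
        GpuLayerCount = 20,
        // SplitMode = GPUSplitMode.Row,  // uncomment to force a specific split mode
    };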

LLama/Extensions/IModelParamsExtensions.cs

Lines changed: 5 additions & 2 deletions
@@ -1,4 +1,4 @@
-using System.IO;
+using System.IO;
 using System;
 using System.Text;
 using LLama.Abstractions;
@@ -31,11 +31,14 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam
     result = LLamaModelParams.Default();
 
     result.main_gpu = @params.MainGpu;
-    result.split_mode = @params.SplitMode;
     result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
+    if (@params.SplitMode.HasValue)
+        result.split_mode = @params.SplitMode.Value;
+
     result.use_mlock = @params.UseMemoryLock;
     result.use_mmap = @params.UseMemorymap;
     result.vocab_only = @params.VocabOnly;
+    result.check_tensors = @params.CheckTensors;
 
     unsafe
     {
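
The HasValue guard is what makes the nullable SplitMode work: LLamaModelParams.Default() supplies llama.cpp's defaults, and the managed value only overwrites the native field when the caller actually set one. A self-contained sketch of the same pattern, with hypothetical names that only illustrate the shape:

    // Hypothetical types; this illustrates the guard pattern, not the real structs.
    struct NativeParams { public int SplitMode; }

    static NativeParams ToNative(int? managedSplitMode)
    {
        // Start from the native default (stand-in value here)...
        var native = new NativeParams { SplitMode = 0 };
        // ...and overwrite only when the managed side explicitly set a value.
        if (managedSplitMode.HasValue)
            native.SplitMode = managedSplitMode.Value;
        return native;
    }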

LLama/Extensions/LLamaExecutorExtensions.cs

Lines changed: 1 addition & 1 deletion
@@ -147,7 +147,7 @@ private string CreatePrompt(IList<ChatMessage> messages)
     PreventEOS = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PreventEOS), out bool eos) is true ? eos : s_defaultPipeline.PreventEOS,
     PenalizeNewline = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenalizeNewline), out bool pnl) is true ? pnl : s_defaultPipeline.PenalizeNewline,
     RepeatPenalty = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenalty), out float rp) is true ? rp : s_defaultPipeline.RepeatPenalty,
-    RepeatPenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.RepeatPenaltyCount,
+    PenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.PenaltyCount,
     Grammar = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.Grammar), out Grammar? g) is true ? g : s_defaultPipeline.Grammar,
     MinKeep = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinKeep), out int mk) is true ? mk : s_defaultPipeline.MinKeep,
     MinP = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinP), out float mp) is true ? mp : s_defaultPipeline.MinP,
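
Every property here follows the same lookup-with-fallback shape, so renaming RepeatPenaltyCount to PenaltyCount just swaps the key and the fallback. The shape itself, factored into a hypothetical helper (not part of the library; the real code inlines the pattern against its AdditionalProperties dictionary):

    using System.Collections.Generic;

    // Hypothetical helper: return the typed option if present, else the fallback.
    static T GetOrDefault<T>(IReadOnlyDictionary<string, object?>? props, string key, T fallback)
        => props != null && props.TryGetValue(key, out var value) && value is T typed
            ? typed
            : fallback;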

LLama/LLamaQuantizer.cs

Lines changed: 0 additions & 3 deletions
@@ -106,9 +106,6 @@ private static bool ValidateFtype(LLamaFtype ftype)
         case LLamaFtype.MOSTLY_IQ3_S:
         case LLamaFtype.MOSTLY_IQ3_M:
 
-        case LLamaFtype.MOSTLY_Q4_0_4_4:
-        case LLamaFtype.MOSTLY_Q4_0_4_8:
-        case LLamaFtype.MOSTLY_Q4_0_8_8:
             return true;
 
         case LLamaFtype.GUESSED:
