Commit e229935

add TailFreeSampling_Z, add comment about currently unsupported ollama settings
1 parent 8d351a2 commit e229935

3 files changed: +24 -25 lines changed

llms/+internal/callOllamaChatAPI.m

Lines changed: 2 additions & 0 deletions

@@ -44,6 +44,7 @@
     nvp.Temperature = 1
     nvp.TopProbabilityMass = 1
     nvp.TopProbabilityNum = Inf
+    nvp.TailFreeSamplingZ = 1
     nvp.NumCompletions = 1
     nvp.StopSequences = []
     nvp.MaxNumTokens = inf
@@ -116,6 +117,7 @@
     dict("Temperature") = "temperature";
     dict("TopProbabilityMass") = "top_p";
     dict("TopProbabilityNum") = "top_k";
+    dict("TailFreeSamplingZ") = "tfs_z";
     dict("NumCompletions") = "n";
     dict("StopSequences") = "stop";
     dict("MaxNumTokens") = "num_predict";

ollamaChat.m

Lines changed: 12 additions & 25 deletions

@@ -31,13 +31,8 @@
     % ResponseFormat - The format of response the model returns.
     %                  "text" (default) | "json"
     %
-    % Mirostat - 0/1/2, eta, tau
-    %
-    % RepeatLastN - find a better name! “Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)”
-    %
-    % RepeatPenalty
-    %
-    % TailFreeSamplingZ
+    % TailFreeSamplingZ - Reduce the use of less probable tokens, based on
+    %                     the second-order differences of ordered probabilities.
     %
     % StreamFun - Function to callback when streaming the
     %             result
@@ -50,29 +45,22 @@
     % ollamaChat - Chat completion API from Ollama.
     % generate - Generate a response using the ollamaChat instance.
     %
-    % ollamaChat Properties:
+    % ollamaChat Properties, in addition to the name-value pairs above:
     % Model - Model name (as expected by ollama server)
     %
-    % Temperature - Temperature of generation.
-    %
-    % TopProbabilityMass - Top probability mass to consider for generation (top-p sampling).
-    %
-    % TopProbabilityNum - Only consider the k most likely tokens for generation (top-k sampling).
-    %
-    % StopSequences - Sequences to stop the generation of tokens.
-    %
     % SystemPrompt - System prompt.
-    %
-    % ResponseFormat - Specifies the response format, text or json
-    %
-    % TimeOut - Connection Timeout in seconds (default: 120 secs)
-    %
+
+    % Ollama model properties not exposed:
+    % repeat_last_n, repeat_penalty - could not find an example where they made a difference
+    % mirostat, mirostat_eta, mirostat_tau - looking for the best API design
+

    % Copyright 2024 The MathWorks, Inc.

     properties
         Model (1,1) string
         TopProbabilityNum (1,1) {mustBeReal,mustBePositive} = Inf
+        TailFreeSamplingZ (1,1) {mustBeReal} = 1
     end

     methods
@@ -86,6 +74,7 @@
             nvp.StopSequences {llms.utils.mustBeValidStop} = {}
             nvp.ResponseFormat (1,1) string {mustBeMember(nvp.ResponseFormat,["text","json"])} = "text"
             nvp.TimeOut (1,1) {mustBeReal,mustBePositive} = 120
+            nvp.TailFreeSamplingZ (1,1) {mustBeReal} = 1
             nvp.StreamFun (1,1) {mustBeA(nvp.StreamFun,'function_handle')}
         end

@@ -107,6 +96,7 @@
             this.Temperature = nvp.Temperature;
             this.TopProbabilityMass = nvp.TopProbabilityMass;
             this.TopProbabilityNum = nvp.TopProbabilityNum;
+            this.TailFreeSamplingZ = nvp.TailFreeSamplingZ;
             this.StopSequences = nvp.StopSequences;
             this.TimeOut = nvp.TimeOut;
         end
@@ -131,10 +121,6 @@
             %
             % Seed - An integer value to use to obtain
             % reproducible responses
-            %
-            % Currently, GPT-4 Turbo with vision does not support the message.name
-            % parameter, functions/tools, response_format parameter, stop
-            % sequences, and max_tokens

             arguments
                 this (1,1) ollamaChat
@@ -158,6 +144,7 @@
                 this.Model, messagesStruct, ...
                 Temperature=this.Temperature, ...
                 TopProbabilityMass=this.TopProbabilityMass, TopProbabilityNum=this.TopProbabilityNum,...
+                TailFreeSamplingZ=this.TailFreeSamplingZ,...
                 NumCompletions=nvp.NumCompletions,...
                 StopSequences=this.StopSequences, MaxNumTokens=nvp.MaxNumTokens, ...
                 ResponseFormat=this.ResponseFormat,Seed=nvp.Seed, ...
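Taken together, the ollamaChat.m changes expose the new setting end to end: it can be set at construction, is stored as a property, and is forwarded on every generate call. A minimal usage sketch (assumes a running Ollama server with the "mistral" model pulled, as in the tests below):

    % Hypothetical usage of the new name-value pair; z < 1 trims the
    % low-probability tail, z = 1 (the default) leaves sampling unchanged.
    chat = ollamaChat("mistral", TailFreeSamplingZ=0.95);
    response = generate(chat, "Why is the sky blue?");
    disp(response)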

tests/tollamaChat.m

Lines changed: 10 additions & 0 deletions

@@ -48,6 +48,16 @@ function extremeTopK(testCase)
             testCase.verifyEqual(response1,response2);
         end

+        function extremeTfsZ(testCase)
+            % setting tfs_z to z=0 leaves no random choice,
+            % so we expect to get a fixed response.
+            chat = ollamaChat("mistral",TailFreeSamplingZ=0);
+            prompt = "Sampling with tfs_z=0 returns a definite answer.";
+            response1 = generate(chat,prompt);
+            response2 = generate(chat,prompt);
+            testCase.verifyEqual(response1,response2);
+        end
+
         function stopSequences(testCase)
             chat = ollamaChat("mistral",TopProbabilityNum=1);
             prompt = "Top-k sampling with k=1 returns a definite answer.";
