style: apply black and isort formatting

AirRunner · AirRunner · commit fb12ea6073a7 · 2026-03-13T01:05:55.000+01:00
diff --git a/mlx_lm/generate.py b/mlx_lm/generate.py
@@ -3,10 +3,10 @@
 import argparse
 import contextlib
 import functools
-import warnings
 import json
 import sys
 import time
+import warnings
 from dataclasses import dataclass
 from functools import partial
 from typing import (
@@ -710,7 +710,9 @@ def _process_and_sample(tokens, logits):
     def _step_backbone(y, n_predict=1, n_confirmed=0):
         """Run the backbone on ``y`` and return (tokens, logprobs, hidden)."""
         with mx.stream(generation_stream):
-            logits, hidden = model(y[None], cache=model_cache, return_hidden=True, n_confirmed=n_confirmed)
+            logits, hidden = model(
+                y[None], cache=model_cache, return_hidden=True, n_confirmed=n_confirmed
+            )
             logits = logits[:, -n_predict:, :]
             quantize_cache_fn(model_cache)
             nonlocal prev_tokens
@@ -778,11 +780,13 @@ def _prefill(y):
                 y_with_draft = mx.concatenate(
                     [y, mx.array([draft_tok.item()], mx.uint32)]
                 )
-                toks, lps, hidden = _step_backbone(y_with_draft, n_predict=2, n_confirmed=1)
+                toks, lps, hidden = _step_backbone(
+                    y_with_draft, n_predict=2, n_confirmed=1
+                )
                 mx.eval(toks, draft_tok)
 
-                verify_pred = toks[0]   # backbone prediction after y → verify draft
-                bonus_tok = toks[1]     # backbone prediction after draft_tok
+                verify_pred = toks[0]  # backbone prediction after y → verify draft
+                bonus_tok = toks[1]  # backbone prediction after draft_tok
                 verify_lp = lps[0]
                 bonus_lp = lps[1]
 
@@ -812,7 +816,10 @@ def _prefill(y):
                     # by GatedDeltaNet after the confirmed token.
                     # Attention layers (KVCache): trim the draft-token entry.
                     for c in model_cache:
-                        if hasattr(c, "rollback_state") and c.rollback_state is not None:
+                        if (
+                            hasattr(c, "rollback_state")
+                            and c.rollback_state is not None
+                        ):
                             conv_snap, ssm_snap = c.rollback_state
                             c[0] = conv_snap
                             c[1] = ssm_snap
diff --git a/mlx_lm/models/qwen3_5.py b/mlx_lm/models/qwen3_5.py
@@ -159,8 +159,15 @@ def _process_chunk(
         k = inv_scale * mx.fast.rms_norm(k, None, 1e-6)
 
         out, new_ssm_state = gated_delta_update(
-            q, k, v, a_chunk, b_chunk,
-            self.A_log, self.dt_bias, ssm_state, ssm_mask,
+            q,
+            k,
+            v,
+            a_chunk,
+            b_chunk,
+            self.A_log,
+            self.dt_bias,
+            ssm_state,
+            ssm_mask,
             use_kernel=not self.training,
         )
         return out, new_conv_state, new_ssm_state
@@ -185,7 +192,9 @@ def __call__(
         conv_state = (
             cache[0]
             if cache is not None and cache[0] is not None
-            else mx.zeros((B, self.conv_kernel_size - 1, self.conv_dim), dtype=inputs.dtype)
+            else mx.zeros(
+                (B, self.conv_kernel_size - 1, self.conv_dim), dtype=inputs.dtype
+            )
         )
         ssm_state = cache[1] if cache else None
 
@@ -198,18 +207,28 @@ def __call__(
             mask_c = mask[:, :n_confirmed] if mask is not None else None
             mask_d = mask[:, n_confirmed:] if mask is not None else None
             out_c, conv_c, ssm_c = self._process_chunk(
-                qkv[:, :n_confirmed], a[:, :n_confirmed], b[:, :n_confirmed],
-                conv_state, ssm_state, mask_c,
+                qkv[:, :n_confirmed],
+                a[:, :n_confirmed],
+                b[:, :n_confirmed],
+                conv_state,
+                ssm_state,
+                mask_c,
             )
             if cache is not None:
                 cache.rollback_state = (conv_c, ssm_c)
             out_d, conv_f, ssm_f = self._process_chunk(
-                qkv[:, n_confirmed:], a[:, n_confirmed:], b[:, n_confirmed:],
-                conv_c, ssm_c, mask_d,
+                qkv[:, n_confirmed:],
+                a[:, n_confirmed:],
+                b[:, n_confirmed:],
+                conv_c,
+                ssm_c,
+                mask_d,
             )
             out = mx.concatenate([out_c, out_d], axis=1)
         else:
-            out, conv_f, ssm_f = self._process_chunk(qkv, a, b, conv_state, ssm_state, mask)
+            out, conv_f, ssm_f = self._process_chunk(
+                qkv, a, b, conv_state, ssm_state, mask
+            )
 
         if cache is not None:
             cache[0] = conv_f
@@ -251,7 +270,9 @@ def __call__(
         n_confirmed: int = 0,
     ) -> mx.array:
         if self.is_linear:
-            r = self.linear_attn(self.input_layernorm(x), mask, cache, n_confirmed=n_confirmed)
+            r = self.linear_attn(
+                self.input_layernorm(x), mask, cache, n_confirmed=n_confirmed
+            )
         else:
             r = self.self_attn(self.input_layernorm(x), mask, cache)
         h = x + r
@@ -266,7 +287,9 @@ def __init__(self, args: TextModelArgs):
         super().__init__()
         self.self_attn = Attention(args)
         self.input_layernorm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
-        self.post_attention_layernorm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
+        self.post_attention_layernorm = nn.RMSNorm(
+            args.hidden_size, eps=args.rms_norm_eps
+        )
         if args.num_experts > 0:
             self.mlp = SparseMoeBlock(args)
         else:
@@ -295,9 +318,7 @@ def __init__(self, args: TextModelArgs):
         self.pre_fc_norm_hidden = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
         self.pre_fc_norm_embedding = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
         self.fc = nn.Linear(args.hidden_size * 2, args.hidden_size, bias=False)
-        self.layers = [
-            MTPDecoderLayer(args) for _ in range(args.mtp_num_hidden_layers)
-        ]
+        self.layers = [MTPDecoderLayer(args) for _ in range(args.mtp_num_hidden_layers)]
         self.norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
 
     def __call__(
@@ -355,7 +376,11 @@ def __call__(
 
         for layer, c in zip(self.layers, cache):
             mask = ssm_mask if layer.is_linear else fa_mask
-            kw = {"n_confirmed": n_confirmed} if layer.is_linear and n_confirmed > 0 else {}
+            kw = (
+                {"n_confirmed": n_confirmed}
+                if layer.is_linear and n_confirmed > 0
+                else {}
+            )
             hidden_states = layer(hidden_states, mask=mask, cache=c, **kw)
 
         return hidden_states
@@ -380,7 +405,9 @@ def __call__(
         return_hidden: bool = False,
         n_confirmed: int = 0,
     ) -> mx.array:
-        hidden = self.model(inputs, cache, input_embeddings=input_embeddings, n_confirmed=n_confirmed)
+        hidden = self.model(
+            inputs, cache, input_embeddings=input_embeddings, n_confirmed=n_confirmed
+        )
         normed = self.model.norm(hidden)
         if self.args.tie_word_embeddings:
             out = self.model.embed_tokens.as_linear(normed)
diff --git a/tests/test_mtp.py b/tests/test_mtp.py
@@ -2,8 +2,9 @@
 import unittest
 
 import mlx.core as mx
-from mlx_lm.models.cache import make_prompt_cache
+
 from mlx_lm.generate import generate_step, mtp_generate_step
+from mlx_lm.models.cache import make_prompt_cache
 
 
 def _make_qwen3_5_mtp_model():