Lightning-AI
diff --git a/‎.azure/gpu-test.yml
Lines changed: 31 additions & 28 deletions b/‎.azure/gpu-test.yml
Lines changed: 31 additions & 28 deletions
diff --git a/‎.devcontainer/Dockerfile
Lines changed: 9 additions & 0 deletions b/‎.devcontainer/Dockerfile
Lines changed: 9 additions & 0 deletions
diff --git a/‎.devcontainer/devcontainer.json
Lines changed: 105 additions & 0 deletions b/‎.devcontainer/devcontainer.json
Lines changed: 105 additions & 0 deletions
diff --git a/‎.github/workflows/cpu-tests.yml
Lines changed: 22 additions & 6 deletions b/‎.github/workflows/cpu-tests.yml
Lines changed: 22 additions & 6 deletions
diff --git a/‎README.md
Lines changed: 6 additions & 2 deletions b/‎README.md
Lines changed: 6 additions & 2 deletions
diff --git a/‎extensions/thunder/pretrain.py
Lines changed: 9 additions & 2 deletions b/‎extensions/thunder/pretrain.py
Lines changed: 9 additions & 2 deletions
diff --git a/‎extensions/xla/generate/adapter.py
Lines changed: 5 additions & 2 deletions b/‎extensions/xla/generate/adapter.py
Lines changed: 5 additions & 2 deletions
@@ -17,10 +17,10 @@ jobs:
     strategy:
       matrix:
         "ordinary":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
+          #image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.7-cuda12.6.3"
           dependency: ""
         "w. Thunder":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
+          #image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.7-cuda12.6.3"
           dependency: "compiler"
     variables:
       DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
@@ -29,8 +29,14 @@ jobs:
       HF_HOME: "/var/tmp/hf/home"
       HF_HUB_CACHE: "/var/tmp/hf/hub"
       CI: "true"
+      PYTHON_VERSION: "3.10"
+      CUDA_VERSION: "12.6.3"
+      TORCH_VERSION: "2.7.0"
+      CUDNN_FRONTEND_VERSION: "1.10.0"
     container:
-      image: $(image)
+      # image: "pytorchlightning/pytorch_lightning:base-cuda-py$(PYTHON_VERSION)-torch$(TORCH_VERSION)-cuda$(CUDA_VERSION)"
+      # pytorchlightning/lightning-thunder:ubuntu22.04-cuda12.1.1-cudnn-fe1.5.0-py3.10-pt_main-dev
+      image: "pytorchlightning/lightning-thunder:ubuntu24.04-cuda$(CUDA_VERSION)-cudnn-fe$(CUDNN_FRONTEND_VERSION)-py$(PYTHON_VERSION)-pt_$(TORCH_VERSION)-dev"
       options: "--gpus=all --shm-size=8gb -v /var/tmp:/var/tmp"
     workspace:
       clean: all
@@ -55,54 +61,51 @@ jobs:
 
       - script: |
           pip install --upgrade pip
-          pip install '.[extra,test]' -U
-        displayName: "Install dependencies"
+          pip install '.[extra,test]' cffi -U
+        displayName: "Install package & dependencies"
 
       - script: |
           set -e
           pip uninstall -y torchvision torchaudio
-          pip install --pre 'nvfuser-cu121[torch]' --extra-index-url https://pypi.nvidia.com
           pip install '.[compiler]'
           python -c "from thunder.executors import nvfuser_available ; assert nvfuser_available(), 'nvFuser is missing!'"
           python -c "from thunder.executors.triton_utils import triton_version ; assert triton_version() is not None, 'triton is missing!'"
         condition: eq(variables['dependency'], 'compiler')
-        displayName: "Install nvFuser & Thunder"
+        displayName: "Install `compiler` [nvFuser & Thunder]"
 
       - bash: |
           set -e
           pip list
           python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
+          python -c "from torch import __version__ as ver ; assert str(ver).split('+')[0] == '$(TORCH_VERSION)', f'PyTorch: installed {ver} but expected $(TORCH_VERSION)'"
         displayName: "Env details"
 
-      - bash: |
-          pytest -v \
-            --ignore-glob="tests/test_thunder*" \
-            --ignore="tests/test_unsloth_executor.py"
-        displayName: "Ordinary tests"
-        condition: ne(variables['dependency'], 'compiler')
-        timeoutInMinutes: "5"
+      - bash: pytest -v
+        displayName: "All tests"
+        #condition: eq(variables['dependency'], 'compiler')
+        timeoutInMinutes: "15"
 
       - bash: |
-          # install thunder from source, so that, thunder.tests will be available
-          pip install -U "thunder[test] @ git+https://github.com/Lightning-AI/lightning-thunder.git"
-          PL_RUN_CUDA_TESTS=0 pytest tests/ext_thunder/test_thunder_networks.py -v # without env var, it filters out all tests
-        displayName: "Extra tests w. Thunder [main branch]"
-        condition: eq(variables['dependency'], 'compiler')
+          wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
+          bash run_standalone_tests.sh "tests"
+        displayName: "Standalone tests"
         env:
-          PL_RUN_CUDA_TESTS: "0"
+          PL_RUN_STANDALONE_TESTS: "1"
+          # NUM_PARALLEL_TESTS: "10"
         timeoutInMinutes: "10"
 
       - bash: |
-          pytest -v
-        displayName: "All tests"
+          pip uninstall -y lightning-thunder
+          # install Thunder from source so that thunder.tests is available
+          pip install -U "lightning-thunder[test] @ git+https://github.com/Lightning-AI/lightning-thunder.git"
+        displayName: "Re-install Thunder [main branch]"
         condition: eq(variables['dependency'], 'compiler')
-        timeoutInMinutes: "5"
 
       - bash: |
-          wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
-          bash run_standalone_tests.sh "tests"
-        displayName: "Standalone tests"
+          # without env var, it filters out all tests
+          PL_RUN_CUDA_TESTS=0 pytest tests/ext_thunder/test_thunder_networks.py -v
+        displayName: "Extra tests for Thunder [main branch]"
+        condition: eq(variables['dependency'], 'compiler')
         env:
-          PL_RUN_STANDALONE_TESTS: "1"
-          # NUM_PARALLEL_TESTS: "10"
+          TORCHDYNAMO_VERBOSE: "1"
         timeoutInMinutes: "10"
@@ -0,0 +1,9 @@
+# See here for image contents: https://github.com/devcontainers/images/blob/main/src/python/.devcontainer/Dockerfile
+
+# [Choice] Python version (use -bookworm or -bullseye variants on local arm64/Apple Silicon): 3, 3.12, 3.11, 3.10, 3.9, 3.8, 3-bookworm, 3.12-bookworm, 3.11-bookworm, 3.10-bookworm, 3.9-bookworm, 3.8-bookworm, 3-bullseye, 3.12-bullseye, 3.11-bullseye, 3.10-bullseye, 3.9-bullseye, 3.8-bullseye, 3-buster, 3.12-buster, 3.11-buster, 3.10-buster, 3.9-buster, 3.8-buster
+ARG VARIANT=3-bookworm
+FROM mcr.microsoft.com/devcontainers/python:1-${VARIANT}
+
+# Temporary: Upgrade python packages due to https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-40897
+# They are installed by the base image (python) which does not have the patch.
+RUN python3 -m pip install --upgrade pip setuptools
@@ -0,0 +1,105 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
+// https://github.com/microsoft/vscode-dev-containers/tree/v0.194.0/containers/python-3
+{
+  "name": "Python 3 (litgpt)",
+  "build": {
+    "dockerfile": "Dockerfile",
+    "context": "..",
+    "args": {
+      "VARIANT": "3.11-bookworm"
+    }
+  },
+  "runArgs": [
+    // Enable GPU passthrough, requires WSL2 on Windows
+    //"--gpus=all",
+    // One of the following options is required for torch multiprocessing
+    //"--ipc=host",
+    //"--shm-size=4gb",
+  ],
+  // Features to add to the dev container. More info: https://containers.dev/features.
+  "features": {
+    "ghcr.io/devcontainers/features/git:1": {},
+    "ghcr.io/devcontainers/features/git-lfs:1": {},
+    //"ghcr.io/devcontainers/features/nvidia-cuda:1": {},
+    "ghcr.io/devcontainers-extra/features/actionlint:1": {},
+    "ghcr.io/devcontainers-extra/features/pre-commit:2": {},
+    "ghcr.io/dhoeric/features/act:1": {},
+    "ghcr.io/devcontainers/features/docker-in-docker:2": {
+      "version": "latest",
+      "moby": true
+    }
+  },
+  // Set *default* container specific settings.json values on container create.
+  "customizations": {
+    "vscode": {
+      "settings": {
+        "editor.tabSize": 4,
+        "editor.renderWhitespace": "all",
+        "editor.formatOnSave": true,
+        "editor.rulers": [120],
+        "files.exclude": {
+          "**/__pycache__": true
+        },
+        "python.pythonPath": "/usr/local/bin/python",
+        "python.defaultInterpreterPath": "/usr/local/bin/python",
+        "python.languageServer": "Pylance",
+        "python.analysis.autoImportCompletions": true,
+        "python.analysis.completeFunctionParens": true,
+        "python.analysis.autoSearchPaths": true,
+        "python.testing.pytestArgs": ["tests"],
+        "python.testing.unittestEnabled": false,
+        "python.testing.pytestEnabled": true,
+        "code-eol.highlightNonDefault": true,
+        "code-eol.highlightExtraWhitespace": true,
+        "autoDocstring.docstringFormat": "google-notypes",
+        "autoDocstring.guessTypes": true,
+        "autoDocstring.generateDocstringOnEnter": true,
+        "autoDocstring.startOnNewLine": true,
+        "telemetry.telemetryLevel": "off",
+        "[python]": {
+          "editor.formatOnSave": true,
+          "editor.defaultFormatter": "charliermarsh.ruff",
+          "editor.codeActionsOnSave": {
+            "source.organizeImports": "always",
+            "source.fixAll": "always"
+          }
+        }
+      },
+      // Add the IDs of extensions you want installed when the container is created.
+      "extensions": [
+        "ms-python.python",
+        "ms-python.vscode-pylance",
+        "ms-toolsai.jupyter",
+        "GitHub.copilot",
+        "GitHub.copilot-chat",
+        "github.vscode-github-actions",
+        "SanjulaGanepola.github-local-actions",
+        "charliermarsh.ruff",
+        "esbenp.prettier-vscode",
+        "ms-vscode.test-adapter-converter",
+        "njqdev.vscode-python-typehint",
+        "KevinRose.vsc-python-indent",
+        "medo64.render-crlf",
+        "shardulm94.trailing-spaces",
+        "nhoizey.gremlins",
+        "wayou.vscode-todo-highlight",
+        "Gruntfuggly.todo-tree",
+        "njpwerner.autodocstring",
+        "rodolphebarbanneau.python-docstring-highlighter",
+        "mechatroner.rainbow-csv",
+        "uctakeoff.vscode-counter",
+        "bierner.github-markdown-preview",
+        "yahyabatulu.vscode-markdown-alert",
+        "ms-vscode-remote.vscode-remote-extensionpack",
+        "ms-azuretools.vscode-docker",
+        "redhat.vscode-yaml"
+      ]
+    }
+  },
+  // Use 'forwardPorts' to make a list of ports inside the container available locally.
+  // "forwardPorts": [],
+  // Use 'postCreateCommand' to run commands after the container is created.
+  "postCreateCommand": "pre-commit install && pip install '.[extra,compiler,test]' -U",
+  // Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
+  "remoteUser": "vscode"
+}
@@ -29,13 +29,15 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: ["ubuntu-22.04", "macOS-14", "windows-2022"]
+        os: ["ubuntu-22.04", "ubuntu-24.04", "macOS-14", "windows-2022"]
         python-version: ["3.10"]
     timeout-minutes: 10
     steps:
-      - uses: actions/checkout@v4
+      - name: Checkout generic
+        uses: actions/checkout@v4
         if: github.event_name != 'pull_request_target'
-      - uses: actions/checkout@v4
+      - name: Checkout for `pull_request_target`
+        uses: actions/checkout@v4
         if: github.event_name == 'pull_request_target'
         with:
           ref: ${{ github.event.pull_request.head.sha }}
@@ -60,6 +62,13 @@ jobs:
           python -c "$modules"
 
   pytester:
+    # skip the PR trigger when secrets are not shared, as is the case for all forked PRs
+    if: |
+      github.event_name != 'pull_request' ||
+      (
+        github.event_name == 'pull_request' &&
+        contains('OWNER,MEMBER,COLLABORATOR', github.event.pull_request.author_association)
+      )
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
@@ -71,9 +80,11 @@ jobs:
           - { os: "windows-2022", python-version: "3.9" }
     timeout-minutes: 25
     steps:
-      - uses: actions/checkout@v4
+      - name: Checkout generic
+        uses: actions/checkout@v4
         if: github.event_name != 'pull_request_target'
-      - uses: actions/checkout@v4
+      - name: Checkout for `pull_request_target`
+        uses: actions/checkout@v4
         if: github.event_name == 'pull_request_target'
         with:
           ref: ${{ github.event.pull_request.head.sha }}
@@ -113,7 +124,12 @@ jobs:
   testing-guardian:
     runs-on: ubuntu-latest
     needs: [pytester, testing-imports]
-    if: always()
+    if: |
+      github.event_name == 'pull_request_target' ||
+      (
+        github.event_name == 'pull_request' &&
+        contains('OWNER,MEMBER,COLLABORATOR', github.event.pull_request.author_association)
+      )
     steps:
       - run: echo "${{ needs.pytester.result }}"
       - name: failing...
 
@@ -8,7 +8,7 @@
 <pre>
 ✅ From scratch implementations      ✅ No abstractions         ✅ Beginner friendly
    ✅ Flash attention                   ✅ FSDP                    ✅ LoRA, QLoRA, Adapter
-✅ Reduce GPU memory (fp4/8/16/32)   ✅ 1-1000+ GPUs/TPUs       ✅ 20+ LLMs
+✅ Reduce GPU memory (fp4/8/16/32)   ✅ 1-1000+ GPUs/TPUs       ✅ 20+ LLMs         
 </pre>
 
 
@@ -53,7 +53,7 @@ Every LLM is implemented from scratch with **no abstractions** and **full contro
 # Quick start
 Install LitGPT
 ```
-pip install 'litgpt[all]'
+pip install 'litgpt[extra]'
 ```
 
 Load and use any of the [20+ LLMs](#choose-from-20-llms):
@@ -139,13 +139,17 @@ Every model is written from scratch to maximize performance and remove layers of
 | Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219)                                                                            |
 | Phi 4 | 14B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2412.08905)                                                                            |
 | Phi 4 Mini Instruct | 3.8B | Microsoft Research | [Microsoft 2025](https://arxiv.org/abs/2503.01743)                                           |
+| Phi 4 Mini Reasoning | 3.8B | Microsoft Research | [Xu, Peng et al. 2025](https://arxiv.org/abs/2504.21233)                                           |
+| Phi 4 Reasoning | 14B | Microsoft Research | [Abdin et al. 2025](https://arxiv.org/abs/2504.21318)                                           |
+| Phi 4 Reasoning Plus | 14B | Microsoft Research | [Abdin et al. 2025](https://arxiv.org/abs/2504.21318)                                           |
 | Platypus | 7B, 13B, 70B |  Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317)                                                               |
 | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373)                                            |
 | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/)                                               |
 | Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 2024](https://arxiv.org/abs/2409.12186)                                          |
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122)                                          |
 | QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/)                                                                         |
 | QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/)                                                                         |
+| Qwen3 | 0.6B, 1.7B, 4B, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/)                                                                         |
 | R1 Distill Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf)                                                                                 |
 | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm)                                                               |
 | Salamandra | 2B, 7B | Barcelona Supercomputing Centre | [BSC-LTC 2024](https://github.com/BSC-LTC/salamandra)                                                                         |
 
@@ -5,6 +5,7 @@
 import pprint
 import sys
 import time
+from dataclasses import asdict
 from datetime import timedelta
 from functools import partial
 from pathlib import Path
@@ -20,7 +21,7 @@
 from typing_extensions import Literal
 
 from litgpt import Tokenizer
-from litgpt.args import EvalArgs, TrainArgs
+from litgpt.args import EvalArgs, LogArgs, TrainArgs
 from litgpt.data import DataModule, TinyLlama
 from litgpt.model import GPT, Block, CausalSelfAttention, Config, LLaMAMLP
 from litgpt.utils import (
@@ -70,6 +71,7 @@ def setup(
         tie_embeddings=False,
     ),
     eval: EvalArgs = EvalArgs(interval=1000, max_iters=100),
+    log: LogArgs = LogArgs(),
     optimizer: Union[str, Dict] = "AdamW",
     devices: Union[int, str] = "auto",
     num_nodes: int = 1,
@@ -121,7 +123,12 @@ def setup(
     tokenizer = Tokenizer(tokenizer_dir) if tokenizer_dir is not None else None
 
     logger = choose_logger(
-        logger_name, out_dir, name=f"pretrain-{config.name}", resume=bool(resume), log_interval=train.log_interval
+        logger_name,
+        out_dir,
+        name=f"pretrain-{config.name}",
+        resume=bool(resume),
+        log_interval=train.log_interval,
+        log_args=asdict(log),
     )
 
     if devices * num_nodes > 1:
 
@@ -26,6 +26,7 @@ def setup(
     prompt: str = "What food do llamas eat?",
     *,
     input: str = "",
+    sys_prompt: Optional[str] = None,
     adapter_path: Path = Path("out/adapter/alpaca/lit_model_adapter_finetuned.pth"),
     checkpoint_dir: Path = Path("checkpoints/tiiuae/falcon-7b"),
     max_new_tokens: int = 100,
@@ -40,6 +41,7 @@ def setup(
     Args:
         prompt: The prompt/instruction (Alpaca style).
         input: Optional input (Alpaca style).
+        sys_prompt: Optional system prompt.
         adapter_path: Path to the checkpoint with trained adapter weights, which are the output of
             `xla/finetune/adapter.py`.
         checkpoint_dir: The path to the checkpoint folder with pretrained model weights.
@@ -52,13 +54,14 @@ def setup(
     devices = XLAAccelerator.auto_device_count()
     strategy = XLAFSDPStrategy(auto_wrap_policy={Block}) if devices > 1 else "auto"
     fabric = L.Fabric(devices=devices, precision=precision, strategy=strategy)
-    fabric.launch(main, prompt, input, adapter_path, checkpoint_dir, max_new_tokens, top_k, temperature)
+    fabric.launch(main, prompt, input, sys_prompt, adapter_path, checkpoint_dir, max_new_tokens, top_k, temperature)
 
 
 def main(
     fabric: L.Fabric,
     prompt: str,
     input: str,
+    sys_prompt: Optional[str],
     adapter_path: Path,
     checkpoint_dir: Path,
     max_new_tokens: int,
@@ -90,7 +93,7 @@ def main(
     tokenizer = Tokenizer(checkpoint_dir)
     # TODO: Load prompt style from checkpoint and apply it here
     prompt_style = Alpaca()
-    prompt = prompt_style.apply(prompt, input=input)
+    prompt = prompt_style.apply(prompt, sys_prompt=sys_prompt, input=input)
     encoded = tokenizer.encode(prompt, device=fabric.device)
     prompt_length = encoded.size(0)
     max_returned_tokens = prompt_length + max_new_tokens