Draft
Changes from all commits
75 commits
09b47a7
llama3 starting point is at gpt-2 exact copy paste for both train/tes…
karpathy Sep 13, 2024
01bc4c6
first set of changes to match up the .py and the .cu version. default…
karpathy Sep 13, 2024
b883560
change the export code of Llama 3 to be very GPT-2 friendly, using a …
karpathy Sep 13, 2024
8866308
adapt the sizes of all the parameter tensors and load them from file.…
karpathy Sep 16, 2024
45026f6
make llama3cu phony
karpathy Sep 16, 2024
77e1d7a
add support for dataloader to serve uint32_t tokens, as necessary in …
karpathy Sep 16, 2024
72e6f1a
add new Encoder that does not use positional embeddings, like in llam…
karpathy Sep 16, 2024
234de31
introduce rmsnorm, unfused, forward
karpathy Sep 16, 2024
508c474
move debugging into fp32, so python has to write the fp32 version, an…
karpathy Sep 17, 2024
685617f
make fp32 path in .py code work correctly
karpathy Sep 17, 2024
56f956c
add repkv kernel to replicate K,V heads after the QKV projection
karpathy Sep 21, 2024
45401b4
DRAFT: Adding backward kernel for repkv
insop Sep 22, 2024
080e57f
CPU version tested
insop Sep 22, 2024
6c68657
Put cuda kernel caller placeholder
insop Sep 22, 2024
ad46043
WIP updating cuda kernel
insop Sep 22, 2024
42d09e8
minor clean up
insop Sep 22, 2024
fcc3466
Add minor change
insop Sep 22, 2024
de9c817
wip
insop Sep 24, 2024
76b40e4
integrate the repkv kernel with minor changes. use the bt4c buffer fo…
karpathy Sep 24, 2024
026e4ed
add RoPE PyTorch and C reference code
karpathy Sep 24, 2024
8336d2a
Merge remote-tracking branch 'upstream/llama3' into insop/llama3
insop Sep 25, 2024
2ebf8f6
Add rmsnorm fused kernel
gordicaleksa Sep 25, 2024
52c7254
add the finished RoPE forward pass
karpathy Sep 25, 2024
6538df6
Merge pull request #769 from gordicaleksa/fused_rmsnorm
karpathy Sep 25, 2024
bb3c92d
integrate the fused rmsnorm forward
karpathy Sep 25, 2024
1826752
add swigul yaygit add -u!
karpathy Sep 25, 2024
0731b39
forward pass matchesgit add train_llama3.cu train_llama3.py ! losses …
karpathy Sep 25, 2024
8874c2c
Merge remote-tracking branch 'upstream/llama3' into insop/llama3
insop Sep 25, 2024
3e5134d
Merge branch 'insop/llama3_wip' into insop/llama3
insop Sep 25, 2024
d1f2f64
Updated repkv_backward cuda kernel
insop Sep 26, 2024
31be5e7
add rmsnorm backward in dev/cuda, it seems to work surprisingly, and …
karpathy Sep 26, 2024
a2b66f1
Merge remote-tracking branch 'upstream/llama3' into insop/llama3
insop Sep 26, 2024
102067f
oops i think i accidentally forgot to include swiglu.cuh
karpathy Sep 26, 2024
2c4b3cc
integrate our rmsnorm backward and move the other rmsnorm functions i…
karpathy Sep 26, 2024
cbf53e3
Merge remote-tracking branch 'upstream/llama3' into insop/llama3
insop Sep 26, 2024
01c2895
Update RoPE naming
insop Sep 26, 2024
1b54612
i can backward through MLP block. Attention block is next
karpathy Sep 27, 2024
c8b348e
Merge pull request #764 from insop/insop/llama3
karpathy Sep 27, 2024
28e4a7f
small fixes, but still not too happy with this kernel, it wastes thre…
karpathy Sep 27, 2024
075e430
just pushing what i have. it's epsilon away from working sigh. basica…
karpathy Sep 27, 2024
8d49062
add backward kernel to dev/cuda for rope, to ensure correctness. but …
karpathy Sep 27, 2024
7d945e9
reshuffle repkv a bit, i wrote it from scratch. the kernel is still c…
karpathy Sep 27, 2024
e6481b6
fix bug with qkvr sizing, has to be 3*C. Credit to @ademeure for find…
karpathy Oct 1, 2024
9099a0a
ok the full backward now shows max abs diff of 3e-3, except for the e…
karpathy Oct 1, 2024
c746e06
take out debugging stuff. we can now run training loop for both model…
karpathy Oct 1, 2024
2602b46
BF16 opt state (m/v) with stochastic rounding, seems to work really w…
ademeure Oct 1, 2024
d808d78
Merge pull request #772 from ademeure/llama3_arun_new
karpathy Oct 1, 2024
2c5ced6
fix bug due to bf16 adamw mv
karpathy Oct 1, 2024
3745dac
define llama3.2 1B and 3B for export from python (will untie embeddin…
ngc92 Apr 13, 2025
4d7980c
renaming gpt2 -> llama3
ngc92 Apr 13, 2025
090341e
enable llama3 CI
ngc92 Apr 13, 2025
a94471c
use optimizer offloading when running in CI
ngc92 Apr 14, 2025
4983c46
fix: fully ignore biases
ngc92 Apr 13, 2025
6866623
fix: match pytorch learning rate in test file
ngc92 Apr 13, 2025
2c3fecc
fix: gradient checking
ngc92 Apr 13, 2025
24d9129
fix: ensure `freqs_cis` are not broken when calling `model.to(dtype)`…
ngc92 Apr 14, 2025
f8a43ce
fix: writing checkpoint
ngc92 Apr 14, 2025
5b92829
!! DROP THIS COMMIT !!
ngc92 Apr 13, 2025
9c52a95
fix: CPUOffloadOptimizer + gradient clipping is broken; we use an ine…
ngc92 Apr 14, 2025
49cef1d
Merge pull request #802 from ngc92/ngc92/llama3-dev
karpathy May 1, 2025
1c02d54
cudnn does not support fp32 -> remove this pointless test
ngc92 May 1, 2025
7b7d39c
include grad norm in logging
ngc92 May 2, 2025
d4347a7
ensure 32-bit master params in python training
ngc92 May 2, 2025
082d9fa
added missing stream argument for repkv_backward
ngc92 May 2, 2025
a860922
set stream for attention softmax
ngc92 May 4, 2025
f38eadc
allow reducing number of transformer blocks to make smaller models th…
ngc92 May 4, 2025
35e1ad6
enable storing the expected loss values in the state file, so we can …
ngc92 May 4, 2025
76a7cce
replace offload with smaller model
ngc92 May 4, 2025
d36f0e6
Merge pull request #811 from ngc92/llama-fixes
karpathy May 10, 2025
9c60616
fix out-of-bounds access in rmsnorm kernel
ngc92 Jun 26, 2025
ffcfe99
fix out-of-bounds access in rmsnorm kernel
ngc92 Jun 26, 2025
9688eef
enable tied embeddings
ngc92 Apr 14, 2025
9caeceb
command-line overwrite to forcibly untie embeddings for llama3.2 models
ngc92 Apr 14, 2025
5c17e4e
Merge pull request #821 from ngc92/out-of-bounds-bugfix
karpathy Jun 26, 2025
da88cb1
Merge pull request #803 from ngc92/ngc92/llama3-tied-weights
karpathy Jun 26, 2025
15 changes: 15 additions & 0 deletions .github/workflows/ci.yml
@@ -219,3 +219,18 @@ jobs:

      - name: Build project
        run: make -j4 -C dev/cuda

  build-llama3:
    runs-on: ubuntu-latest
    container:
      image: nvidia/cuda:12.4.1-devel-ubuntu22.04

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Build FP32
        run: PRECISION=FP32 make test_llama3cu train_llama3cu

      - name: Build BF16
        run: PRECISION=BF16 make test_llama3cu train_llama3cu
95 changes: 88 additions & 7 deletions .github/workflows/ci_gpu.yml
@@ -9,9 +9,10 @@ on:
  pull_request:
    branches:
      - master
      - llama3

jobs:
  build-and-test-gpu:
  build-and-test-gpt2:
    runs-on: ubicloud-gpu-standard-1-latest

    steps:
@@ -103,19 +104,98 @@ jobs:
          git clone https://github.com/NVIDIA/cudnn-frontend.git

      - name: Build with cuDNN
        run: USE_CUDNN=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu
        run: USE_CUDNN=1 make test_gpt2cu train_gpt2cu

      - name: Train model with cuDNN
        run: ./train_gpt2cu

      - name: Train model fp32 with cuDNN
        run: ./train_gpt2fp32cu

      - name: Execute testing program with cuDNN
        run: ./test_gpt2cu

      - name: Execute testing program fp32 with cuDNN
        run: ./test_gpt2fp32cu
  build-and-test-llama3:
    name: Build and test LLama3.2 1B
    runs-on: ubicloud-gpu-standard-1-latest
    env:
      HF_TOKEN: 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
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - run: echo "::add-mask::$HF_TOKEN"

      - name: Install OpenMP
        run: sudo apt-get update && sudo apt-get install -y libomp-dev

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Run preprocessing
        run: python dev/data/tinyshakespeare.py --model_desc llama-3

      - name: Train model
        # use the first 10 layers, so that everything fits into the 20GB of
        # the A4000 Ada that we have in CI
        run: python train_llama3.py --write_tensors 1 --dtype float32 --depth 10

      - name: Build FP32 precision
        run: PRECISION=FP32 make test_llama3cu

      - name: Run default
        run: ./test_llama3cu

      - name: Run no recompute GeLU
        run: ./test_llama3cu -r 0

      - name: Run recompute LN
        run: ./test_llama3cu -r 2

      - name: Build BF16 precision
        run: PRECISION=BF16 make train_llama3cu test_llama3cu

      - name: Run default (BF16)
        run: ./test_llama3cu

      - name: Run no recompute GeLU (BF16)
        run: ./test_llama3cu -r 0

      - name: Run no master weights (BF16)
        run: ./test_llama3cu -w 0

      - name: Run recompute LN (BF16)
        run: ./test_llama3cu -r 2

  build-and-test-llama3-untied:
    name: Build and test LLama3.2 1B with untie weights
    runs-on: ubicloud-gpu-standard-1-latest
    env:
      HF_TOKEN: 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
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - run: echo "::add-mask::$HF_TOKEN"

      - name: Install OpenMP
        run: sudo apt-get update && sudo apt-get install -y libomp-dev

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Run preprocessing
        run: python dev/data/tinyshakespeare.py --model_desc llama-3

      - name: Train model
        run: python train_llama3.py --write_tensors 1 --dtype float32 --untie 1 --depth 10

      - name: Build FP32 precision
        run: PRECISION=FP32 make test_llama3cu

      - name: Run default
        run: ./test_llama3cu

      - name: Build BF16 precision
        run: PRECISION=BF16 make train_llama3cu test_llama3cu

      - name: Run default
        run: ./test_llama3cu

  unit-tests-gpu:
    runs-on: ubicloud-gpu-standard-1-latest
@@ -126,3 +206,4 @@ jobs:

      - name: Test Device<->File IO
        run: cd dev/test && nvcc -o device_file_io device_file_io.cu && ./device_file_io

13 changes: 12 additions & 1 deletion Makefile
@@ -243,8 +243,13 @@ else
PFLAGS = -DENABLE_BF16
endif

# Optimizer precision settings, enable to allow BF16 for AdamW m/v state (also affects state file)
ifeq ($(OPTIMIZER_LOW_PRECISION), 1)
PFLAGS += -DOPTIMIZER_LOW_PRECISION
endif
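# (assumed usage, mirroring how PRECISION is already passed on the command line in CI:
#  OPTIMIZER_LOW_PRECISION=1 PRECISION=BF16 make train_llama3cu)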

# PHONY means these targets will always be executed
.PHONY: all train_gpt2 test_gpt2 train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu profile_gpt2cu
.PHONY: all train_llama3cu test_llama3cu train_gpt2 test_gpt2 train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu profile_gpt2cu

# Add targets
TARGETS = train_gpt2 test_gpt2
@@ -285,6 +290,12 @@ test_gpt2fp32cu: test_gpt2_fp32.cu
profile_gpt2cu: profile_gpt2.cu $(NVCC_CUDNN)
	$(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)

train_llama3cu: train_llama3.cu $(NVCC_CUDNN)
	$(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)

test_llama3cu: test_llama3.cu $(NVCC_CUDNN)
	$(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)

clean:
	$(REMOVE_FILES) $(TARGETS)
	$(REMOVE_BUILD_OBJECT_FILES)
13 changes: 13 additions & 0 deletions dev/cbridge/README.md
@@ -0,0 +1,13 @@
# cbridge

We'll use this directory for the PyTorch -> C bridge. We have some PyTorch code and we'd like to have the equivalent C implementation (which in turn usually serves as the reference for the CUDA kernels later).

For starters we have RoPE. E.g. generate the reference with PyTorch and then match it in C:

```bash
python rope.py
gcc -o rope rope.c -lm
./rope
```

The .py file writes a `rope.bin` file with the intermediate tensors.
101 changes: 101 additions & 0 deletions dev/cbridge/rmsnorm.py
@@ -0,0 +1,101 @@
"""
An RMSNorm PyTorch reference implementation.
This script then does forward/back and writes everything to file so we can
develop the CPU version, and eventually the GPU kernel as well.
"""

import math
import torch
import numpy as np
import torch.nn as nn
from torch.nn import functional as F

# -----------------------------------------------------------------------------

class RMSNorm(torch.nn.Module):
    def __init__(self, dim: int, eps: float):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        mean_sq = x.pow(2).mean(dim=-1, keepdim=True) + self.eps
        rstd = torch.rsqrt(mean_sq)
        norm = x * rstd
        return norm

    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight

def rmsnorm_backward(x, w, dout, eps):
    # recompute the rstd, norm (or we could cache it in the forward pass)
    mean_sq = x.pow(2).mean(dim=-1, keepdim=True) + eps # (B, T, 1)
    rstd = torch.rsqrt(mean_sq) # (B, T, 1)
    norm = x * rstd # (B, T, C)
    # gradients for weights
    dw = (dout * norm).sum((0, 1)) # (C)
    # gradients for input
    dnorm = dout * w # (B, T, C)
    dx = dnorm - norm * (dnorm * norm).mean(dim=-1, keepdim=True)
    dx *= rstd
    return dx, dw
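# Why the dx expression above is correct: with rstd = (mean(x^2) + eps)^(-1/2)
# and norm = x * rstd, we have d(rstd)/dx_j = -rstd^3 * x_j / C, so
#   dx_j = sum_i dnorm_i * (rstd * delta_ij + x_i * d(rstd)/dx_j)
#        = rstd * (dnorm_j - norm_j * mean(dnorm * norm))
# which is exactly the form computed in the last two lines of rmsnorm_backward.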

# -----------------------------------------------------------------------------

# seed the rng
torch.manual_seed(42)

B = 4
T = 64
C = 256
eps = 1e-5

inp = torch.randn(B, T, C, dtype=torch.float32)
inp.requires_grad = True

# rmsnorm
m = RMSNorm(C, eps=eps)
out = m(inp)

# loss can just be a weighted sum, with some fixed weights
wei = torch.randn_like(out, dtype=torch.float32)
loss = (out * wei).sum()
loss.backward()

# let's now do the backward pass manually
# backprop starts with the output gradient, which is exactly wei because of the loss function
dx, dw = rmsnorm_backward(inp, m.weight, wei, eps)
# let's assert that the gradients match
assert torch.allclose(dx, inp.grad, atol=1e-6)
assert torch.allclose(dw, m.weight.grad, atol=1e-6)
print("RMSNorm gradients match")
print("first 5 elements of dx comparison:")
print(dx.view(-1)[:5].tolist())
print(inp.grad.view(-1)[:5].tolist())
print("first 5 elements of dw comparison:")
print(dw.view(-1)[:5].tolist())
print(m.weight.grad.view(-1)[:5].tolist())
print("dx error:", (inp.grad.view(-1) - dx.view(-1)).abs().max().item())
print("dw error:", (m.weight.grad.view(-1) - dw.view(-1)).abs().max().item())

# save to .bin file so we can check correctness in C land
int_header = np.zeros(16, dtype=np.int32) # for ints
float_header = np.zeros(16, dtype=np.float32) # for floats
int_header[0] = 20240925 # magic number
int_header[1] = B
int_header[2] = T
int_header[3] = C
float_header[0] = eps

# write the hyperparameters, inputs, output, and input gradients to file
results_file = "rmsnorm.bin"
with open(results_file, "wb") as f:
    f.write(int_header.tobytes()) # 16 int32
    f.write(float_header.tobytes()) # 16 float32
    f.write(inp.detach().cpu().numpy().tobytes()) # B * T * C
    f.write(out.detach().cpu().numpy().tobytes()) # B * T * C
    f.write(wei.detach().cpu().numpy().tobytes()) # B * T * C
    f.write(inp.grad.detach().cpu().numpy().tobytes()) # B * T * C
    f.write(m.weight.grad.detach().cpu().numpy().tobytes()) # C
print("Saved results to %s" % results_file)