From 987139fad49867f92035ac01f929ba0c9ac5fc03 Mon Sep 17 00:00:00 2001
From: Krish Vaswani <krishvaswani@yahoo.com>
Date: Tue, 21 Oct 2025 14:06:15 +0530
Subject: [PATCH 1/2] Fix activation memory formula and drop KV cache from training memory

---
 llm_tools/utils/memory_utils.py | 68 +++++++++++++++++++++++++--------
 1 file changed, 52 insertions(+), 16 deletions(-)

diff --git a/llm_tools/utils/memory_utils.py b/llm_tools/utils/memory_utils.py
index 3d50a9b..f14959c 100644
--- a/llm_tools/utils/memory_utils.py
+++ b/llm_tools/utils/memory_utils.py
@@ -64,18 +64,57 @@ def get_kv_cache(
 
 @st.cache_data
 def get_activation_memory(
-    batch_size, sequence_length, hidden_size, num_attention_heads
+    batch_size, sequence_length, hidden_size, num_attention_heads, precision,
+    num_hidden_layers, mlp_multiple=4
 ):
-    """Calculate the memory required for activations."""
-    precision = "float32"
+    """
+    Calculate the memory required for activations. It references this paper:
+    https://proceedings.mlsys.org/paper_files/paper/2023/file/80083951326cf5b35e5100260d64ed81-Paper-mlsys2023.pdf
+
+    Let:
+    s = sequence length
+    b = batch size
+    h = hidden size
+    a = attention heads
+    L = number of transformer layers
+
+    Then, the number of activations for each step is:
+    Attention:
+    Q, K, V: sbhL
+    QK^T: 2sbhaL
+    Softmax: as^2bL
+    Softmax dropout: as^2bL
+    Attention over Values: as^2bL + sbhL
+
+    MLP:
+    I'm assuming this structure of the MLP:
+    Linear layer, Activation function, Linear layer
+    With the first linear layer transforming h dimensions into some higher
+    dimension mh and the second linear layer projecting it back.
+    Then, the memory required is:
+    1st Linear layer: sbhL
+    Activation: sbmhL
+    2nd Linear layer: sbmhL
+
+    Total = (3 + 2a + 2m)sbhL + 3as^2bL
+    """
+    
     try:
         return (
-            batch_size
-            * sequence_length
-            * hidden_size
-            * (34 + (5 * sequence_length * num_attention_heads) / hidden_size)
-            * DATA_TYPE_SIZES[precision]
-        )
+            (
+                (3 + 2 * num_attention_heads + 2 * mlp_multiple)
+                * sequence_length
+                * batch_size
+                * hidden_size
+                * num_hidden_layers
+            ) + (
+                3
+                * num_attention_heads
+                * sequence_length ** 2
+                * batch_size
+                * num_hidden_layers
+            )
+        ) * DATA_TYPE_SIZES[precision]
     except:
         return 0
 
@@ -115,7 +154,8 @@ def calculate_inference_memory(
         precision, batch_size, sequence_length, hidden_size, num_hidden_layers
     )
     activation_memory = get_activation_memory(
-        batch_size, sequence_length, hidden_size, num_attention_heads
+        batch_size, sequence_length, hidden_size, num_attention_heads, precision,
+        num_hidden_layers
     )
     return {
         "model_weights": get_memory(model_weights),
@@ -139,11 +179,9 @@ def calculate_training_memory(
 ):
     """Calculate the total memory required for training."""
     model_weights = get_model_weights(model_size, precision)
-    kv_cache = get_kv_cache(
-        precision, batch_size, sequence_length, hidden_size, num_hidden_layers
-    )
     activation_memory = get_activation_memory(
-        batch_size, sequence_length, hidden_size, num_attention_heads
+        batch_size, sequence_length, hidden_size, num_attention_heads, precision,
+        num_hidden_layers
     )
     optimizer_memory = (
         get_optimizer_memory(model_size, optimizer) * trainable_parameters / 100
@@ -154,13 +192,11 @@ def calculate_training_memory(
 
     return {
         "model_weights": get_memory(model_weights),
-        "kv_cache": get_memory(kv_cache),
         "activation_memory": get_memory(activation_memory),
         "optimizer_memory": get_memory(optimizer_memory),
         "gradients_memory": get_memory(gradients_memory),
         "training_memory": get_memory(
             model_weights,
-            kv_cache,
             activation_memory,
             optimizer_memory,
             gradients_memory,

From 18ec16487e3e47ab8c5e31dc11f3bef556c6a3c8 Mon Sep 17 00:00:00 2001
From: Krish Vaswani <krishvaswani@yahoo.com>
Date: Tue, 21 Oct 2025 11:33:47 +0000
Subject: [PATCH 2/2] Replace fixed MLP multiple with a user-supplied MLP layer size

---
 llm_tools/pages/1_Memory.py     | 10 ++++++++++
 llm_tools/utils/memory_utils.py | 24 ++++++++++++++++--------
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/llm_tools/pages/1_Memory.py b/llm_tools/pages/1_Memory.py
index aeadb5d..2db1f16 100644
--- a/llm_tools/pages/1_Memory.py
+++ b/llm_tools/pages/1_Memory.py
@@ -95,6 +95,14 @@ def set_values():
     key="num_attention_heads",
     help="Number of attention heads in the model (given by the model card).",
 )
+mlp_layer_size = st.sidebar.number_input(
+    "MLP Layer Size",
+    min_value=0,
+    step=1,
+    value=None,
+    key="mlp_layer_size",
+    help="Size of the MLP layer (usually 4x the hidden size).",
+)
 
 
 # ----------------- Main Screen UI ----------------- #
@@ -117,6 +125,7 @@ def set_values():
     hidden_size,
     num_hidden_layers,
     num_attention_heads,
+    mlp_layer_size,
 )
 
 inference.write(f"**Total Inference Memory**: {inference_memory['inference_memory']}")
@@ -136,6 +145,7 @@ def set_values():
     num_attention_heads,
     optimizer,
     trainable_parameters,
+    mlp_layer_size,
 )
 
 training1.write(f"**Total Training Memory**: {training_memory['training_memory']}")
diff --git a/llm_tools/utils/memory_utils.py b/llm_tools/utils/memory_utils.py
index f14959c..8265b9d 100644
--- a/llm_tools/utils/memory_utils.py
+++ b/llm_tools/utils/memory_utils.py
@@ -65,7 +65,7 @@ def get_kv_cache(
 @st.cache_data
 def get_activation_memory(
     batch_size, sequence_length, hidden_size, num_attention_heads, precision,
-    num_hidden_layers, mlp_multiple=4
+    num_hidden_layers, mlp_layer_size
 ):
     """
     Calculate the memory required for activations. It references this paper:
@@ -90,19 +90,19 @@ def get_activation_memory(
     I'm assuming this structure of the MLP:
     Linear layer, Activation function, Linear layer
     With the first linear layer transforming h dimensions into some higher
-    dimension mh and the second linear layer projecting it back.
+    dimension m and the second linear layer projecting it back.
     Then, the memory required is:
     1st Linear layer: sbhL
-    Activation: sbmhL
-    2nd Linear layer: sbmhL
+    Activation: sbmL
+    2nd Linear layer: sbmL
 
-    Total = (3 + 2a + 2m)sbhL + 3as^2bL
+    Total = (3 + 2a)sbhL + 3as^2bL + 2msbL
     """
     
     try:
         return (
             (
-                (3 + 2 * num_attention_heads + 2 * mlp_multiple)
+                (3 + 2 * num_attention_heads)
                 * sequence_length
                 * batch_size
                 * hidden_size
@@ -113,6 +113,12 @@ def get_activation_memory(
                 * sequence_length ** 2
                 * batch_size
                 * num_hidden_layers
+            ) + (
+                2
+                * mlp_layer_size
+                * sequence_length
+                * batch_size
+                * num_hidden_layers
             )
         ) * DATA_TYPE_SIZES[precision]
     except:
@@ -147,6 +153,7 @@ def calculate_inference_memory(
     hidden_size,
     num_hidden_layers,
     num_attention_heads,
+    mlp_layer_size,
 ):
     """Calculate the total memory required for inference."""
     model_weights = get_model_weights(model_size, precision)
@@ -155,7 +162,7 @@ def calculate_inference_memory(
     )
     activation_memory = get_activation_memory(
         batch_size, sequence_length, hidden_size, num_attention_heads, precision,
-        num_hidden_layers
+        num_hidden_layers, mlp_layer_size
     )
     return {
         "model_weights": get_memory(model_weights),
@@ -176,12 +183,13 @@ def calculate_training_memory(
     num_attention_heads,
     optimizer,
     trainable_parameters,
+    mlp_layer_size
 ):
     """Calculate the total memory required for training."""
     model_weights = get_model_weights(model_size, precision)
     activation_memory = get_activation_memory(
         batch_size, sequence_length, hidden_size, num_attention_heads, precision,
-        num_hidden_layers
+        num_hidden_layers, mlp_layer_size
     )
     optimizer_memory = (
         get_optimizer_memory(model_size, optimizer) * trainable_parameters / 100