From 987139fad49867f92035ac01f929ba0c9ac5fc03 Mon Sep 17 00:00:00 2001 From: Krish Vaswani Date: Tue, 21 Oct 2025 14:06:15 +0530 Subject: [PATCH 1/2] Fixed some numerical mistakes in the base code --- llm_tools/utils/memory_utils.py | 68 +++++++++++++++++++++++++-------- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/llm_tools/utils/memory_utils.py b/llm_tools/utils/memory_utils.py index 3d50a9b..f14959c 100644 --- a/llm_tools/utils/memory_utils.py +++ b/llm_tools/utils/memory_utils.py @@ -64,18 +64,57 @@ def get_kv_cache( @st.cache_data def get_activation_memory( - batch_size, sequence_length, hidden_size, num_attention_heads + batch_size, sequence_length, hidden_size, num_attention_heads, precision, + num_hidden_layers, mlp_multiple=4 ): - """Calculate the memory required for activations.""" - precision = "float32" + """ + Calculate the memory required for activations. It references this paper: + https://proceedings.mlsys.org/paper_files/paper/2023/file/80083951326cf5b35e5100260d64ed81-Paper-mlsys2023.pdf + + Let + s = sequence length + b = batch size + h = hidden size + a = attention heads + L = number of transformer layers + + Then, the number of activations for each step is: + Attention: + Q, K, V: sbhL + QK^T: 2sbhaL + Softmax: as^2bL + Softmax dropout: as^2bL + Attention over Values: as^2bL + sbhL + + MLP: + I'm assuming this structure of the MLP: + Linear layer, Activation function, Linear layer + With the first linear layer transforming h dimensions into some higher + dimension mh and the second linear layer projecting it back. + Then, the memory required is: + 1st Linear layer: sbhL + Activation: sbmhL + 2nd Linear layer: sbmhL + + Total = (3 + 2a + 2m)sbhL + 3as^2bL + """ + try: return ( - batch_size - * sequence_length - * hidden_size - * (34 + (5 * sequence_length * num_attention_heads) / hidden_size) - * DATA_TYPE_SIZES[precision] - ) + ( + (3 + 2 * num_attention_heads + 2 * mlp_multiple) + * sequence_length + * batch_size + * hidden_size + * num_hidden_layers + ) + ( + 3 + * num_attention_heads + * sequence_length ** 2 + * batch_size + * num_hidden_layers + ) + ) * DATA_TYPE_SIZES[precision] except: return 0 @@ -115,7 +154,8 @@ def calculate_inference_memory( precision, batch_size, sequence_length, hidden_size, num_hidden_layers ) activation_memory = get_activation_memory( - batch_size, sequence_length, hidden_size, num_attention_heads + batch_size, sequence_length, hidden_size, num_attention_heads, precision, + num_hidden_layers ) return { "model_weights": get_memory(model_weights), @@ -139,11 +179,9 @@ def calculate_training_memory( ): """Calculate the total memory required for training.""" model_weights = get_model_weights(model_size, precision) - kv_cache = get_kv_cache( - precision, batch_size, sequence_length, hidden_size, num_hidden_layers - ) activation_memory = get_activation_memory( - batch_size, sequence_length, hidden_size, num_attention_heads + batch_size, sequence_length, hidden_size, num_attention_heads, precision, + num_hidden_layers ) optimizer_memory = ( get_optimizer_memory(model_size, optimizer) * trainable_parameters / 100 @@ -154,13 +192,11 @@ def calculate_training_memory( return { "model_weights": get_memory(model_weights), - "kv_cache": get_memory(kv_cache), "activation_memory": get_memory(activation_memory), "optimizer_memory": get_memory(optimizer_memory), "gradients_memory": get_memory(gradients_memory), "training_memory": get_memory( model_weights, - kv_cache, activation_memory, optimizer_memory, gradients_memory, From 18ec16487e3e47ab8c5e31dc11f3bef556c6a3c8 Mon Sep 17 00:00:00 2001 From: Krish Vaswani Date: Tue, 21 Oct 2025 11:33:47 +0000 Subject: [PATCH 2/2] Added MLP layer size as a parameter --- llm_tools/pages/1_Memory.py | 10 ++++++++++ llm_tools/utils/memory_utils.py | 24 ++++++++++++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/llm_tools/pages/1_Memory.py b/llm_tools/pages/1_Memory.py index aeadb5d..2db1f16 100644 --- a/llm_tools/pages/1_Memory.py +++ b/llm_tools/pages/1_Memory.py @@ -95,6 +95,14 @@ def set_values(): key="num_attention_heads", help="Number of attention heads in the model (given by the model card).", ) +mlp_layer_size = st.sidebar.number_input( + "MLP Layer Size", + min_value=0, + step=1, + value=None, + key="mlp_layer_size", + help="Size of the MLP layer (usually 4x the hidden size).", +) # ----------------- Main Screen UI ----------------- # @@ -117,6 +125,7 @@ def set_values(): hidden_size, num_hidden_layers, num_attention_heads, + mlp_layer_size, ) inference.write(f"**Total Inference Memory**: {inference_memory['inference_memory']}") @@ -136,6 +145,7 @@ def set_values(): num_attention_heads, optimizer, trainable_parameters, + mlp_layer_size, ) training1.write(f"**Total Training Memory**: {training_memory['training_memory']}") diff --git a/llm_tools/utils/memory_utils.py b/llm_tools/utils/memory_utils.py index f14959c..8265b9d 100644 --- a/llm_tools/utils/memory_utils.py +++ b/llm_tools/utils/memory_utils.py @@ -65,7 +65,7 @@ def get_kv_cache( @st.cache_data def get_activation_memory( batch_size, sequence_length, hidden_size, num_attention_heads, precision, - num_hidden_layers, mlp_multiple=4 + num_hidden_layers, mlp_layer_size ): """ Calculate the memory required for activations. It references this paper: @@ -90,19 +90,19 @@ def get_activation_memory( I'm assuming this structure of the MLP: Linear layer, Activation function, Linear layer With the first linear layer transforming h dimensions into some higher - dimension mh and the second linear layer projecting it back. + dimension m and the second linear layer projecting it back. Then, the memory required is: 1st Linear layer: sbhL - Activation: sbmhL - 2nd Linear layer: sbmhL + Activation: sbmL + 2nd Linear layer: sbmL - Total = (3 + 2a + 2m)sbhL + 3as^2bL + Total = (3 + 2a)sbhL + 3as^2bL + 2msbL """ try: return ( ( - (3 + 2 * num_attention_heads + 2 * mlp_multiple) + (3 + 2 * num_attention_heads) * sequence_length * batch_size * hidden_size @@ -113,6 +113,12 @@ def get_activation_memory( * sequence_length ** 2 * batch_size * num_hidden_layers + ) + ( + 2 + * mlp_layer_size + * sequence_length + * batch_size + * num_hidden_layers ) ) * DATA_TYPE_SIZES[precision] except: @@ -147,6 +153,7 @@ def calculate_inference_memory( hidden_size, num_hidden_layers, num_attention_heads, + mlp_layer_size, ): """Calculate the total memory required for inference.""" model_weights = get_model_weights(model_size, precision) @@ -155,7 +162,7 @@ def calculate_inference_memory( ) activation_memory = get_activation_memory( batch_size, sequence_length, hidden_size, num_attention_heads, precision, - num_hidden_layers + num_hidden_layers, mlp_layer_size ) return { "model_weights": get_memory(model_weights), @@ -176,12 +183,13 @@ def calculate_training_memory( num_attention_heads, optimizer, trainable_parameters, + mlp_layer_size ): """Calculate the total memory required for training.""" model_weights = get_model_weights(model_size, precision) activation_memory = get_activation_memory( batch_size, sequence_length, hidden_size, num_attention_heads, precision, - num_hidden_layers + num_hidden_layers, mlp_layer_size ) optimizer_memory = ( get_optimizer_memory(model_size, optimizer) * trainable_parameters / 100