diff --git a/llm_tools/pages/1_Memory.py b/llm_tools/pages/1_Memory.py index aeadb5d..2db1f16 100644 --- a/llm_tools/pages/1_Memory.py +++ b/llm_tools/pages/1_Memory.py @@ -95,6 +95,14 @@ def set_values(): key="num_attention_heads", help="Number of attention heads in the model (given by the model card).", ) +mlp_layer_size = st.sidebar.number_input( + "MLP Layer Size", + min_value=0, + step=1, + value=None, + key="mlp_layer_size", + help="Size of the MLP layer (usually 4x the hidden size).", +) # ----------------- Main Screen UI ----------------- # @@ -117,6 +125,7 @@ def set_values(): hidden_size, num_hidden_layers, num_attention_heads, + mlp_layer_size, ) inference.write(f"**Total Inference Memory**: {inference_memory['inference_memory']}") @@ -136,6 +145,7 @@ def set_values(): num_attention_heads, optimizer, trainable_parameters, + mlp_layer_size, ) training1.write(f"**Total Training Memory**: {training_memory['training_memory']}") diff --git a/llm_tools/utils/memory_utils.py b/llm_tools/utils/memory_utils.py index 3d50a9b..8265b9d 100644 --- a/llm_tools/utils/memory_utils.py +++ b/llm_tools/utils/memory_utils.py @@ -64,18 +64,63 @@ def get_kv_cache( @st.cache_data def get_activation_memory( - batch_size, sequence_length, hidden_size, num_attention_heads + batch_size, sequence_length, hidden_size, num_attention_heads, precision, + num_hidden_layers, mlp_layer_size ): - """Calculate the memory required for activations.""" - precision = "float32" + """ + Calculate the memory required for activations. It references this paper: + https://proceedings.mlsys.org/paper_files/paper/2023/file/80083951326cf5b35e5100260d64ed81-Paper-mlsys2023.pdf + + Let + s = sequence length + b = batch size + h = hidden size + a = attention heads + L = number of transformer layers + + Then, the number of activations for each step is: + Attention: + Q, K, V: sbhL + QK^T: 2sbhaL + Softmax: as^2bL + Softmax dropout: as^2bL + Attention over Values: as^2bL + sbhL + + MLP: + I'm assuming this structure of the MLP: + Linear layer, Activation function, Linear layer + With the first linear layer transforming h dimensions into some higher + dimension m and the second linear layer projecting it back. + Then, the memory required is: + 1st Linear layer: sbhL + Activation: sbmL + 2nd Linear layer: sbmL + + Total = (3 + 2a)sbhL + 3as^2bL + 2msbL + """ + try: return ( - batch_size - * sequence_length - * hidden_size - * (34 + (5 * sequence_length * num_attention_heads) / hidden_size) - * DATA_TYPE_SIZES[precision] - ) + ( + (3 + 2 * num_attention_heads) + * sequence_length + * batch_size + * hidden_size + * num_hidden_layers + ) + ( + 3 + * num_attention_heads + * sequence_length ** 2 + * batch_size + * num_hidden_layers + ) + ( + 2 + * mlp_layer_size + * sequence_length + * batch_size + * num_hidden_layers + ) + ) * DATA_TYPE_SIZES[precision] except: return 0 @@ -108,6 +153,7 @@ def calculate_inference_memory( hidden_size, num_hidden_layers, num_attention_heads, + mlp_layer_size, ): """Calculate the total memory required for inference.""" model_weights = get_model_weights(model_size, precision) @@ -115,7 +161,8 @@ def calculate_inference_memory( precision, batch_size, sequence_length, hidden_size, num_hidden_layers ) activation_memory = get_activation_memory( - batch_size, sequence_length, hidden_size, num_attention_heads + batch_size, sequence_length, hidden_size, num_attention_heads, precision, + num_hidden_layers, mlp_layer_size ) return { "model_weights": get_memory(model_weights), @@ -136,14 +183,13 @@ def calculate_training_memory( num_attention_heads, optimizer, trainable_parameters, + mlp_layer_size ): """Calculate the total memory required for training.""" model_weights = get_model_weights(model_size, precision) - kv_cache = get_kv_cache( - precision, batch_size, sequence_length, hidden_size, num_hidden_layers - ) activation_memory = get_activation_memory( - batch_size, sequence_length, hidden_size, num_attention_heads + batch_size, sequence_length, hidden_size, num_attention_heads, precision, + num_hidden_layers, mlp_layer_size ) optimizer_memory = ( get_optimizer_memory(model_size, optimizer) * trainable_parameters / 100 @@ -154,13 +200,11 @@ def calculate_training_memory( return { "model_weights": get_memory(model_weights), - "kv_cache": get_memory(kv_cache), "activation_memory": get_memory(activation_memory), "optimizer_memory": get_memory(optimizer_memory), "gradients_memory": get_memory(gradients_memory), "training_memory": get_memory( model_weights, - kv_cache, activation_memory, optimizer_memory, gradients_memory,