Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions llm_tools/pages/1_Memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,14 @@ def set_values():
key="num_attention_heads",
help="Number of attention heads in the model (given by the model card).",
)
mlp_layer_size = st.sidebar.number_input(
"MLP Layer Size",
min_value=0,
step=1,
value=None,
key="mlp_layer_size",
help="Size of the MLP layer (usually 4x the hidden size).",
)


# ----------------- Main Screen UI ----------------- #
Expand All @@ -117,6 +125,7 @@ def set_values():
hidden_size,
num_hidden_layers,
num_attention_heads,
mlp_layer_size,
)

inference.write(f"**Total Inference Memory**: {inference_memory['inference_memory']}")
Expand All @@ -136,6 +145,7 @@ def set_values():
num_attention_heads,
optimizer,
trainable_parameters,
mlp_layer_size,
)

training1.write(f"**Total Training Memory**: {training_memory['training_memory']}")
Expand Down
76 changes: 60 additions & 16 deletions llm_tools/utils/memory_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,18 +64,63 @@ def get_kv_cache(

@st.cache_data
def get_activation_memory(
    batch_size,
    sequence_length,
    hidden_size,
    num_attention_heads,
    precision,
    num_hidden_layers,
    mlp_layer_size,
):
    """Calculate the memory (in bytes) required for activations.

    Based on the activation-memory analysis in:
    https://proceedings.mlsys.org/paper_files/paper/2023/file/80083951326cf5b35e5100260d64ed81-Paper-mlsys2023.pdf

    Let
        s = sequence length
        b = batch size
        h = hidden size
        a = number of attention heads
        L = number of transformer layers
        m = MLP intermediate size

    Activation counts per step:
        Attention:
            Q, K, V:                sbhL
            QK^T:                   2sbhaL
            Softmax:                a s^2 b L
            Softmax dropout:        a s^2 b L
            Attention over values:  a s^2 b L + sbhL

        MLP (assumed shape: linear layer -> activation -> linear layer,
        where the first linear layer maps h dimensions to m and the
        second projects m back to h):
            1st linear layer: sbhL
            Activation:       sbmL
            2nd linear layer: sbmL

        Total activations = (3 + 2a)sbhL + 3a s^2 bL + 2msbL

    The activation count is multiplied by the per-element byte size of
    ``precision`` (looked up in DATA_TYPE_SIZES) to get bytes.

    Returns 0 when an input is still unset (None) or the precision is
    unknown, so the Streamlit UI can render before all fields are filled.
    """
    try:
        # (3 + 2a) * s * b * h * L — attention terms linear in s.
        attention_linear = (
            (3 + 2 * num_attention_heads)
            * sequence_length
            * batch_size
            * hidden_size
            * num_hidden_layers
        )
        # 3a * s^2 * b * L — softmax / dropout / attention-over-values.
        attention_quadratic = (
            3
            * num_attention_heads
            * sequence_length ** 2
            * batch_size
            * num_hidden_layers
        )
        # 2m * s * b * L — the two MLP activations of width m.
        mlp = (
            2
            * mlp_layer_size
            * sequence_length
            * batch_size
            * num_hidden_layers
        )
        return (attention_linear + attention_quadratic + mlp) * DATA_TYPE_SIZES[
            precision
        ]
    # Narrowed from a bare `except:`, which would also hide real bugs and
    # swallow KeyboardInterrupt/SystemExit. The expected failures here are:
    #   KeyError  — `precision` not present in DATA_TYPE_SIZES;
    #   TypeError — a sidebar number_input is still None (value=None default).
    except (KeyError, TypeError):
        return 0

Expand Down Expand Up @@ -108,14 +153,16 @@ def calculate_inference_memory(
hidden_size,
num_hidden_layers,
num_attention_heads,
mlp_layer_size,
):
"""Calculate the total memory required for inference."""
model_weights = get_model_weights(model_size, precision)
kv_cache = get_kv_cache(
precision, batch_size, sequence_length, hidden_size, num_hidden_layers
)
activation_memory = get_activation_memory(
batch_size, sequence_length, hidden_size, num_attention_heads
batch_size, sequence_length, hidden_size, num_attention_heads, precision,
num_hidden_layers, mlp_layer_size
)
return {
"model_weights": get_memory(model_weights),
Expand All @@ -136,14 +183,13 @@ def calculate_training_memory(
num_attention_heads,
optimizer,
trainable_parameters,
mlp_layer_size
):
"""Calculate the total memory required for training."""
model_weights = get_model_weights(model_size, precision)
kv_cache = get_kv_cache(
precision, batch_size, sequence_length, hidden_size, num_hidden_layers
)
activation_memory = get_activation_memory(
batch_size, sequence_length, hidden_size, num_attention_heads
batch_size, sequence_length, hidden_size, num_attention_heads, precision,
num_hidden_layers, mlp_layer_size
)
optimizer_memory = (
get_optimizer_memory(model_size, optimizer) * trainable_parameters / 100
Expand All @@ -154,13 +200,11 @@ def calculate_training_memory(

return {
"model_weights": get_memory(model_weights),
"kv_cache": get_memory(kv_cache),
"activation_memory": get_memory(activation_memory),
"optimizer_memory": get_memory(optimizer_memory),
"gradients_memory": get_memory(gradients_memory),
"training_memory": get_memory(
model_weights,
kv_cache,
activation_memory,
optimizer_memory,
gradients_memory,
Expand Down