Commit 60517f0
Parent(s): fc57cfc
Update app.py
app.py CHANGED
@@ -60,6 +60,55 @@ def calc_mem(hf_model_name_or_path, num_gpus, tensor_parallel_size, pipeline_par
 
     return f"Per-GPU Memory Required for Training: {per_gpu_mem_gib:.2f} GiB"
 
+# ---- FLOP Calculation ---- #
+def calc_flops(vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer):
+    # An A_(m x k) X B_(k x n) matrix multiplication requires 2m x k x n FLOPs (factor of 2 needed to account for multiplies and adds)
+
+    # determine the flops factor.
+    iter_factor = 3
+    if checkpoint_activations:
+        iter_factor += 1
+    if infer:
+        iter_factor = 1
+
+    qkv_flops = int(iter_factor * 2 * (1 + 2 * kv_size_ratio) * num_layers * tokens * hidden_size * hidden_size)
+    attention_matrix_flops = iter_factor * 2 * num_layers * tokens * sequence_length * hidden_size
+    attention_over_values_flops = iter_factor * 2 * num_layers * tokens * sequence_length * hidden_size
+    linear_projection_flops = iter_factor * 2 * num_layers * tokens * hidden_size * hidden_size
+    ffn_flops = int(iter_factor * 2 * ffn_expansion_factor) * num_layers * tokens * hidden_size * hidden_size
+    embedding_flops = 6 * tokens * hidden_size * vocab_size
+
+    if moe and topk > 1:
+        ffn_flops += ffn_flops * topk / expert_interval
+
+    if moe:
+        gating_flops = 2 * num_experts * hidden_size / expert_interval
+
+    total_flops = qkv_flops + attention_matrix_flops + attention_over_values_flops + linear_projection_flops + ffn_flops + embedding_flops
+
+    if moe:
+        total_flops += gating_flops
+
+    def convert_flops(params):
+        if params == 0:
+            return "0"
+        size_name = ("", "KFLOPs", "MFLOPs", "GFLOPs", "TFLOPs", "PFLOPs", "EFLOPs", "ZFLOPs", "YFLOPs")
+        i = int(math.floor(math.log(params, 1000)))
+        p = math.pow(1000, i)
+        s = round(params / p, 2)
+        return f"{s} {size_name[i]}"
+
+    return {
+        'qkv_flops': convert_flops(qkv_flops),
+        'attention_matrix_flops': convert_flops(attention_matrix_flops),
+        'attention_over_values_flops': convert_flops(attention_over_values_flops),
+        'linear_projection_flops': convert_flops(linear_projection_flops),
+        'ffn_flops': convert_flops(ffn_flops),
+        'embedding_flops': convert_flops(embedding_flops),
+        'total_flops': convert_flops(total_flops)
+    }
+
+
 # ---- Gradio Interface ---- #
 with gr.Blocks() as demo:
 
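Two conventions worth spelling out in `calc_flops` above: every term follows the rule that an A_(m x k) @ B_(k x n) matmul costs about 2·m·k·n FLOPs (multiplies plus adds), and `iter_factor` is 3 for training (one forward pass plus a backward pass costing roughly twice the forward), +1 when activation checkpointing recomputes the forward, and 1 for inference-only runs. A minimal sanity check of the matmul rule (an illustration, not part of this commit):

```python
# Count the exact multiplies and adds in an (m x k) @ (k x n) matmul and
# compare with the 2*m*k*n rule of thumb that calc_flops uses throughout.
m, k, n = 3, 5, 4
multiplies = m * n * k        # each of the m*n outputs is a length-k dot product
adds = m * n * (k - 1)        # summing k products takes k - 1 additions
print(multiplies + adds, "exact vs", 2 * m * k * n, "rule")  # 108 exact vs 120
```

The gap closes as k grows, which is why the factor of 2 is the standard approximation for large transformer layers.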
@@ -266,4 +315,97 @@ with gr.Blocks() as demo:
         inputs=[hf_model_name_or_path],
         outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length])
 
+    # New FLOP Calculation Tab
+    with gr.TabItem("FLOP Calculation"):
+        gr.Markdown("""
+        ## FLOP Calculation
+
+        FLOP Calculation estimates the number of floating point operations (FLOPs) for training or inference of a model.
+        Provide the necessary model hyperparameters and click 'Calculate FLOPs' to get a result.
+        """)
+        with gr.Row():
+            with gr.Column():
+                vocab_size = gr.Number(
+                    label="Vocab Size",
+                    value=51200,
+                    info="How many tokens are in the embedding layer"
+                )
+                hidden_size = gr.Number(
+                    label="Hidden Size",
+                    value=6144,
+                    info="Dimension of the model's hidden size"
+                )
+                sequence_length = gr.Number(
+                    label="Sequence Length",
+                    value=2048,
+                    info="Sequence length used for training"
+                )
+                num_layers = gr.Number(
+                    label="Number of Layers",
+                    value=44,
+                    info="Number of transformer layers used in the model"
+                )
+                kv_size_ratio = gr.Number(
+                    label="KV Size Ratio",
+                    value=1.0,
+                    info="Ratio of kv heads to query heads used in model. 1.0 for MHA"
+                )
+                topk = gr.Number(
+                    label="Top K Routing for MoE",
+                    value=1,
+                    info="Top k routing for Mixture of Experts (MoE)"
+                )
+                moe = gr.Checkbox(
+                    label="Mixture of Experts (MoE)",
+                    value=False,
+                    info="Whether the model uses Mixture of Experts"
+                )
+                num_experts = gr.Number(
+                    label="Number of Experts",
+                    value=128,
+                    info="Number of experts for Mixture of Experts (MoE)"
+                )
+                expert_interval = gr.Number(
+                    label="Expert Interval",
+                    value=2,
+                    info="Expert interval for Mixture of Experts (MoE)"
+                )
+                batch_size = gr.Number(
+                    label="Batch Size",
+                    value=1,
+                    info="Global batch size in units of samples"
+                )
+                tokens = gr.Number(
+                    label="Number of Tokens",
+                    value=300e9,
+                    info="Total number of tokens for training"
+                )
+                checkpoint_activations = gr.Checkbox(
+                    label="Checkpoint Activations",
+                    value=True,
+                    info="Whether Megatron-style activation checkpointing is being used"
+                )
+                ffn_expansion_factor = gr.Number(
+                    label="FFN Expansion Factor",
+                    value=4,
+                    info="How much the MLP hidden size expands"
+                )
+                infer = gr.Checkbox(
+                    label="Inference-Only",
+                    value=False,
+                    info="Whether the model is being used for inference-only"
+                )
+
+        calc_flops_button = gr.Button("Calculate FLOPs")
+        flops_result = gr.JSON(label="FLOP Calculation Result", interactive=False)
+        calc_flops_button.click(
+            calc_flops,
+            inputs=[vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer],
+            outputs=flops_result
+        )
+
+        hf_model_name_or_path = gr.Textbox(label="HuggingFace Model Name or Path", info="Name of the HuggingFace model or local path")
+        hf_model_name_or_path.change(fn=get_hf_model_args, inputs=[hf_model_name_or_path], outputs=[num_layers, hidden_size, vocab_size, sequence_length])
+
+
 demo.launch()
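For a quick smoke test of the new math without launching the UI (importing app.py would call `demo.launch()` at module level), the dense, non-MoE path of `calc_flops` can be re-derived standalone. The `total_train_flops` function below is a hypothetical condensation of the commit's formulas, with defaults mirroring the tab; `convert_flops` is copied verbatim from the diff:

```python
import math

def convert_flops(params):
    # Copied verbatim from the convert_flops helper in the diff above.
    if params == 0:
        return "0"
    size_name = ("", "KFLOPs", "MFLOPs", "GFLOPs", "TFLOPs", "PFLOPs", "EFLOPs", "ZFLOPs", "YFLOPs")
    i = int(math.floor(math.log(params, 1000)))
    p = math.pow(1000, i)
    s = round(params / p, 2)
    return f"{s} {size_name[i]}"

def total_train_flops(hidden_size=6144, num_layers=44, vocab_size=51200,
                      sequence_length=2048, tokens=300e9,
                      kv_size_ratio=1.0, ffn_expansion_factor=4,
                      checkpoint_activations=True, infer=False):
    # Same terms as calc_flops, dense (non-MoE) path only.
    iter_factor = 1 if infer else (4 if checkpoint_activations else 3)
    h2 = hidden_size * hidden_size
    qkv = iter_factor * 2 * (1 + 2 * kv_size_ratio) * num_layers * tokens * h2
    attention = 2 * iter_factor * 2 * num_layers * tokens * sequence_length * hidden_size  # scores + values
    projection = iter_factor * 2 * num_layers * tokens * h2
    ffn = iter_factor * 2 * ffn_expansion_factor * num_layers * tokens * h2
    embedding = 6 * tokens * hidden_size * vocab_size
    return qkv + attention + projection + ffn + embedding

print(convert_flops(total_train_flops()))  # roughly 35 ZFLOPs for the tab defaults
```

The h² terms (QKV, projection, FFN) dominate at these shapes, which is why the result barely moves if the attention-score and embedding terms are dropped.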