derek-thomas HF staff committed on
Commit
60517f0
1 Parent(s): fc57cfc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +142 -0
app.py CHANGED
@@ -60,6 +60,55 @@ def calc_mem(hf_model_name_or_path, num_gpus, tensor_parallel_size, pipeline_par
60
 
61
  return f"Per-GPU Memory Required for Training: {per_gpu_mem_gib:.2f} GiB"
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  # ---- Gradio Interface ---- #
64
  with gr.Blocks() as demo:
65
 
@@ -266,4 +315,97 @@ with gr.Blocks() as demo:
266
  inputs=[hf_model_name_or_path],
267
  outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length])
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  demo.launch()
 
60
 
61
  return f"Per-GPU Memory Required for Training: {per_gpu_mem_gib:.2f} GiB"
62
 
63
+ # ---- FLOP Calculation ---- #
64
+ def calc_flops(vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer):
65
+ # An A_(m x k) X B_(k x n) matrix multiplication requires 2m x k x n FLOPs (factor of 2 needed to account for multiplies and adds)
66
+
67
+ # determine the flops factor.
68
+ iter_factor = 3
69
+ if checkpoint_activations:
70
+ iter_factor += 1
71
+ if infer:
72
+ iter_factor = 1
73
+
74
+ qkv_flops = int(iter_factor * 2 * (1 + 2 * kv_size_ratio) * num_layers * tokens * hidden_size * hidden_size)
75
+ attention_matrix_flops = iter_factor * 2 * num_layers * tokens * sequence_length * hidden_size
76
+ attention_over_values_flops = iter_factor * 2 * num_layers * tokens * sequence_length * hidden_size
77
+ linear_projection_flops = iter_factor * 2 * num_layers * tokens * hidden_size * hidden_size
78
+ ffn_flops = int(iter_factor * 2 * ffn_expansion_factor) * num_layers * tokens * hidden_size * hidden_size
79
+ embedding_flops = 6 * tokens * hidden_size * vocab_size
80
+
81
+ if moe and topk > 1:
82
+ ffn_flops += ffn_flops * topk / expert_interval
83
+
84
+ if moe:
85
+ gating_flops = 2 * num_experts * hidden_size / expert_interval
86
+
87
+ total_flops = qkv_flops + attention_matrix_flops + attention_over_values_flops + linear_projection_flops + ffn_flops + embedding_flops
88
+
89
+ if moe:
90
+ total_flops += gating_flops
91
+
92
+ def convert_flops(params):
93
+ if params == 0:
94
+ return "0"
95
+ size_name = ("", "KFLOPs", "MFLOPs", "GFLOPs", "TFLOPs", "PFLOPs", "EFLOPs", "ZFLOPs", "YFLOPs")
96
+ i = int(math.floor(math.log(params, 1000)))
97
+ p = math.pow(1000, i)
98
+ s = round(params / p, 2)
99
+ return f"{s} {size_name[i]}"
100
+
101
+ return {
102
+ 'qkv_flops': convert_flops(qkv_flops),
103
+ 'attention_matrix_flops': convert_flops(attention_matrix_flops),
104
+ 'attention_over_values_flops': convert_flops(attention_over_values_flops),
105
+ 'linear_projection_flops': convert_flops(linear_projection_flops),
106
+ 'ffn_flops': convert_flops(ffn_flops),
107
+ 'embedding_flops': convert_flops(embedding_flops),
108
+ 'total_flops': convert_flops(total_flops)
109
+ }
110
+
111
+
112
  # ---- Gradio Interface ---- #
113
  with gr.Blocks() as demo:
114
 
 
315
  inputs=[hf_model_name_or_path],
316
  outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length])
317
 
318
+ # New FLOP Calculation Tab
319
+ with gr.TabItem("FLOP Calculation"):
320
+ gr.Markdown("""
321
+ ## FLOP Calculation
322
+
323
+ FLOP Calculation estimates the number of floating point operations (FLOPs) for training or inference of a model.
324
+ Provide the necessary model hyperparameters and click 'Calculate FLOPs' to get a result.
325
+ """)
326
+ with gr.Row():
327
+ with gr.Column():
328
+ vocab_size = gr.Number(
329
+ label="Vocab Size",
330
+ value=51200,
331
+ info="How many tokens are in the embedding layer"
332
+ )
333
+ hidden_size = gr.Number(
334
+ label="Hidden Size",
335
+ value=6144,
336
+ info="Dimension of the model's hidden size"
337
+ )
338
+ sequence_length = gr.Number(
339
+ label="Sequence Length",
340
+ value=2048,
341
+ info="Sequence length used for training"
342
+ )
343
+ num_layers = gr.Number(
344
+ label="Number of Layers",
345
+ value=44,
346
+ info="Number of transformer layers used in the model"
347
+ )
348
+ kv_size_ratio = gr.Number(
349
+ label="KV Size Ratio",
350
+ value=1.0,
351
+ info="Ratio of kv heads to query heads used in model. 1.0 for MHA"
352
+ )
353
+ topk = gr.Number(
354
+ label="Top K Routing for MoE",
355
+ value=1,
356
+ info="Top k routing for Mixture of Experts (MoE)"
357
+ )
358
+ moe = gr.Checkbox(
359
+ label="Mixture of Experts (MoE)",
360
+ value=False,
361
+ info="Whether the model uses Mixture of Experts"
362
+ )
363
+ num_experts = gr.Number(
364
+ label="Number of Experts",
365
+ value=128,
366
+ info="Number of experts for Mixture of Experts (MoE)"
367
+ )
368
+ expert_interval = gr.Number(
369
+ label="Expert Interval",
370
+ value=2,
371
+ info="Expert interval for Mixture of Experts (MoE)"
372
+ )
373
+ batch_size = gr.Number(
374
+ label="Batch Size",
375
+ value=1,
376
+ info="Global batch size in units of samples"
377
+ )
378
+ tokens = gr.Number(
379
+ label="Number of Tokens",
380
+ value=300e9,
381
+ info="Total number of tokens for training"
382
+ )
383
+ checkpoint_activations = gr.Checkbox(
384
+ label="Checkpoint Activations",
385
+ value=True,
386
+ info="Whether Megatron-style activation checkpointing is being used"
387
+ )
388
+ ffn_expansion_factor = gr.Number(
389
+ label="FFN Expansion Factor",
390
+ value=4,
391
+ info="How much the MLP hidden size expands"
392
+ )
393
+ infer = gr.Checkbox(
394
+ label="Inference-Only",
395
+ value=False,
396
+ info="Whether the model is being used for inference-only"
397
+ )
398
+
399
+ calc_flops_button = gr.Button("Calculate FLOPs")
400
+ flops_result = gr.JSON(label="FLOP Calculation Result", interactive=False)
401
+ calc_flops_button.click(
402
+ calc_flops,
403
+ inputs=[vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer],
404
+ outputs=flops_result
405
+ )
406
+
407
+ hf_model_name_or_path = gr.Textbox(label="HuggingFace Model Name or Path", info="Name of the HuggingFace model or local path")
408
+ hf_model_name_or_path.change(fn=get_hf_model_args, inputs=[hf_model_name_or_path], outputs=[num_layers, hidden_size, vocab_size, sequence_length])
409
+
410
+
411
  demo.launch()