Update calculations

- src/app.py +17 -11
- src/model_utils.py +10 -10
- src/parallelism_utils.py +87 -57
src/app.py CHANGED
@@ -9,7 +9,7 @@ from huggingface_hub.utils import HfHubHTTPError
 MODEL = None
 
 
-def get_results(model_name: str, library: str, precision: list, training: list, access_token: str, zero_stage: int, num_nodes: int, num_gpus: int, offloading: list, zero_init: list):
+def get_results(model_name: str, library: str, precision: list, training: list, access_token: str, zero_stage: int, num_nodes: int, num_gpus: int, offloading: list, zero_init: list, additional_buffer_factor: float):
     global MODEL
     MODEL = get_model(model_name, library, access_token)
     try:
@@ -26,6 +26,7 @@ def get_results(model_name: str, library: str, precision: list, training: list,
             "num_nodes": num_nodes,
             "num_gpus_per_node": num_gpus,
             "training_regime": training,
+            "additional_buffer_factor": additional_buffer_factor
         }
         data = calculate_memory(MODEL, options)
 
@@ -36,7 +37,7 @@ def get_results(model_name: str, library: str, precision: list, training: list,
 with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown(
-            """<img src="https://huggingface.co/spaces/
+            """<img src="https://huggingface.co/spaces/andstor/model-memory-usage/resolve/main/measure_model_size.png" style="float: left;" width="250" height="250"><h1>🤗 DeepSpeed Model Memory Calculator</h1>
 
     This tool will help you calculate how much vRAM is needed to train and perform big model inference
     on a model hosted on the 🤗 Hugging Face Hub. The minimum recommended vRAM needed for a model
@@ -74,16 +75,21 @@ with gr.Blocks() as demo:
                     label="Training Paradigm",
                 )
                 access_token = gr.Textbox(label="API Token", placeholder="Optional (for gated models)")
-
-
+                num_gpus = gr.Number(label="GPUs per node", value=4, minimum=1, step=1)
+                num_nodes = gr.Number(label="Nodes", value=1, minimum=1, step=1)
+            with gr.Column(variant="panel"):
+                with gr.Row(equal_height=True):
+
                 zero_stage = gr.Radio(["Stage 0", "Stage 1", "Stage 2", "Stage 3"], label="ZeRO Stage", value="Stage 3", type="index")
                 zero_description = gr.CheckboxGroup(["Optimizer state", "Gradients", "Parameters"], label="Partitioning", value=["Optimizer state", "Gradients", "Parameters"], interactive=False)
-
-
-
-
-
-
+
+                with gr.Row(equal_height=True):
+                    #with gr.Column():
+                    offloading = gr.CheckboxGroup(["Optimizer", "Parameters"], label="ZeRO-Offload", info="Offloading data and compute to CPU", value=["Optimizer", "Parameters"])
+                    zero_init = gr.CheckboxGroup(["zero.Init"], value=["zero.Init"], label="Initialization")
+
+                    #with gr.Column():
+                    additional_buffer_factor = gr.Number(label="Additional Buffer Factor", value=1.5, minimum=1, step=0.1)
         with gr.Row():
             btn = gr.Button("Calculate Memory Usage")
             post_to_hub = gr.Button(
@@ -135,7 +141,7 @@ with gr.Blocks() as demo:
 
     btn.click(
         get_results,
-        inputs=[inp, library, precision, training, access_token, zero_stage, num_nodes, num_gpus, offloading, zero_init],
+        inputs=[inp, library, precision, training, access_token, zero_stage, num_nodes, num_gpus, offloading, zero_init, additional_buffer_factor],
        outputs=[out_text, out, post_to_hub],
    )
 
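For reference, a minimal sketch (not part of the commit) of the options dict that get_results now assembles for calculate_memory. The key names come from this diff and from their use in src/model_utils.py; the literal values are illustrative, and the mapping of the "Optimizer"/"Parameters" offloading checkboxes onto the cpu_offload/cpu_offload_params booleans is my inference, not shown in the hunks above.

    # Sketch only: option keys as consumed by calculate_memory in model_utils.py.
    options = {
        "num_nodes": 1,
        "num_gpus_per_node": 4,
        "training_regime": "Mixed precision",
        "zero_stage": 3,                      # "Stage 3" radio, type="index"
        "cpu_offload": True,                  # assumed: "Optimizer" checked in ZeRO-Offload
        "cpu_offload_params": True,           # assumed: "Parameters" checked in ZeRO-Offload
        "zero_init": True,                    # "zero.Init" checkbox
        "additional_buffer_factor": 1.5,      # new input added by this commit
    }
    data = calculate_memory(MODEL, options)   # MODEL as loaded by get_model(...)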
src/model_utils.py CHANGED
@@ -10,8 +10,8 @@ from parallelism_utils import estimate_zero1_model_states_mem_needs, estimate_ze
 
 
 DTYPE_MODIFIER = {"float32": 1, "float16/bfloat16": 2, "int8": 4, "int4": 8}
-
-
+PRECISION = {"Mixed precision": "mixed", "Single precision": "single"}
+DTYPE = {"float32": torch.float32, "float16/bfloat16": torch.float16}
 
 
 def extract_from_url(name: str):
@@ -93,25 +93,25 @@ def calculate_memory(model: torch.nn.Module, options: dict):
 
         dtype_largest_layer = convert_bytes(dtype_largest_layer)
 
-
-
+        precision = PRECISION[options["training_regime"]]
+        model_dtype = DTYPE[dtype]
 
         if options["zero_stage"] == 0:
            cpu_mem = dtype_total_size * 4
            gpu_mem = cpu_mem
        elif options["zero_stage"] == 1:
-            cpu_mem, gpu_mem = estimate_zero1_model_states_mem_needs(total_params, options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"],
+            cpu_mem, gpu_mem = estimate_zero1_model_states_mem_needs(total_params, options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], options["additional_buffer_factor"], precision, model_dtype)
        elif options["zero_stage"] == 2:
-            cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs(total_params, options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"],
+            cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs(total_params, options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], options["additional_buffer_factor"], precision, model_dtype)
        elif options["zero_stage"] == 3:
-            cpu_mem, gpu_mem, largest_layer_memory = estimate_zero3_model_states_mem_needs(total_params, largest_layer[0], options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], options["cpu_offload_params"], options["zero_init"],
+            cpu_mem, gpu_mem, largest_layer_memory = estimate_zero3_model_states_mem_needs(total_params, largest_layer[0], options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], options["cpu_offload_params"], options["zero_init"], options["additional_buffer_factor"], precision, model_dtype)
        data.append(
            {
-                "dtype": dtype,
+                "Model dtype": dtype,
                "Largest Layer or Residual Group": dtype_largest_layer,
-                "
+                "Model Size": convert_bytes(dtype_total_size),
                "per CPU": convert_bytes(cpu_mem),
-                "per GPU
+                "per GPU": convert_bytes(gpu_mem),
            }
        )
 
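A worked example of the new dispatch, using hypothetical sizes of my choosing: under "Mixed precision", the float16/bfloat16 row maps to precision="mixed" (2 bytes of live weights) and model_dtype=torch.float16, so a 7B-parameter model on one 8-GPU node without offload comes out to roughly 28 GB per GPU under the ZeRO-2 formulas in this diff.

    # Illustrative numbers only; PRECISION/DTYPE and the estimator are from this commit
    # (src/model_utils.py and src/parallelism_utils.py).
    import torch

    precision = PRECISION["Mixed precision"]   # -> "mixed",        precision_fac = 2
    model_dtype = DTYPE["float16/bfloat16"]    # -> torch.float16,  params_fac = 2

    cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs(
        total_params=7_000_000_000,
        num_gpus_per_node=8,
        num_nodes=1,
        cpu_offload=False,
        additional_buffer_factor=1.5,
        precision=precision,
        model_dtype=model_dtype,
    )
    # gpu_mem = 2*7e9 + (4+8+4)*7e9/8 = 2.8e10 bytes  (~28 GB per GPU)
    # cpu_mem = 2*7e9 * 8 * 1.5       = 1.68e11 bytes (~168 GB)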
src/parallelism_utils.py CHANGED
@@ -1,65 +1,90 @@
-
+import torch
+
+def get_precision_fac(precision: str):
+    if precision == "mixed":
+        return 2
+    elif precision == "single":
+        return 4
+    else:
+        raise ValueError("Precision must be either 'mixed' or 'single'")
+
+
+def get_params_fac(model_dtype: torch.dtype):
+    if model_dtype == torch.float16:
+        return 2
+    elif model_dtype == torch.float32:
+        return 4
+    else:
+        raise ValueError("Model dtype must be either torch.float16 or torch.float32")
+
+
+
+####################### Zero Redundancy Optimizer (ZeRO) #######################
+
+VARIANCE_FACTOR = 4
+MOMENTUM_FACTOR = 4
+OPTIMIZER_FACTOR = VARIANCE_FACTOR + MOMENTUM_FACTOR  # Adam optimizer
+FP32_GRADS_FACTOR = 4
+FP32_PARAM_FACTOR = 4
+MASTER_PARAMS_FACTOR = FP32_PARAM_FACTOR
+
+
+# TODO: check if params_fac is needed during full fp32 training.
+# Normally, mixed precision training results in 1.5x memory compared to FP32.
+# Currently, we are assuming 2x memory for FP32, as deepspeed's ZeRO-2 is optimized for FP16 training.
+
+
 def estimate_zero1_model_states_mem_needs(total_params,
                                           num_gpus_per_node=1,
                                           num_nodes=1,
                                           cpu_offload=True,
                                           additional_buffer_factor=1.5,
-
-
+                                          precision="mixed",
+                                          model_dtype=torch.float16,
                                           ):
 
-    # TODO: check if params_fac is needed during full fp32 training.
-    # Normally, mixed precision training results in 1.5x memory compared to FP32.
-    # Currently, we are assuming 2x memory for FP32, as deepspeed's ZeRO-2 is optimized for FP16 training.
-
-    total_gpus = num_nodes * num_gpus_per_node
-
-    master_params_fac = 4
-    variance_fac = 4
-    momentum_fac = 4
-    grads_fac = 4
-    optimizer_fac = variance_fac + momentum_fac  # Adam optimizer
-
     total_gpus = num_nodes * num_gpus_per_node
+
+    precision_fac = get_precision_fac(precision)
+    params_fac = get_params_fac(model_dtype)
 
     if cpu_offload:
-        gpu_mem = (precision_fac * total_params) + (
-        cpu_mem = total_params * max(params_fac * total_gpus, (
+        gpu_mem = (precision_fac * total_params)  # + (grads_fac * total_params)
+        cpu_mem = total_params * max(params_fac * total_gpus, (MASTER_PARAMS_FACTOR + OPTIMIZER_FACTOR + FP32_GRADS_FACTOR)) * additional_buffer_factor
     else:
-
+        if precision == "mixed":
+            gpu_mem = (precision_fac * total_params) + (FP32_GRADS_FACTOR * total_params) + int((OPTIMIZER_FACTOR + FP32_PARAM_FACTOR) * total_params / total_gpus)
+        else:
+            gpu_mem = (precision_fac * total_params) + (FP32_GRADS_FACTOR * total_params) + int(OPTIMIZER_FACTOR * total_params / total_gpus)
        cpu_mem = total_params * params_fac * num_gpus_per_node * additional_buffer_factor
 
    return int(cpu_mem), int(gpu_mem)
 
+
 def estimate_zero2_model_states_mem_needs(total_params,
                                           num_gpus_per_node=1,
                                           num_nodes=1,
                                           cpu_offload=True,
                                           additional_buffer_factor=1.5,
-
-
+                                          precision="mixed",
+                                          model_dtype=torch.float16,
                                           ):
-
-    # TODO: check if params_fac is needed during full fp32 training.
-    # Normally, mixed precision training results in 1.5x memory compared to FP32.
-    # Currently, we are assuming 2x memory for FP32, as deepspeed's ZeRO-2 is optimized for FP16 training.
 
     total_gpus = num_nodes * num_gpus_per_node
-
-
-
-    momentum_fac = 4
-    grads_fac = 4
-    optimizer_fac = variance_fac + momentum_fac  # Adam optimizer
-
-    total_gpus = num_nodes * num_gpus_per_node
+
+    precision_fac = get_precision_fac(precision)
+    params_fac = get_params_fac(model_dtype)
 
     if cpu_offload:
-        gpu_mem = precision_fac * total_params
-        cpu_mem = total_params * max(params_fac * total_gpus, (
+        gpu_mem = precision_fac * total_params  # Negligible memory usage for partitioned gradients
+        cpu_mem = total_params * max(params_fac * total_gpus, (MASTER_PARAMS_FACTOR + OPTIMIZER_FACTOR + FP32_GRADS_FACTOR)) * additional_buffer_factor
    else:
-
-
+        if precision == "mixed":
+            gpu_mem = precision_fac * total_params + int((FP32_GRADS_FACTOR + OPTIMIZER_FACTOR + FP32_PARAM_FACTOR) * total_params / total_gpus)
+        else:
+            gpu_mem = precision_fac * total_params + int((FP32_GRADS_FACTOR + OPTIMIZER_FACTOR) * total_params / total_gpus)
+        cpu_mem = params_fac * total_params * num_gpus_per_node * additional_buffer_factor
 
    return int(cpu_mem), int(gpu_mem)
 
@@ -72,43 +97,48 @@ def estimate_zero3_model_states_mem_needs(total_params,
                                           cpu_offload_params=True,
                                           zero_init=True,
                                           additional_buffer_factor=1.5,
-
-
+                                          precision="mixed",
+                                          model_dtype=torch.float16,
                                           ):
 
-    # TODO: check if params_fac is needed during full fp32 training.
-    # Normally, mixed precision training results in 1.5x memory compared to FP32.
-    # Currently, we are assuming 2x memory for FP32, as deepspeed's ZeRO-2 is optimized for FP16 training.
-
     total_gpus = num_nodes * num_gpus_per_node
     gpus_factor = 1 / num_nodes
-    master_params_fac = 4
-    variance_fac = 4
-    momentum_fac = 4
-    grads_fac = 4
-    optimizer_fac = variance_fac + momentum_fac  # Adam optimizer
 
-
+    precision_fac = get_precision_fac(precision)
+    params_fac = get_params_fac(model_dtype)
+    grads_fac = precision_fac
+
+    largest_layer_memory = (grads_fac + precision_fac) * largest_layer_params
 
     if cpu_offload:
        if cpu_offload_params:
            gpu_mem = largest_layer_memory
-
            if zero_init:
-                cpu_mem = total_params * (
+                cpu_mem = total_params * (MASTER_PARAMS_FACTOR + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR + params_fac) * gpus_factor * additional_buffer_factor
            else:
-
-
+                cpu_mem = total_params * max(params_fac * num_gpus_per_node, (MASTER_PARAMS_FACTOR + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR + params_fac) * gpus_factor) * additional_buffer_factor
+
        else:
-            gpu_mem =
+            gpu_mem = max(
+                largest_layer_memory,
+                int((precision_fac) * total_params / total_gpus)  # No need for gradients: ZeRO-Offload can transfer these gradients for each parameter individually or in small groups to the CPU memory immediately after they are computed
+            )
 
            if zero_init:
-                cpu_mem = total_params * (
+                cpu_mem = total_params * (MASTER_PARAMS_FACTOR + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR) * gpus_factor * additional_buffer_factor
            else:
-                cpu_mem = total_params * max(params_fac * num_gpus_per_node, (
+                cpu_mem = total_params * max(params_fac * num_gpus_per_node, (MASTER_PARAMS_FACTOR + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR) * gpus_factor) * additional_buffer_factor
    else:
-
-
+        if precision == "mixed":
+            gpu_mem = max(
+                int((precision_fac + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR + FP32_PARAM_FACTOR) * largest_layer_params),
+                int((precision_fac + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR + FP32_PARAM_FACTOR) * total_params / total_gpus)
+            )
+        else:
+            gpu_mem = max(
+                int((precision_fac + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR) * largest_layer_params),
+                int((precision_fac + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR) * total_params / total_gpus)
+            )
 
        if zero_init:
            cpu_mem = largest_layer_params * params_fac * num_gpus_per_node * additional_buffer_factor