Add DeepSpeed ZeRO calculations
- src/app.py +75 -5
- src/model_utils.py +28 -11
- src/parallelism_utils.py +120 -0
src/app.py
CHANGED
@@ -9,15 +9,27 @@ from huggingface_hub.utils import HfHubHTTPError
 MODEL = None
 
 
-def get_results(model_name: str, library: str, options: list, access_token: str):
+def get_results(model_name: str, library: str, precision: list, training: list, access_token: str, zero_stage: int, num_nodes: int, num_gpus: int, offloading: list, zero_init: list):
     global MODEL
     MODEL = get_model(model_name, library, access_token)
     try:
         has_discussion = check_for_discussion(model_name)
     except HfHubHTTPError:
         has_discussion = True
-    title = f"## Memory usage for '{model_name}'"
+
+    options = {
+        "precision": precision,
+        "zero_stage": zero_stage,
+        "cpu_offload": True if "Optimizer" in offloading else False,
+        "cpu_offload_params": True if "Parameters" in offloading else False,
+        "zero_init": True if "zero.Init" in zero_init else False,
+        "num_nodes": num_nodes,
+        "num_gpus_per_node": num_gpus,
+        "training_regime": training,
+    }
     data = calculate_memory(MODEL, options)
+
+    title = f"## Memory usage for '{model_name}'"
     return [title, gr.update(visible=True, value=pd.DataFrame(data)), gr.update(visible=not has_discussion)]
 
 
@@ -51,21 +63,79 @@ with gr.Blocks() as demo:
     inp = gr.Textbox(label="Model Name or URL", value="bert-base-cased")
     with gr.Row():
         library = gr.Radio(["auto", "transformers", "timm"], label="Library", value="auto")
-        options = gr.CheckboxGroup(
-            ["float32", "float16/bfloat16", "int8", "int4"],
+        precision = gr.CheckboxGroup(
+            ["float32", "float16/bfloat16"],
             value="float32",
             label="Model Precision",
         )
+        training = gr.Radio(
+            ["Mixed precision", "Single precision"],
+            value="Mixed precision",
+            label="Training Paradigm",
+        )
         access_token = gr.Textbox(label="API Token", placeholder="Optional (for gated models)")
+    with gr.Row():
+        with gr.Column():
+            zero_stage = gr.Radio(["Stage 0", "Stage 1", "Stage 2", "Stage 3"], label="ZeRO Stage", value="Stage 3", type="index")
+            zero_description = gr.CheckboxGroup(["Optimizer state", "Gradients", "Parameters"], label="Partitioning", value=["Optimizer state", "Gradients", "Parameters"], interactive=False)
+            with gr.Row():
+                offloading = gr.CheckboxGroup(["Optimizer", "Parameters"], label="ZeRO-Offload", info="Offloading data and compute to CPU", value=["Optimizer", "Parameters"])
+                zero_init = gr.CheckboxGroup(["zero.Init"], value=["zero.Init"], label="Initialization")
+
+            num_gpus = gr.Number(label="GPUs per node", value=1, min=1, step=1)
+            num_nodes = gr.Number(label="Nodes", value=1, min=1, step=1)
     with gr.Row():
         btn = gr.Button("Calculate Memory Usage")
         post_to_hub = gr.Button(
             value="Report results in this model repo's discussions!\n(Will open in a new tab)", visible=False
         )
 
+    def change_zero_settings(evt: gr.SelectData):  # SelectData is a subclass of EventData
+        if evt.index == 0:
+            return [gr.update(visible=False), gr.update(visible=False)]
+        if evt.index == 1 or evt.index == 2:
+            return [gr.update(choices=["Optimizer"], visible=True), gr.update(visible=False)]
+        if evt.index == 3:
+            return [gr.update(choices=["Optimizer", "Parameters"], visible=True), gr.update(visible=True)]
+
+    def change_zero_description(evt: gr.SelectData):  # SelectData is a subclass of EventData
+        if evt.index == 0:
+            return gr.update(value=None)
+        if evt.index == 1:
+            return gr.update(value=["Optimizer state"])
+        if evt.index == 2:
+            return gr.update(value=["Optimizer state", "Gradients"])
+        if evt.index == 3:
+            return gr.update(value=["Optimizer state", "Gradients", "Parameters"])
+
+    def change_offloading(evt: gr.SelectData, zero_stage):  # SelectData is a subclass of EventData
+        if evt.value == "Optimizer" and not evt.selected:
+            return gr.CheckboxGroup.update(choices=["Optimizer"], value=[])
+
+        if evt.value == "Optimizer" and evt.selected:
+            if zero_stage in [1, 2]:
+                return gr.CheckboxGroup.update(choices=["Optimizer"], value=["Optimizer"])
+            elif zero_stage == 3:
+                return gr.CheckboxGroup.update(choices=["Optimizer", "Parameters"], value=["Optimizer"])
+
+        if evt.value == "Parameters" and not evt.selected:
+            return gr.CheckboxGroup.update(value=["Optimizer"])
+
+        if evt.value == "Parameters" and evt.selected:
+            return gr.CheckboxGroup.update(value=["Optimizer", "Parameters"])
+
+    zero_stage.select(change_zero_settings, None, [offloading, zero_init])
+    zero_stage.select(change_zero_description, None, zero_description)
+    offloading.select(change_offloading, zero_stage, offloading)
+
     btn.click(
         get_results,
-        inputs=[inp, library, options, access_token],
+        inputs=[inp, library, precision, training, access_token, zero_stage, num_nodes, num_gpus, offloading, zero_init],
         outputs=[out_text, out, post_to_hub],
     )
 
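Since `zero_stage` is a `gr.Radio` with `type="index"`, `get_results` receives the stage as an integer 0-3, and each `gr.CheckboxGroup` arrives as a list of selected labels. A minimal sketch of the `options` dict the handler builds for the default UI state (illustrative, not part of this diff):

```python
# Illustrative sketch: the options dict get_results builds for the defaults
# above. "Stage 3" maps to 3 via type="index" on the Radio component.
offloading = ["Optimizer", "Parameters"]  # default ZeRO-Offload selections
zero_init = ["zero.Init"]                 # default Initialization selection

options = {
    "precision": ["float32"],
    "zero_stage": 3,
    "cpu_offload": "Optimizer" in offloading,          # True
    "cpu_offload_params": "Parameters" in offloading,  # True
    "zero_init": "zero.Init" in zero_init,             # True
    "num_nodes": 1,
    "num_gpus_per_node": 1,
    "training_regime": "Mixed precision",
}
```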
src/model_utils.py
CHANGED
@@ -6,9 +6,12 @@ import torch
 from accelerate.commands.estimate import check_has_model, create_empty_model
 from accelerate.utils import calculate_maximum_sizes, convert_bytes
 from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
+from parallelism_utils import estimate_zero1_model_states_mem_needs, estimate_zero2_model_states_mem_needs, estimate_zero3_model_states_mem_needs
 
 
 DTYPE_MODIFIER = {"float32": 1, "float16/bfloat16": 2, "int8": 4, "int4": 8}
+PRECISION_FACTOR = {"Mixed precision": 2, "Single precision": 4}
+DTYPE_FACTOR = {"float32": 4, "float16/bfloat16": 2}
 
 
 def extract_from_url(name: str):
@@ -74,12 +77,13 @@ def get_model(model_name: str, library: str, access_token: str):
     return model
 
 
-def calculate_memory(model: torch.nn.Module, options: list):
+def calculate_memory(model: torch.nn.Module, options: dict):
     "Calculates the memory usage for a model init on `meta` device"
     total_size, largest_layer = calculate_maximum_sizes(model)
+    total_params = model.num_parameters()
 
     data = []
-    for dtype in options:
+    for dtype in options["precision"]:
         dtype_total_size = total_size
         dtype_largest_layer = largest_layer[0]
 
@@ -87,15 +91,28 @@ def calculate_memory(model: torch.nn.Module, options: list):
             dtype_total_size /= modifier
             dtype_largest_layer /= modifier
 
-        dtype_training_size = convert_bytes(dtype_total_size * 4)
-        dtype_total_size = convert_bytes(dtype_total_size)
         dtype_largest_layer = convert_bytes(dtype_largest_layer)
+
+        precision_fac = PRECISION_FACTOR[options["training_regime"]]
+        params_fac = DTYPE_FACTOR[dtype]
+
+        if options["zero_stage"] == 0:
+            cpu_mem = dtype_total_size * 4
+            gpu_mem = cpu_mem
+        elif options["zero_stage"] == 1:
+            cpu_mem, gpu_mem = estimate_zero1_model_states_mem_needs(total_params, options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], precision_fac=precision_fac, params_fac=params_fac)
+        elif options["zero_stage"] == 2:
+            cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs(total_params, options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], precision_fac=precision_fac, params_fac=params_fac)
+        elif options["zero_stage"] == 3:
+            cpu_mem, gpu_mem, largest_layer_memory = estimate_zero3_model_states_mem_needs(total_params, largest_layer[0], options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], options["cpu_offload_params"], options["zero_init"], precision_fac=precision_fac, params_fac=params_fac)
         data.append(
-            {
-                "dtype": dtype,
-                "Largest Layer or Residual Group": dtype_largest_layer,
-                "Total Size": dtype_total_size,
-                "Training using Adam": dtype_training_size,
-            }
-        )
+            {
+                "dtype": dtype,
+                "Largest Layer or Residual Group": dtype_largest_layer,
+                "Total Size": convert_bytes(dtype_total_size),
+                "per CPU": convert_bytes(cpu_mem),
+                "per GPU (Adam)": convert_bytes(gpu_mem),
+            }
+        )
+
     return data
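For a sense of the numbers `calculate_memory` now reports, the Stage 2 path with optimizer offload works out as follows for a hypothetical 110M-parameter float32 model trained with mixed precision on a single GPU (the parameter count is illustrative; the arithmetic follows `estimate_zero2_model_states_mem_needs` below):

```python
# Worked example (illustrative): ZeRO Stage 2 + optimizer offload, 1 node x 1 GPU,
# mixed precision (precision_fac = 2) and float32 weights (params_fac = 4).
total_params = 110_000_000

# GPU holds only the fp16 parameters:
gpu_mem = 2 * total_params                             # 0.22 GB

# CPU holds fp32 master params (4B) + Adam momentum/variance (8B) + fp32 grads (4B)
# per parameter, padded by the 1.5x additional_buffer_factor:
cpu_mem = total_params * max(4 * 1, 4 + 8 + 4) * 1.5   # 2.64 GB

print(f"per GPU: {gpu_mem / 1e9:.2f} GB, per CPU: {cpu_mem / 1e9:.2f} GB")
```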
src/parallelism_utils.py
ADDED
@@ -0,0 +1,120 @@
+# Zero Redundancy Optimizer (ZeRO)
+def estimate_zero1_model_states_mem_needs(total_params,
+                                          num_gpus_per_node=1,
+                                          num_nodes=1,
+                                          cpu_offload=True,
+                                          additional_buffer_factor=1.5,
+                                          precision_fac=2,  # half precision
+                                          params_fac=4,  # 4 bytes per float32 model parameter
+                                          ):
+
+    # TODO: check if params_fac is needed during full fp32 training.
+    # Normally, mixed precision training results in 1.5x memory compared to FP32.
+    # Currently, we are assuming 2x memory for FP32, as DeepSpeed's ZeRO is optimized for FP16 training.
+
+    total_gpus = num_nodes * num_gpus_per_node
+
+    # Per-parameter byte counts for the fp32 optimizer state
+    master_params_fac = 4
+    variance_fac = 4
+    momentum_fac = 4
+    grads_fac = 4
+    optimizer_fac = variance_fac + momentum_fac  # Adam optimizer
+
+    if cpu_offload:
+        # fp16 params + fp16 grads stay on GPU; optimizer state lives on CPU
+        gpu_mem = (precision_fac * total_params) + (precision_fac * total_params)
+        cpu_mem = total_params * max(params_fac * total_gpus, (master_params_fac + optimizer_fac + grads_fac)) * additional_buffer_factor
+    else:
+        # fp16 params + fp16 grads replicated on every GPU; optimizer state sharded across GPUs
+        gpu_mem = (precision_fac * total_params) + (precision_fac * total_params) + int((precision_fac + optimizer_fac + master_params_fac + precision_fac) * total_params / total_gpus)
+        cpu_mem = total_params * params_fac * num_gpus_per_node * additional_buffer_factor
+
+    return int(cpu_mem), int(gpu_mem)
+
+
+def estimate_zero2_model_states_mem_needs(total_params,
+                                          num_gpus_per_node=1,
+                                          num_nodes=1,
+                                          cpu_offload=True,
+                                          additional_buffer_factor=1.5,
+                                          precision_fac=2,  # half precision
+                                          params_fac=4,  # 4 bytes per float32 model parameter
+                                          ):
+
+    # TODO: check if params_fac is needed during full fp32 training.
+    # Normally, mixed precision training results in 1.5x memory compared to FP32.
+    # Currently, we are assuming 2x memory for FP32, as DeepSpeed's ZeRO-2 is optimized for FP16 training.
+
+    total_gpus = num_nodes * num_gpus_per_node
+
+    master_params_fac = 4
+    variance_fac = 4
+    momentum_fac = 4
+    grads_fac = 4
+    optimizer_fac = variance_fac + momentum_fac  # Adam optimizer
+
+    if cpu_offload:
+        # Only the fp16 parameters remain on GPU; grads + optimizer state are offloaded
+        gpu_mem = precision_fac * total_params
+        cpu_mem = total_params * max(params_fac * total_gpus, (master_params_fac + optimizer_fac + grads_fac)) * additional_buffer_factor
+    else:
+        # fp16 params on every GPU; grads and optimizer state sharded across GPUs
+        gpu_mem = precision_fac * total_params + int((precision_fac + grads_fac + optimizer_fac + master_params_fac + precision_fac) * total_params / total_gpus)
+        cpu_mem = total_params * params_fac * num_gpus_per_node * additional_buffer_factor
+
+    return int(cpu_mem), int(gpu_mem)
+
+
+def estimate_zero3_model_states_mem_needs(total_params,
+                                          largest_layer_params,
+                                          num_gpus_per_node=1,
+                                          num_nodes=1,
+                                          cpu_offload=True,
+                                          cpu_offload_params=True,
+                                          zero_init=True,
+                                          additional_buffer_factor=1.5,
+                                          precision_fac=2,  # half precision
+                                          params_fac=4,  # 4 bytes per float32 model parameter
+                                          ):
+
+    # TODO: check if params_fac is needed during full fp32 training.
+    # Normally, mixed precision training results in 1.5x memory compared to FP32.
+    # Currently, we are assuming 2x memory for FP32, as DeepSpeed's ZeRO-3 is optimized for FP16 training.
+
+    total_gpus = num_nodes * num_gpus_per_node
+    gpus_factor = 1 / num_nodes
+    master_params_fac = 4
+    variance_fac = 4
+    momentum_fac = 4
+    grads_fac = 4
+    optimizer_fac = variance_fac + momentum_fac  # Adam optimizer
+
+    largest_layer_memory = (2 * precision_fac) * largest_layer_params  # params + grads = (2 * modifier)
+
+    if cpu_offload:
+        if cpu_offload_params:
+            gpu_mem = largest_layer_memory
+
+            if zero_init:
+                cpu_mem = total_params * (master_params_fac + grads_fac + optimizer_fac + params_fac) * gpus_factor * additional_buffer_factor
+            else:
+                cpu_mem = total_params * max(params_fac * num_gpus_per_node, (master_params_fac + grads_fac + optimizer_fac + params_fac) * gpus_factor) * additional_buffer_factor
+        else:
+            gpu_mem = largest_layer_memory + int(precision_fac * total_params / total_gpus)
+
+            if zero_init:
+                cpu_mem = total_params * (master_params_fac + grads_fac + optimizer_fac) * gpus_factor * additional_buffer_factor
+            else:
+                cpu_mem = total_params * max(params_fac * num_gpus_per_node, (master_params_fac + grads_fac + optimizer_fac) * gpus_factor) * additional_buffer_factor
+    else:
+        # 2b for fp16 params, 4b master params, 4b grads, 4b momentum and 4b variance per parameter = 18
+        gpu_mem = largest_layer_memory + int((master_params_fac + grads_fac + optimizer_fac + precision_fac) * total_params / total_gpus)
+
+        if zero_init:
+            cpu_mem = largest_layer_params * params_fac * num_gpus_per_node * additional_buffer_factor
+        else:
+            cpu_mem = total_params * params_fac * num_gpus_per_node * additional_buffer_factor
+
+    return int(cpu_mem), int(gpu_mem), largest_layer_memory
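A quick way to sanity-check the new module is to call the three estimators directly and confirm that per-GPU memory shrinks as the stage partitions more state. The 1.3B and 50M figures below are illustrative, not from the commit:

```python
# Illustrative usage: compare the three ZeRO stages for a hypothetical
# 1.3B-parameter model on 2 nodes x 8 GPUs, without offloading.
from parallelism_utils import (
    estimate_zero1_model_states_mem_needs,
    estimate_zero2_model_states_mem_needs,
    estimate_zero3_model_states_mem_needs,
)

GB = 1024 ** 3
total_params = 1_300_000_000
largest_layer_params = 50_000_000  # assumed largest layer / residual group

cpu1, gpu1 = estimate_zero1_model_states_mem_needs(
    total_params, num_gpus_per_node=8, num_nodes=2, cpu_offload=False)
cpu2, gpu2 = estimate_zero2_model_states_mem_needs(
    total_params, num_gpus_per_node=8, num_nodes=2, cpu_offload=False)
cpu3, gpu3, _ = estimate_zero3_model_states_mem_needs(
    total_params, largest_layer_params, num_gpus_per_node=8, num_nodes=2,
    cpu_offload=False, cpu_offload_params=False, zero_init=False)

# Stage 1 shards optimizer state; Stage 2 also shards gradients;
# Stage 3 also shards the parameters themselves.
print(f"ZeRO-1: {gpu1 / GB:.2f} GB/GPU")
print(f"ZeRO-2: {gpu2 / GB:.2f} GB/GPU")
print(f"ZeRO-3: {gpu3 / GB:.2f} GB/GPU")
```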