andstor committed
Commit 674c962
Parent: 562c3cb

Add DeepSpeed ZeRO calculations

Files changed (3)
  1. src/app.py +75 -5
  2. src/model_utils.py +28 -11
  3. src/parallelism_utils.py +120 -0
src/app.py CHANGED
@@ -9,15 +9,27 @@ from huggingface_hub.utils import HfHubHTTPError
 MODEL = None
 
 
-def get_results(model_name: str, library: str, options: list, access_token: str):
+def get_results(model_name: str, library: str, precision: list, training: list, access_token: str, zero_stage: int, num_nodes: int, num_gpus: int, offloading: list, zero_init: list):
     global MODEL
     MODEL = get_model(model_name, library, access_token)
     try:
         has_discussion = check_for_discussion(model_name)
     except HfHubHTTPError:
         has_discussion = True
-    title = f"## Memory usage for '{model_name}'"
+
+    options = {
+        "precision": precision,
+        "zero_stage": zero_stage,
+        "cpu_offload": True if "Optimizer" in offloading else False,
+        "cpu_offload_params": True if "Parameters" in offloading else False,
+        "zero_init": True if "zero.Init" in zero_init else False,
+        "num_nodes": num_nodes,
+        "num_gpus_per_node": num_gpus,
+        "training_regime": training,
+    }
     data = calculate_memory(MODEL, options)
+
+    title = f"## Memory usage for '{model_name}'"
     return [title, gr.update(visible=True, value=pd.DataFrame(data)), gr.update(visible=not has_discussion)]
 
 
@@ -51,21 +63,79 @@ with gr.Blocks() as demo:
     inp = gr.Textbox(label="Model Name or URL", value="bert-base-cased")
     with gr.Row():
         library = gr.Radio(["auto", "transformers", "timm"], label="Library", value="auto")
-        options = gr.CheckboxGroup(
-            ["float32", "float16/bfloat16", "int8", "int4"],
+        precision = gr.CheckboxGroup(
+            ["float32", "float16/bfloat16"],
             value="float32",
             label="Model Precision",
         )
+        training = gr.Radio(
+            ["Mixed precision", "Single precision"],
+            value="Mixed precision",
+            label="Training Paradigm",
+        )
     access_token = gr.Textbox(label="API Token", placeholder="Optional (for gated models)")
+    with gr.Row():
+        with gr.Column():
+            zero_stage = gr.Radio(["Stage 0", "Stage 1", "Stage 2", "Stage 3"], label="ZeRO Stage", value="Stage 3", type="index")
+            zero_description = gr.CheckboxGroup(["Optimizer state", "Gradients", "Parameters"], label="Partitioning", value=["Optimizer state", "Gradients", "Parameters"], interactive=False)
+        with gr.Row():
+            offloading = gr.CheckboxGroup(["Optimizer", "Parameters"], label="ZeRO-Offload", info="Offloading data and compute to CPU", value=["Optimizer", "Parameters"])
+            zero_init = gr.CheckboxGroup(["zero.Init"], value=True, label="Initialization")
+
+    num_gpus = gr.Number(label="GPUs per node", value=1, min=1, step=1)
+    num_nodes = gr.Number(label="Nodes", value=1, min=1, step=1)
     with gr.Row():
         btn = gr.Button("Calculate Memory Usage")
         post_to_hub = gr.Button(
             value="Report results in this model repo's discussions!\n(Will open in a new tab)", visible=False
         )
 
+    def change_zero_settings(evt: gr.SelectData):  # SelectData is a subclass of EventData
+        if evt.index == 0:
+            return [gr.update(visible=False), gr.update(visible=False)]
+        if evt.index == 1 or evt.index == 2:
+            return [gr.update(choices=["Optimizer"], visible=True), gr.update(visible=False)]
+        if evt.index == 3:
+            return [gr.update(choices=["Optimizer", "Parameters"], visible=True), gr.update(visible=True)]
+
+    def change_zero_description(evt: gr.SelectData):  # SelectData is a subclass of EventData
+        if evt.index == 0:
+            return gr.update(value=None)
+        if evt.index == 1:
+            return gr.update(value=["Optimizer state"])
+        if evt.index == 2:
+            return gr.update(value=["Optimizer state", "Gradients"])
+        if evt.index == 3:
+            return gr.update(value=["Optimizer state", "Gradients", "Parameters"])
+
+    def change_offloading(evt: gr.SelectData, zero_stage):  # SelectData is a subclass of EventData
+
+        if evt.value == "Optimizer" and evt.selected == False:
+            return gr.CheckboxGroup.update(choices=["Optimizer"], value=[])
+
+        if evt.value == "Optimizer" and evt.selected == True:
+            if zero_stage in [1, 2]:
+                return gr.CheckboxGroup.update(choices=["Optimizer"], value=["Optimizer"])
+            elif zero_stage == 3:
+                return gr.CheckboxGroup.update(choices=["Optimizer", "Parameters"], value=["Optimizer"])
+
+        if evt.value == "Parameters" and evt.selected == False:
+            return gr.CheckboxGroup.update(value=["Optimizer"])
+
+        if evt.value == "Parameters" and evt.selected == True:
+            return gr.CheckboxGroup.update(value=["Optimizer", "Parameters"])
+
+
+    zero_stage.select(change_zero_settings, None, [offloading, zero_init])
+    zero_stage.select(change_zero_description, None, zero_description)
+    offloading.select(change_offloading, zero_stage, offloading)
+
+
     btn.click(
         get_results,
-        inputs=[inp, library, options, access_token],
+        inputs=[inp, library, precision, training, access_token, zero_stage, num_nodes, num_gpus, offloading, zero_init],
         outputs=[out_text, out, post_to_hub],
     )
 
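For reference, with the commit's default UI selections (ZeRO Stage 3, both ZeRO-Offload boxes checked, zero.Init enabled, one node with one GPU, mixed-precision training, float32 weights), get_results assembles an options dict along these lines before calling calculate_memory; the values are a sketch derived from the defaults above, not output of the app:

options = {
    "precision": ["float32"],              # from the "Model Precision" checkbox group
    "zero_stage": 3,                       # index of "Stage 3" (the Radio uses type="index")
    "cpu_offload": True,                   # "Optimizer" selected under ZeRO-Offload
    "cpu_offload_params": True,            # "Parameters" selected under ZeRO-Offload
    "zero_init": True,                     # "zero.Init" selected
    "num_nodes": 1,
    "num_gpus_per_node": 1,
    "training_regime": "Mixed precision",
}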
 
src/model_utils.py CHANGED
@@ -6,9 +6,12 @@ import torch
 from accelerate.commands.estimate import check_has_model, create_empty_model
 from accelerate.utils import calculate_maximum_sizes, convert_bytes
 from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
+from parallelism_utils import estimate_zero1_model_states_mem_needs, estimate_zero2_model_states_mem_needs, estimate_zero3_model_states_mem_needs
 
 
 DTYPE_MODIFIER = {"float32": 1, "float16/bfloat16": 2, "int8": 4, "int4": 8}
+PRECISION_FACTOR = {"Mixed precision": 2, "Single precision": 4}
+DTYPE_FACTOR = {"float32": 4, "float16/bfloat16": 2}
 
 
 def extract_from_url(name: str):
@@ -74,12 +77,13 @@ def get_model(model_name: str, library: str, access_token: str):
     return model
 
 
-def calculate_memory(model: torch.nn.Module, options: list):
+def calculate_memory(model: torch.nn.Module, options: dict):
     "Calculates the memory usage for a model init on `meta` device"
     total_size, largest_layer = calculate_maximum_sizes(model)
+    total_params = model.num_parameters()
 
     data = []
-    for dtype in options:
+    for dtype in options["precision"]:
         dtype_total_size = total_size
         dtype_largest_layer = largest_layer[0]
 
@@ -87,15 +91,28 @@ def calculate_memory(model: torch.nn.Module, options: list):
         dtype_total_size /= modifier
         dtype_largest_layer /= modifier
 
-        dtype_training_size = convert_bytes(dtype_total_size * 4)
-        dtype_total_size = convert_bytes(dtype_total_size)
         dtype_largest_layer = convert_bytes(dtype_largest_layer)
+
+        precision_fac = PRECISION_FACTOR[options["training_regime"]]
+        params_fac = DTYPE_FACTOR[dtype]
+
+        if options["zero_stage"] == 0:
+            cpu_mem = dtype_total_size * 4
+            gpu_mem = cpu_mem
+        elif options["zero_stage"] == 1:
+            cpu_mem, gpu_mem = estimate_zero1_model_states_mem_needs(total_params, options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], precision_fac=precision_fac, params_fac=params_fac)
+        elif options["zero_stage"] == 2:
+            cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs(total_params, options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], precision_fac=precision_fac, params_fac=params_fac)
+        elif options["zero_stage"] == 3:
+            cpu_mem, gpu_mem, largest_layer_memory = estimate_zero3_model_states_mem_needs(total_params, largest_layer[0], options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], options["cpu_offload_params"], options["zero_init"], precision_fac=precision_fac, params_fac=params_fac)
         data.append(
-            {
-                "dtype": dtype,
-                "Largest Layer or Residual Group": dtype_largest_layer,
-                "Total Size": dtype_total_size,
-                "Training using Adam": dtype_training_size,
-            }
-        )
+            {
+                "dtype": dtype,
+                "Largest Layer or Residual Group": dtype_largest_layer,
+                "Total Size": convert_bytes(dtype_total_size),
+                "per CPU": convert_bytes(cpu_mem),
+                "per GPU (Adam)": convert_bytes(gpu_mem),
+            }
+        )
+
     return data
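As a sanity check on the Stage 0 branch above: with no ZeRO partitioning, the estimate falls back to the earlier "times four" training heuristic, commonly read as weights, gradients, and Adam momentum/variance held in the selected dtype, and the same figure is reported per CPU and per GPU. A minimal sketch with a hypothetical 110M-parameter float32 model (numbers are illustrative only):

total_params = 110_000_000             # hypothetical BERT-base-sized model
dtype_total_size = total_params * 4    # float32: 4 bytes/param = 440_000_000 bytes

# Stage 0 branch of calculate_memory
cpu_mem = dtype_total_size * 4         # 1_760_000_000 bytes, roughly 1.76 GB
gpu_mem = cpu_mem                      # same estimate reported per GPU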
src/parallelism_utils.py ADDED
@@ -0,0 +1,120 @@
+# Zero Redundancy Optimizer (ZeRO)
+def estimate_zero1_model_states_mem_needs(total_params,
+                                          num_gpus_per_node=1,
+                                          num_nodes=1,
+                                          cpu_offload=True,
+                                          additional_buffer_factor=1.5,
+                                          precision_fac=2,  # half precision
+                                          params_fac=4  # 4 bytes per float32 model parameter type
+                                          ):
+
+    # TODO: check if params_fac is needed during full fp32 training.
+    # Normally, mixed precision training results in 1.5x memory compared to FP32.
+    # Currently, we are assuming 2x memory for FP32, as deepspeed's ZeRO-2 is optimized for FP16 training.
+
+    total_gpus = num_nodes * num_gpus_per_node
+
+    master_params_fac = 4
+    variance_fac = 4
+    momentum_fac = 4
+    grads_fac = 4
+    optimizer_fac = variance_fac + momentum_fac  # Adam optimizer
+
+    total_gpus = num_nodes * num_gpus_per_node
+
+    if cpu_offload:
+        gpu_mem = (precision_fac * total_params) + (precision_fac * total_params)
+        cpu_mem = total_params * max(params_fac * total_gpus, (master_params_fac + optimizer_fac + grads_fac)) * additional_buffer_factor
+    else:
+        gpu_mem = (precision_fac * total_params) + (precision_fac * total_params) + int((precision_fac + optimizer_fac + master_params_fac + precision_fac) * total_params / total_gpus)
+        cpu_mem = total_params * params_fac * num_gpus_per_node * additional_buffer_factor
+
+    return int(cpu_mem), int(gpu_mem)
+
+
+def estimate_zero2_model_states_mem_needs(total_params,
+                                          num_gpus_per_node=1,
+                                          num_nodes=1,
+                                          cpu_offload=True,
+                                          additional_buffer_factor=1.5,
+                                          precision_fac=2,  # half precision
+                                          params_fac=4  # 4 bytes per float32 model parameter type
+                                          ):
+
+    # TODO: check if params_fac is needed during full fp32 training.
+    # Normally, mixed precision training results in 1.5x memory compared to FP32.
+    # Currently, we are assuming 2x memory for FP32, as deepspeed's ZeRO-2 is optimized for FP16 training.
+
+    total_gpus = num_nodes * num_gpus_per_node
+
+    master_params_fac = 4
+    variance_fac = 4
+    momentum_fac = 4
+    grads_fac = 4
+    optimizer_fac = variance_fac + momentum_fac  # Adam optimizer
+
+    total_gpus = num_nodes * num_gpus_per_node
+
+    if cpu_offload:
+        gpu_mem = precision_fac * total_params
+        cpu_mem = total_params * max(params_fac * total_gpus, (master_params_fac + optimizer_fac + grads_fac)) * additional_buffer_factor
+    else:
+        gpu_mem = precision_fac * total_params + int((precision_fac + grads_fac + optimizer_fac + master_params_fac + precision_fac) * total_params / total_gpus)
+        cpu_mem = total_params * params_fac * num_gpus_per_node * additional_buffer_factor
+
+    return int(cpu_mem), int(gpu_mem)
+
+
+
+def estimate_zero3_model_states_mem_needs(total_params,
+                                          largest_layer_params,
+                                          num_gpus_per_node=1,
+                                          num_nodes=1,
+                                          cpu_offload=True,
+                                          cpu_offload_params=True,
+                                          zero_init=True,
+                                          additional_buffer_factor=1.5,
+                                          precision_fac=2,  # half precision
+                                          params_fac=4  # 4 bytes per float32 model parameter type
+                                          ):
+
+    # TODO: check if params_fac is needed during full fp32 training.
+    # Normally, mixed precision training results in 1.5x memory compared to FP32.
+    # Currently, we are assuming 2x memory for FP32, as deepspeed's ZeRO-2 is optimized for FP16 training.
+
+    total_gpus = num_nodes * num_gpus_per_node
+    gpus_factor = 1 / num_nodes
+    master_params_fac = 4
+    variance_fac = 4
+    momentum_fac = 4
+    grads_fac = 4
+    optimizer_fac = variance_fac + momentum_fac  # Adam optimizer
+
+    largest_layer_memory = (2 * precision_fac) * largest_layer_params  # params + grads = (2 * modifier)
+
+    if cpu_offload:
+        if cpu_offload_params:
+            gpu_mem = largest_layer_memory
+
+            if zero_init:
+                cpu_mem = total_params * (master_params_fac + grads_fac + optimizer_fac + params_fac) * gpus_factor * additional_buffer_factor
+            else:
+                cpu_mem = total_params * max(params_fac * num_gpus_per_node, (master_params_fac + grads_fac + optimizer_fac + params_fac) * gpus_factor) * additional_buffer_factor
+        else:
+            gpu_mem = largest_layer_memory + int(precision_fac * total_params / total_gpus)
+
+            if zero_init:
+                cpu_mem = total_params * (master_params_fac + grads_fac + optimizer_fac) * gpus_factor * additional_buffer_factor
+            else:
+                cpu_mem = total_params * max(params_fac * num_gpus_per_node, (master_params_fac + grads_fac + optimizer_fac) * gpus_factor) * additional_buffer_factor
+    else:
+        gpu_mem = largest_layer_memory + int((master_params_fac + grads_fac + optimizer_fac + precision_fac) * total_params / total_gpus)
+        # 2b for fp16 params, 4b master params, 4b grads, 4b momentum and 4b variance per parameter = 18
+
+        if zero_init:
+            cpu_mem = largest_layer_params * params_fac * num_gpus_per_node * additional_buffer_factor
+        else:
+            cpu_mem = total_params * params_fac * num_gpus_per_node * additional_buffer_factor
+
+    return int(cpu_mem), int(gpu_mem), largest_layer_memory
+
+
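Lastly, a minimal standalone sketch of how the new estimators might be exercised outside the Gradio app (run from the src/ directory so the import resolves; the functions appear to be adapted from DeepSpeed's estimators of the same name). The model size and cluster topology below are hypothetical, and the trailing arguments are passed by keyword so that additional_buffer_factor keeps its 1.5 default:

from parallelism_utils import (
    estimate_zero2_model_states_mem_needs,
    estimate_zero3_model_states_mem_needs,
)

GB = 1024 ** 3
total_params = 7_000_000_000          # hypothetical 7B-parameter model
largest_layer_params = 500_000_000    # hypothetical largest layer / residual group

# ZeRO-2 on 2 nodes x 8 GPUs with optimizer state offloaded to CPU
cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs(
    total_params,
    num_gpus_per_node=8,
    num_nodes=2,
    cpu_offload=True,
    precision_fac=2,  # half-precision working copy (mixed precision)
    params_fac=4,     # 4 bytes per float32 master parameter
)
print(f"ZeRO-2: {cpu_mem / GB:.1f} GB per CPU, {gpu_mem / GB:.1f} GB per GPU")

# ZeRO-3 with optimizer and parameter offload, initialized via zero.Init
cpu_mem, gpu_mem, largest_layer_mem = estimate_zero3_model_states_mem_needs(
    total_params,
    largest_layer_params,
    num_gpus_per_node=8,
    num_nodes=2,
    cpu_offload=True,
    cpu_offload_params=True,
    zero_init=True,
)
print(f"ZeRO-3: {cpu_mem / GB:.1f} GB per CPU, {gpu_mem / GB:.1f} GB per GPU")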