andstor committed
Commit 964360b
Parent: 674c962

Update calculations

Files changed (3)
  1. src/app.py +17 -11
  2. src/model_utils.py +10 -10
  3. src/parallelism_utils.py +87 -57
src/app.py CHANGED
@@ -9,7 +9,7 @@ from huggingface_hub.utils import HfHubHTTPError
 MODEL = None
 
 
-def get_results(model_name: str, library: str, precision: list, training: list, access_token: str, zero_stage: int, num_nodes: int, num_gpus: int, offloading: list, zero_init: list):
+def get_results(model_name: str, library: str, precision: list, training: list, access_token: str, zero_stage: int, num_nodes: int, num_gpus: int, offloading: list, zero_init: list, additional_buffer_factor: float):
     global MODEL
     MODEL = get_model(model_name, library, access_token)
     try:
@@ -26,6 +26,7 @@ def get_results(model_name: str, library: str, precision: list, training: list,
         "num_nodes": num_nodes,
         "num_gpus_per_node": num_gpus,
         "training_regime": training,
+        "additional_buffer_factor": additional_buffer_factor
     }
     data = calculate_memory(MODEL, options)
 
@@ -36,7 +37,7 @@ def get_results(model_name: str, library: str, precision: list, training: list,
 with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown(
-            """<img src="https://huggingface.co/spaces/hf-accelerate/model-memory-usage/resolve/main/measure_model_size.png" style="float: left;" width="250" height="250"><h1>🤗 Model Memory Calculator</h1>
+            """<img src="https://huggingface.co/spaces/andstor/model-memory-usage/resolve/main/measure_model_size.png" style="float: left;" width="250" height="250"><h1>🤗 DeepSpeed Model Memory Calculator</h1>
 
     This tool will help you calculate how much vRAM is needed to train and perform big model inference
     on a model hosted on the 🤗 Hugging Face Hub. The minimum recommended vRAM needed for a model
@@ -74,16 +75,21 @@ with gr.Blocks() as demo:
                 label="Training Paradigm",
             )
             access_token = gr.Textbox(label="API Token", placeholder="Optional (for gated models)")
-            with gr.Row():
-                with gr.Column():
+            num_gpus = gr.Number(label="GPUs per node", value=4, minimum=1, step=1)
+            num_nodes = gr.Number(label="Nodes", value=1, minimum=1, step=1)
+        with gr.Column(variant="panel"):
+            with gr.Row(equal_height=True):
+
                 zero_stage = gr.Radio(["Stage 0", "Stage 1", "Stage 2", "Stage 3"], label="ZeRO Stage", value="Stage 3", type="index")
                 zero_description = gr.CheckboxGroup(["Optimizer state", "Gradients", "Parameters"], label="Partitioning", value=["Optimizer state", "Gradients", "Parameters"], interactive=False)
-            with gr.Row():
-                offloading = gr.CheckboxGroup(["Optimizer", "Parameters"], label="ZeRO-Offload", info="Offloading data and compute to CPU", value=["Optimizer", "Parameters"])
-                zero_init = gr.CheckboxGroup(["zero.Init"], value=True, label="Initialization")
-
-            num_gpus = gr.Number(label="GPUs per node", value=1, min=1, step=1)
-            num_nodes = gr.Number(label="Nodes", value=1, min=1, step=1)
+
+            with gr.Row(equal_height=True):
+                #with gr.Column():
+                offloading = gr.CheckboxGroup(["Optimizer", "Parameters"], label="ZeRO-Offload", info="Offloading data and compute to CPU", value=["Optimizer", "Parameters"])
+                zero_init = gr.CheckboxGroup(["zero.Init"], value=["zero.Init"], label="Initialization")
+
+            #with gr.Column():
+            additional_buffer_factor = gr.Number(label="Additional Buffer Factor", value=1.5, minimum=1, step=0.1)
         with gr.Row():
             btn = gr.Button("Calculate Memory Usage")
             post_to_hub = gr.Button(
@@ -135,7 +141,7 @@ with gr.Blocks() as demo:
 
     btn.click(
         get_results,
-        inputs=[inp, library, precision, training, access_token, zero_stage, num_nodes, num_gpus, offloading, zero_init],
+        inputs=[inp, library, precision, training, access_token, zero_stage, num_nodes, num_gpus, offloading, zero_init, additional_buffer_factor],
         outputs=[out_text, out, post_to_hub],
     )
 
src/model_utils.py CHANGED
@@ -10,8 +10,8 @@ from parallelism_utils import estimate_zero1_model_states_mem_needs, estimate_ze
 
 
 DTYPE_MODIFIER = {"float32": 1, "float16/bfloat16": 2, "int8": 4, "int4": 8}
-PRECISION_FACTOR = {"Mixed precision": 2, "Single precision": 4}
-DTYPE_FACTOR = {"float32": 4, "float16/bfloat16": 2}
+PRECISION = {"Mixed precision": "mixed", "Single precision": "single"}
+DTYPE = {"float32": torch.float32, "float16/bfloat16": torch.float16}
 
 
 def extract_from_url(name: str):
@@ -93,25 +93,25 @@ def calculate_memory(model: torch.nn.Module, options: dict):
 
     dtype_largest_layer = convert_bytes(dtype_largest_layer)
 
-    precision_fac = PRECISION_FACTOR[options["training_regime"]]
-    params_fac = DTYPE_FACTOR[dtype]
+    precision = PRECISION[options["training_regime"]]
+    model_dtype = DTYPE[dtype]
 
     if options["zero_stage"] == 0:
         cpu_mem = dtype_total_size * 4
         gpu_mem = cpu_mem
     elif options["zero_stage"] == 1:
-        cpu_mem, gpu_mem = estimate_zero1_model_states_mem_needs(total_params, options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], precision_fac, params_fac)
+        cpu_mem, gpu_mem = estimate_zero1_model_states_mem_needs(total_params, options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], options["additional_buffer_factor"], precision, model_dtype)
     elif options["zero_stage"] == 2:
-        cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs(total_params, options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], precision_fac, params_fac)
+        cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs(total_params, options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], options["additional_buffer_factor"], precision, model_dtype)
     elif options["zero_stage"] == 3:
-        cpu_mem, gpu_mem, largest_layer_memory = estimate_zero3_model_states_mem_needs(total_params, largest_layer[0], options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], options["cpu_offload_params"], options["zero_init"], precision_fac, params_fac)
+        cpu_mem, gpu_mem, largest_layer_memory = estimate_zero3_model_states_mem_needs(total_params, largest_layer[0], options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], options["cpu_offload_params"], options["zero_init"], options["additional_buffer_factor"], precision, model_dtype)
     data.append(
         {
-            "dtype": dtype,
+            "Model dtype": dtype,
             "Largest Layer or Residual Group": dtype_largest_layer,
-            "Total Size": convert_bytes(dtype_total_size),
+            "Model Size": convert_bytes(dtype_total_size),
             "per CPU": convert_bytes(cpu_mem),
-            "per GPU (Adam)": convert_bytes(gpu_mem),
+            "per GPU": convert_bytes(gpu_mem),
         }
     )
 
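As a rough illustration of what the updated call sites above now compute, here is a minimal, hedged sketch that calls estimate_zero2_model_states_mem_needs directly with the new precision/model_dtype arguments. The function and parameter names are taken from this diff; the 1.3B parameter count and the 1-node, 4-GPU topology are made-up values for illustration only.

# Illustrative only: compare the ZeRO-2 estimate with and without CPU offloading,
# using the precision/model_dtype arguments introduced in this commit.
import torch
from parallelism_utils import estimate_zero2_model_states_mem_needs

total_params = 1_300_000_000  # hypothetical 1.3B-parameter model

for cpu_offload in (True, False):
    cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs(
        total_params,
        num_gpus_per_node=4,
        num_nodes=1,
        cpu_offload=cpu_offload,
        additional_buffer_factor=1.5,
        precision="mixed",
        model_dtype=torch.float16,
    )
    print(f"cpu_offload={cpu_offload}: per GPU {gpu_mem / 2**30:.1f} GiB, per CPU {cpu_mem / 2**30:.1f} GiB")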
src/parallelism_utils.py CHANGED
@@ -1,65 +1,90 @@
-# Zero Redundancy Optimizer (ZeRO)
+import torch
+
+def get_precision_fac(precision: str):
+    if precision == "mixed":
+        return 2
+    elif precision == "single":
+        return 4
+    else:
+        raise ValueError("Precision must be either 'mixed' or 'single'")
+
+
+def get_params_fac(model_dtype: torch.dtype):
+    if model_dtype == torch.float16:
+        return 2
+    elif model_dtype == torch.float32:
+        return 4
+    else:
+        raise ValueError("Model dtype must be either torch.float16 or torch.float32")
+
+
+
+####################### Zero Redundancy Optimizer (ZeRO) #######################
+
+VARIANCE_FACTOR = 4
+MOMENTUM_FACTOR = 4
+OPTIMIZER_FACTOR = VARIANCE_FACTOR + MOMENTUM_FACTOR # Adam optimizer
+FP32_GRADS_FACTOR = 4
+FP32_PARAM_FACTOR = 4
+MASTER_PARAMS_FACTOR = FP32_PARAM_FACTOR
+
+
+# TODO: check if params_fac is needed during full fp32 training.
+# Normally, mixed precision training results in 1.5x memory compared to FP32.
+# Currently, we are assuming 2x memory for FP32, as deepspeed's ZeRO-2 is optimized for FP16 training.
+
+
+
 def estimate_zero1_model_states_mem_needs(total_params,
                                           num_gpus_per_node=1,
                                           num_nodes=1,
                                           cpu_offload=True,
                                           additional_buffer_factor=1.5,
-                                          precision_fac = 2, # half precision
-                                          params_fac = 4 # 4 bytes per float32 model parameter type
+                                          precision="mixed",
+                                          model_dtype = torch.float16,
                                           ):
 
-    # TODO: check if params_fac is needed during full fp32 training.
-    # Normally, mixed precision training results in 1.5x memory compared to FP32.
-    # Currently, we are assuming 2x memory for FP32, as deepspeed's ZeRO-2 is optimized for FP16 training.
-
-    total_gpus = num_nodes * num_gpus_per_node
-
-    master_params_fac = 4
-    variance_fac = 4
-    momentum_fac = 4
-    grads_fac = 4
-    optimizer_fac = variance_fac + momentum_fac # Adam optimizer
-
     total_gpus = num_nodes * num_gpus_per_node
+
+    precision_fac = get_precision_fac(precision)
+    params_fac = get_params_fac(model_dtype)
 
     if cpu_offload:
-        gpu_mem = (precision_fac * total_params) + (precision_fac * total_params)
-        cpu_mem = total_params * max(params_fac * total_gpus, (master_params_fac+optimizer_fac+grads_fac)) * additional_buffer_factor
+        gpu_mem = (precision_fac * total_params) # + (grads_fac * total_params)
+        cpu_mem = total_params * max(params_fac * total_gpus, (MASTER_PARAMS_FACTOR + OPTIMIZER_FACTOR + FP32_GRADS_FACTOR)) * additional_buffer_factor
     else:
-        gpu_mem = (precision_fac * total_params) + (precision_fac * total_params) + int((precision_fac + optimizer_fac + master_params_fac + precision_fac) * total_params / total_gpus)
+        if precision == "mixed":
+            gpu_mem = (precision_fac * total_params) + (FP32_GRADS_FACTOR * total_params) + int((OPTIMIZER_FACTOR + FP32_PARAM_FACTOR) * total_params / total_gpus)
+        else:
+            gpu_mem = (precision_fac * total_params) + (FP32_GRADS_FACTOR * total_params) + int(OPTIMIZER_FACTOR * total_params / total_gpus)
         cpu_mem = total_params * params_fac * num_gpus_per_node * additional_buffer_factor
 
     return int(cpu_mem), int(gpu_mem)
 
+
 def estimate_zero2_model_states_mem_needs(total_params,
                                           num_gpus_per_node=1,
                                           num_nodes=1,
                                           cpu_offload=True,
                                           additional_buffer_factor=1.5,
-                                          precision_fac = 2, # half precision
-                                          params_fac = 4 # 4 bytes per float32 model parameter type
+                                          precision="mixed",
+                                          model_dtype = torch.float16,
                                           ):
-
-    # TODO: check if params_fac is needed during full fp32 training.
-    # Normally, mixed precision training results in 1.5x memory compared to FP32.
-    # Currently, we are assuming 2x memory for FP32, as deepspeed's ZeRO-2 is optimized for FP16 training.
 
     total_gpus = num_nodes * num_gpus_per_node
-
-    master_params_fac = 4
-    variance_fac = 4
-    momentum_fac = 4
-    grads_fac = 4
-    optimizer_fac = variance_fac + momentum_fac # Adam optimizer
-
-    total_gpus = num_nodes * num_gpus_per_node
+
+    precision_fac = get_precision_fac(precision)
+    params_fac = get_params_fac(model_dtype)
 
     if cpu_offload:
-        gpu_mem = precision_fac * total_params
-        cpu_mem = total_params * max(params_fac * total_gpus, (master_params_fac+optimizer_fac+grads_fac)) * additional_buffer_factor
+        gpu_mem = precision_fac * total_params # Negligible memory usage for partitioned gradients
+        cpu_mem = total_params * max(params_fac * total_gpus, (MASTER_PARAMS_FACTOR + OPTIMIZER_FACTOR + FP32_GRADS_FACTOR)) * additional_buffer_factor
     else:
-        gpu_mem = precision_fac * total_params + int((precision_fac + grads_fac + optimizer_fac + master_params_fac + precision_fac) * total_params / total_gpus)
-        cpu_mem = total_params * params_fac * num_gpus_per_node * additional_buffer_factor
+        if precision == "mixed":
+            gpu_mem = precision_fac * total_params + int((FP32_GRADS_FACTOR + OPTIMIZER_FACTOR + FP32_PARAM_FACTOR) * total_params / total_gpus)
+        else:
+            gpu_mem = precision_fac * total_params + int((FP32_GRADS_FACTOR + OPTIMIZER_FACTOR) * total_params / total_gpus)
+        cpu_mem = params_fac * total_params * num_gpus_per_node * additional_buffer_factor
 
     return int(cpu_mem), int(gpu_mem)
 
@@ -72,43 +97,48 @@ def estimate_zero3_model_states_mem_needs(total_params,
                                           cpu_offload_params=True,
                                           zero_init=True,
                                           additional_buffer_factor=1.5,
-                                          precision_fac = 2, # half precision
-                                          params_fac = 4 # 4 bytes per float32 model parameter type
+                                          precision="mixed",
+                                          model_dtype = torch.float16,
                                           ):
 
-    # TODO: check if params_fac is needed during full fp32 training.
-    # Normally, mixed precision training results in 1.5x memory compared to FP32.
-    # Currently, we are assuming 2x memory for FP32, as deepspeed's ZeRO-2 is optimized for FP16 training.
-
     total_gpus = num_nodes * num_gpus_per_node
     gpus_factor = 1 / num_nodes
-    master_params_fac = 4
-    variance_fac = 4
-    momentum_fac = 4
-    grads_fac = 4
-    optimizer_fac = variance_fac + momentum_fac # Adam optimizer
 
-    largest_layer_memory = (2 * precision_fac) * largest_layer_params # params + grads = (2 * modifier)
+    precision_fac = get_precision_fac(precision)
+    params_fac = get_params_fac(model_dtype)
+    grads_fac = precision_fac
+
+    largest_layer_memory = (grads_fac + precision_fac) * largest_layer_params
 
     if cpu_offload:
         if cpu_offload_params:
             gpu_mem = largest_layer_memory
-
             if zero_init:
-                cpu_mem = total_params * (master_params_fac + grads_fac + optimizer_fac + params_fac) * gpus_factor * additional_buffer_factor
+                cpu_mem = total_params * (MASTER_PARAMS_FACTOR + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR + params_fac) * gpus_factor * additional_buffer_factor
             else:
-
-                cpu_mem = total_params * max(params_fac * num_gpus_per_node, (master_params_fac + grads_fac + optimizer_fac + params_fac) * gpus_factor) * additional_buffer_factor
+                cpu_mem = total_params * max(params_fac * num_gpus_per_node, (MASTER_PARAMS_FACTOR + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR + params_fac) * gpus_factor) * additional_buffer_factor
+
         else:
-            gpu_mem = largest_layer_memory + int(precision_fac * total_params / total_gpus)
+            gpu_mem = max(
+                largest_layer_memory,
+                int((precision_fac) * total_params / total_gpus) # No need for gradients: ZeRO-Offload can transfer these gradients for each parameter individually or in small groups to the CPU memory immediately after they are computed
+            )
 
             if zero_init:
-                cpu_mem = total_params * (master_params_fac + grads_fac + optimizer_fac) * gpus_factor * additional_buffer_factor
+                cpu_mem = total_params * (MASTER_PARAMS_FACTOR + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR) * gpus_factor * additional_buffer_factor
             else:
-                cpu_mem = total_params * max(params_fac * num_gpus_per_node, (master_params_fac + grads_fac + optimizer_fac) * gpus_factor) * additional_buffer_factor
+                cpu_mem = total_params * max(params_fac * num_gpus_per_node, (MASTER_PARAMS_FACTOR + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR) * gpus_factor) * additional_buffer_factor
     else:
-        gpu_mem = largest_layer_memory + int((master_params_fac + grads_fac + optimizer_fac + precision_fac) * total_params / total_gpus)
-        # 2b for fp16 params, 4b master params, 4b grads, 4b momentum and 4b variance per parameter = 18
+        if precision == "mixed":
+            gpu_mem = max(
+                int((precision_fac + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR + FP32_PARAM_FACTOR) * largest_layer_params),
+                int((precision_fac + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR + FP32_PARAM_FACTOR) * total_params / total_gpus)
+            )
+        else:
+            gpu_mem = max(
+                int((precision_fac + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR) * largest_layer_params),
+                int((precision_fac + FP32_GRADS_FACTOR + OPTIMIZER_FACTOR) * total_params / total_gpus)
+            )
 
         if zero_init:
             cpu_mem = largest_layer_params * params_fac * num_gpus_per_node * additional_buffer_factor
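To make the updated ZeRO-3 arithmetic concrete, the following is a hedged, illustrative call to estimate_zero3_model_states_mem_needs with the new precision/model_dtype arguments. The argument order follows the call site in src/model_utils.py and the keyword names visible in the diff; the 7B/0.5B parameter counts and the 1-node, 4-GPU topology are invented for the example.

# Illustrative only: a worked ZeRO-3 estimate using the updated signature.
import torch
from parallelism_utils import estimate_zero3_model_states_mem_needs

total_params = 7_000_000_000        # hypothetical 7B-parameter model
largest_layer_params = 500_000_000  # hypothetical largest layer / residual group

cpu_mem, gpu_mem, largest_layer_memory = estimate_zero3_model_states_mem_needs(
    total_params,
    largest_layer_params,
    4,      # num_gpus_per_node
    1,      # num_nodes
    False,  # cpu_offload
    cpu_offload_params=False,
    zero_init=True,
    additional_buffer_factor=1.5,
    precision="mixed",
    model_dtype=torch.float16,
)

# Without offloading and with mixed precision, the per-GPU term is
# 2 (fp16 params) + 4 (fp32 grads) + 8 (Adam states) + 4 (fp32 master params)
# = 18 bytes per parameter, applied to max(largest layer, total_params / total_gpus).
print(f"per GPU: {gpu_mem / 2**30:.1f} GiB, per CPU: {cpu_mem / 2**30:.1f} GiB")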