# Utilities related to loading in and working with models/specific models
from urllib.parse import urlparse

import gradio as gr
import torch
from accelerate.commands.estimate import check_has_model, create_empty_model
from accelerate.utils import calculate_maximum_sizes, convert_bytes
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
from parallelism_utils import (
    estimate_zero1_model_states_mem_needs,
    estimate_zero2_model_states_mem_needs,
    estimate_zero3_model_states_mem_needs,
)


# Divisor applied to a float32 byte count to get the size in each dtype (e.g. float16 uses half the bytes)
DTYPE_MODIFIER = {"float32": 1, "float16/bfloat16": 2, "int8": 4, "int4": 8}
# Maps the display label for the training regime to the precision string the ZeRO estimators expect
PRECISION = {"Mixed precision": "mixed", "Single precision": "single"}
# Maps the display label for the dtype to the model dtype string the ZeRO estimators expect
DTYPE = {"float32": "float32", "float16/bfloat16": "float16"}


def extract_from_url(name: str):
    "Checks if `name` is a URL, and if so converts it to a model name"
    is_url = False
    try:
        result = urlparse(name)
        is_url = all([result.scheme, result.netloc])
    except Exception:
        is_url = False
    # Pass through if not a URL
    if not is_url:
        return name
    else:
        path = result.path
        return path[1:]


def translate_llama2(text):
    "Translates llama-2 to its hf counterpart"
    if not text.endswith("-hf"):
        return text + "-hf"
    return text


def get_model(model_name: str, library: str, access_token: str):
    "Finds and grabs model from the Hub, and initializes on `meta`"
    if "meta-llama" in model_name:
        model_name = translate_llama2(model_name)
    if library == "auto":
        library = None
    model_name = extract_from_url(model_name)
    try:
        model = create_empty_model(model_name, library_name=library, trust_remote_code=True, access_token=access_token)
    except GatedRepoError:
        raise gr.Error(
            f"Model `{model_name}` is a gated model; if you have access, please pass in your access token and try again. You can find your access token here: https://huggingface.co/settings/tokens."
        )
    except RepositoryNotFoundError:
        raise gr.Error(f"Model `{model_name}` was not found on the Hub, please try another model name.")
    except ValueError:
        raise gr.Error(
            f"Model `{model_name}` does not have any library metadata on the Hub; please manually select a `library_name` to use (such as `transformers`)."
        )
    except (RuntimeError, OSError) as e:
        library = check_has_model(e)
        if library != "unknown":
            raise gr.Error(
                f"Tried to load `{model_name}` with `{library}` but no loadable model was found inside the repo."
            )
        raise gr.Error(
            f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`"
        )
    except ImportError:
        # hacky way to check if it works with `trust_remote_code=False`
        model = create_empty_model(
            model_name, library_name=library, trust_remote_code=False, access_token=access_token
        )
    except Exception as e:
        raise gr.Error(
            f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`"
        )
    return model


def calculate_memory(model: torch.nn.Module, options: dict):
    "Calculates the memory usage for a model init on `meta` device"
    total_size, largest_layer = calculate_maximum_sizes(model)
    total_params = model.num_parameters()

    data = []
    for dtype in options["precision"]:
        dtype_total_size = total_size
        dtype_largest_layer = largest_layer[0]

        modifier = DTYPE_MODIFIER[dtype]
        dtype_total_size /= modifier
        dtype_largest_layer /= modifier

        dtype_largest_layer = convert_bytes(dtype_largest_layer)

        precision = PRECISION[options["training_regime"]]
        model_dtype = DTYPE[dtype]

        if options["zero_stage"] == 0:
            cpu_mem = dtype_total_size * 4
            gpu_mem = cpu_mem
        elif options["zero_stage"] == 1:
            cpu_mem, gpu_mem = estimate_zero1_model_states_mem_needs(total_params, options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], options["additional_buffer_factor"], precision, model_dtype)
        elif options["zero_stage"] == 2:
            cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs(total_params, options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], options["additional_buffer_factor"], precision, model_dtype)
        elif options["zero_stage"] == 3:
            cpu_mem, gpu_mem, largest_layer_memory = estimate_zero3_model_states_mem_needs(total_params, largest_layer[0], options["num_gpus_per_node"], options["num_nodes"], options["cpu_offload"], options["cpu_offload_params"], options["zero_init"], options["additional_buffer_factor"], precision, model_dtype)
        data.append(
                {
                    "Model dtype": dtype,
                    "Largest Layer or Residual Group": dtype_largest_layer,
                    "Model Size": convert_bytes(dtype_total_size),
                    "per CPU": convert_bytes(cpu_mem),
                    "per GPU": convert_bytes(gpu_mem),
                }
            )

    return data
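

# A minimal usage sketch, not part of the original utilities: the option keys below mirror those
# read inside `calculate_memory`, while the model name, GPU counts, and buffer factor are
# placeholder values chosen only for illustration.
if __name__ == "__main__":
    example_options = {
        "precision": ["float32", "float16/bfloat16"],
        "training_regime": "Mixed precision",
        "zero_stage": 3,
        "num_gpus_per_node": 8,
        "num_nodes": 1,
        "cpu_offload": True,
        "cpu_offload_params": True,
        "zero_init": True,
        "additional_buffer_factor": 1.5,
    }
    # `get_model` builds the model on the `meta` device, so no weights are downloaded.
    empty_model = get_model("bert-base-cased", library="transformers", access_token=None)
    for row in calculate_memory(empty_model, example_options):
        print(row)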