import json
import time

from huggingface_hub import ModelCard, ModelFilter, snapshot_download

from src.envs import API, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_REPO, H4_TOKEN
from src.submission.check_validity import check_model_card, is_model_on_hub

def update_models(file_path, models):
    """
    Search through all JSON files in the specified root folder and its subfolders,
    and update the likes key in JSON dict from value of input dict
    """
    with open(file_path, "r") as f:
        model_infos = json.load(f)
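        # model_infos maps model ids ("org/name") to their stored metadata dicts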
        for model_id, data in model_infos.items():
            if model_id not in models:
                data['still_on_hub'] = False
                data['likes'] = 0
                data['downloads'] = 0
                data['created_at'] = ""
                continue

            model_cfg = models[model_id]
            data['likes'] = model_cfg.likes
            data['downloads'] = model_cfg.downloads
            data['created_at'] = str(model_cfg.created_at)
            #data['params'] = get_model_size(model_cfg, data['precision'])
            data['license'] = model_cfg.card_data.license if model_cfg.card_data is not None else ""

            # Is the model still on the hub
            still_on_hub, error, model_config = is_model_on_hub(
                model_name=model_id, revision=data.get("revision"), trust_remote_code=True, test_tokenizer=False, token=H4_TOKEN
            )
            # If the model doesn't have a model card or a license, we consider it deleted
            if still_on_hub:
                try:
                    if check_model_card(model_id)[0] is False:
                        still_on_hub = False
                except Exception:
                    still_on_hub = False
            data['still_on_hub'] = still_on_hub

            # Check whether the model is a merge or a MoE, and tag it accordingly.
            # `tags` must be initialized here: if the model is no longer on the hub,
            # the assignment below would otherwise read an undefined or stale value.
            tags = []
            is_merge_from_metadata = False
            is_moe_from_metadata = False
            if still_on_hub:
                model_card = ModelCard.load(model_id)

                # Read the merge/MoE flags declared in the card metadata
                if model_card.data.tags:
                    is_merge_from_metadata = "merge" in model_card.data.tags
                    is_moe_from_metadata = "moe" in model_card.data.tags
                card_text = model_card.text.lower()
                merge_keywords = ["mergekit", "merged model", "merge model", "merging"]
                # If the model is a merge but doesn't say so in its metadata, we flag it
                is_merge_from_model_card = any(keyword in card_text for keyword in merge_keywords)
                if is_merge_from_model_card or is_merge_from_metadata:
                    tags.append("merge")
                    if not is_merge_from_metadata:
                        tags.append("flagged:undisclosed_merge")
                moe_keywords = ["moe", "mixture of experts"]
                is_moe_from_model_card = any(keyword in card_text for keyword in moe_keywords)
                # Also catch "moe" appearing as a token in the model id (e.g. "org/some-moe-model")
                is_moe_from_name = "moe" in model_id.lower().replace("/", "-").replace("_", "-").split("-")
                if is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
                    tags.append("moe")
                    if not is_moe_from_metadata:
                        tags.append("flagged:undisclosed_moe")

            data["tags"] = tags

    with open(file_path, 'w') as f:
        json.dump(model_infos, f, indent=2)
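
# For reference, a sketch of a single entry in the dynamic info file, using only
# the keys read and written above (illustrative values, not a real record):
# {
#   "org/model": {
#     "revision": "main",
#     "likes": 0,
#     "downloads": 0,
#     "created_at": "",
#     "license": "",
#     "still_on_hub": true,
#     "tags": []
#   }
# }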

def update_dynamic_files():
    """ This will only update metadata for models already linked in the repo, not add missing ones.
    """
    snapshot_download(
        repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
    )

    print("UPDATE_DYNAMIC: Loaded snapshot")
    # Get models
    start = time.time()

    models = list(API.list_models(
        filter=ModelFilter(task="text-generation"),
        full=False,
        cardData=True,
        fetch_config=True,
    ))
    id_to_model = {model.id: model for model in models}

    print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")

    start = time.time()

    update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)

    print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")

    API.upload_file(
        path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
        path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
        repo_id=DYNAMIC_INFO_REPO,
        repo_type="dataset",
        commit_message="Daily request file update.",
    )
    print("UPDATE_DYNAMIC: pushed to hub")