import json
import time

from huggingface_hub import ModelCard, ModelFilter, snapshot_download

from src.envs import API, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_REPO
from src.submission.check_validity import check_model_card, is_model_on_hub

def update_models(file_path, models):
    """
    Search through all JSON files in the specified root folder and its subfolders,
    and update the likes key in JSON dict from value of input dict
    """
    with open(file_path, "r") as f:
        model_infos = json.load(f)
        for model_id, data in model_infos.items():
            if model_id not in models:
                data['still_on_hub'] = False
                data['likes'] = 0
                data['downloads'] = 0
                data['created_at'] = None
                continue

            model_cfg = models[model_id]
            data['likes'] = model_cfg.likes
            data['downloads'] = model_cfg.downloads
            data['created_at'] = model_cfg.created_at
            #data['params'] = get_model_size(model_cfg, data['precision'])
            data['license'] = model_cfg.card_data.license if model_cfg.card_data is not None else ""

            # Check whether the model is still on the hub
            still_on_hub, error, model_config = is_model_on_hub(
                model_name=model_id, revision=data.get("revision"), trust_remote_code=True, test_tokenizer=False
            )
            # If the model has no model card or no license, we consider it deleted
            if still_on_hub:
                try:
                    if check_model_card(model_id)[0] is False:
                        still_on_hub = False
                except Exception:
                    still_on_hub = False
            data['still_on_hub'] = still_on_hub

            # Check whether the model is a merge and/or a MoE.
            # Initialize the flags up front so they are defined even when the
            # model is gone from the hub or its card declares no tags.
            tags = []
            is_merge_from_metadata = False
            is_moe_from_metadata = False
            if still_on_hub:
                model_card = ModelCard.load(model_id)

                # Read the tags declared in the card metadata
                if model_card.data.tags:
                    is_merge_from_metadata = "merge" in model_card.data.tags
                    is_moe_from_metadata = "moe" in model_card.data.tags
                merge_keywords = ["mergekit", "merged model", "merge model", "merging"]
                # If the card text says the model is a merge but the metadata does not declare it, we flag it
                is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in merge_keywords)
                if is_merge_from_model_card or is_merge_from_metadata:
                    tags.append("merge")
                    if not is_merge_from_metadata:
                        tags.append("flagged:undisclosed_merge")
                moe_keywords = ["moe", "mixture of experts"]
                is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in moe_keywords)
                is_moe_from_name = "moe" in model_id.lower().replace("/", "-").replace("_", "-").split("-")
                if is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
                    tags.append("moe")
                    if not is_moe_from_metadata:
                        tags.append("flagged:undisclosed_moe")

            data["tags"] = tags

    with open(file_path, 'w') as f:
        json.dump(model_infos, f, indent=2)
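
# Minimal usage sketch (hypothetical file path): `update_models` expects a
# mapping from model id to `ModelInfo`, as built in `update_dynamic_files`
# below.
#   id_to_model = {m.id: m for m in API.list_models(filter=ModelFilter(task="text-generation"), cardData=True)}
#   update_models("dynamic_info.json", id_to_model)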

def update_dynamic_files():
    """Refresh metadata for models already listed in the dynamic info repo;
    this does not add entries for missing models.
    """
    snapshot_download(
        repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
    )

    print("UPDATE_DYNAMIC: Loaded snapshot")
    # Get models
    start = time.time()

    # `update_models` indexes by model id, so build a mapping rather than a list
    models = {
        model.id: model
        for model in API.list_models(
            filter=ModelFilter(task="text-generation"),
            full=False,
            cardData=True,
            fetch_config=True,
        )
    }

    print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")

    start = time.time()

    update_models(DYNAMIC_INFO_FILE_PATH, models)

    print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")

    API.upload_file(
        path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
        path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
        repo_id=DYNAMIC_INFO_REPO,
        repo_type="dataset",
        commit_message="Daily request file update.",
    )
    print("UPDATE_DYNAMIC: pushed to hub")