{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://ollama.com/library\n"
     ]
    }
   ],
   "source": [
    "# Scrape the Ollama model library page and save one record per model to\n",
    "# models.json. The next cell adds each model's tags to the same records.\n",
    "import json\n",
    "from urllib.parse import urljoin\n",
    "\n",
    "from bs4 import BeautifulSoup\n",
    "from requests import get\n",
    "\n",
    "base_url = 'https://ollama.com'\n",
    "library_url = f'{base_url}/library'\n",
    "# Seconds to wait on each HTTP request. requests applies no timeout unless\n",
    "# one is passed, so a stalled connection would otherwise hang the cell.\n",
    "REQUEST_TIMEOUT = 30\n",
    "MODELS_PATH = 'models.json'\n",
    "\n",
    "\n",
    "def fetch_html(url):\n",
    "    \"\"\"Download `url` and return the response body as text.\n",
    "\n",
    "    Raises `requests.HTTPError` on a 4xx/5xx response so that an error page\n",
    "    is never parsed as if it were the expected content.\n",
    "    \"\"\"\n",
    "    response = get(url, timeout=REQUEST_TIMEOUT)\n",
    "    response.raise_for_status()\n",
    "    return response.text\n",
    "\n",
    "\n",
    "def save_models(models, path=MODELS_PATH):\n",
    "    \"\"\"Write the list of model records to `path` as indented UTF-8 JSON.\"\"\"\n",
    "    with open(path, 'w', encoding='utf-8') as file:\n",
    "        json.dump(models, file, indent=4, ensure_ascii=False)\n",
    "\n",
    "\n",
    "def parse_library(html_content):\n",
    "    \"\"\"Return one record per `<li>` matched by `ul[role=\"list\"] > li`.\n",
    "\n",
    "    Each record has the keys name, description, url, params and pulls.\n",
    "    The element paths below assume the page layout this notebook was written\n",
    "    against; a layout change surfaces as AttributeError, TypeError or\n",
    "    IndexError.\n",
    "    \"\"\"\n",
    "    soup = BeautifulSoup(html_content, 'html.parser')\n",
    "    records = []\n",
    "    for li in soup.select('ul[role=\"list\"] > li'):\n",
    "        # Stored under \"params\": the text of every <span> inside li.div.div.\n",
    "        sizes = [span.text for span in li.div.div.select('span')]\n",
    "\n",
    "        # Pull count: first <span> of the second <p> inside li.div. The last\n",
    "        # character is dropped and only the text before the first\n",
    "        # non-breaking space is kept (presumably the number ahead of a\n",
    "        # label -- TODO confirm against the live page).\n",
    "        pulls_span = li.div.select('p')[1].select('span')[0]\n",
    "        pulls = pulls_span.text[:-1].split('\\xa0')[0].strip()\n",
    "\n",
    "        records.append({\n",
    "            \"name\": li.h2.text.strip(),\n",
    "            \"description\": li.p.text.strip(),\n",
    "            # urljoin gives the same result as plain concatenation for\n",
    "            # '/path' hrefs and also copes with absolute or slash-less ones.\n",
    "            \"url\": urljoin(base_url, li.a['href']),\n",
    "            \"params\": sizes,\n",
    "            \"pulls\": pulls,\n",
    "        })\n",
    "    return records\n",
    "\n",
    "\n",
    "print(library_url)\n",
    "html_content = fetch_html(library_url)\n",
    "models = parse_library(html_content)\n",
    "if not models:\n",
    "    # Refuse to overwrite an existing models.json with an empty list when\n",
    "    # the selector matched nothing.\n",
    "    raise RuntimeError(f'no models found at {library_url}')\n",
    "save_models(models)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: llama3\n",
      "Model: phi3\n",
      "Model: wizardlm2\n",
      "Model: mistral\n",
      "Model: gemma\n",
      "Model: mixtral\n",
      "Model: llama2\n",
      "Model: codegemma\n",
      "Model: command-r\n",
      "Model: command-r-plus\n",
      "Model: llava\n",
      "Model: dbrx\n",
      "Model: codellama\n",
      "Model: qwen\n",
      "Model: dolphin-mixtral\n",
      "Model: llama2-uncensored\n",
      "Model: deepseek-coder\n",
      "Model: mistral-openorca\n",
      "Model: nomic-embed-text\n",
      "Model: dolphin-mistral\n",
      "Model: phi\n",
      "Model: orca-mini\n",
      "Model: nous-hermes2\n",
      "Model: zephyr\n",
      "Model: llama2-chinese\n",
      "Model: wizard-vicuna-uncensored\n",
      "Model: starcoder2\n",
      "Model: vicuna\n",
      "Model: tinyllama\n",
      "Model: openhermes\n",
      "Model: starcoder\n",
      "Model: openchat\n",
      "Model: dolphin-llama3\n",
      "Model: yi\n",
      "Model: tinydolphin\n",
      "Model: wizardcoder\n",
      "Model: stable-code\n",
      "Model: mxbai-embed-large\n",
      "Model: neural-chat\n",
      "Model: phind-codellama\n",
      "Model: wizard-math\n",
      "Model: starling-lm\n",
      "Model: falcon\n",
      "Model: dolphincoder\n",
      "Model: nous-hermes\n",
      "Model: orca2\n",
      "Model: sqlcoder\n",
      "Model: stablelm2\n",
      "Model: dolphin-phi\n",
      "Model: solar\n",
      "Model: yarn-llama2\n",
      "Model: deepseek-llm\n",
      "Model: codeqwen\n",
      "Model: bakllava\n",
      "Model: all-minilm\n",
      "Model: samantha-mistral\n",
      "Model: llama3-gradient\n",
      "Model: medllama2\n",
      "Model: wizardlm-uncensored\n",
      "Model: xwinlm\n",
      "Model: nous-hermes2-mixtral\n",
      "Model: stable-beluga\n",
      "Model: wizardlm\n",
      "Model: codeup\n",
      "Model: yarn-mistral\n",
      "Model: everythinglm\n",
      "Model: meditron\n",
      "Model: llama-pro\n",
      "Model: magicoder\n",
      "Model: stablelm-zephyr\n",
      "Model: nexusraven\n",
      "Model: codebooga\n",
      "Model: mistrallite\n",
      "Model: llama3-chatqa\n",
      "Model: wizard-vicuna\n",
      "Model: snowflake-arctic-embed\n",
      "Model: llava-llama3\n",
      "Model: goliath\n",
      "Model: open-orca-platypus2\n",
      "Model: moondream\n",
      "Model: duckdb-nsql\n",
      "Model: notux\n",
      "Model: megadolphin\n",
      "Model: notus\n",
      "Model: alfred\n",
      "Model: llava-phi3\n",
      "Model: falcon2\n"
     ]
    }
   ],
   "source": [
    "# Fetch every model's /tags page, attach the parsed tags to its record and\n",
    "# rewrite models.json. Uses base_url, models, fetch_html, save_models,\n",
    "# urljoin and BeautifulSoup from the previous cell.\n",
    "def parse_model_tags(tags_html):\n",
    "    \"\"\"Return one record per tag link (`a.group`) on a model's tags page.\n",
    "\n",
    "    Each record has the keys name, url, size, hash and updated. The last\n",
    "    three come from the first `div.items-baseline` found under the link's\n",
    "    grandparent element, whose text is split on ' • ' and read as\n",
    "    hash, size, updated. Fields that are absent are stored as None.\n",
    "    \"\"\"\n",
    "    tags_soup = BeautifulSoup(tags_html, 'html.parser')\n",
    "    tag_records = []\n",
    "    for tag in tags_soup.select('a.group'):\n",
    "        details = tag.parent.parent.select_one('div.items-baseline')\n",
    "        fields = []\n",
    "        if details is not None:\n",
    "            # maxsplit=2 keeps any further ' • ' inside the third field.\n",
    "            fields = [field.strip() for field in details.text.strip().split(' • ', 2)]\n",
    "        # Pad to three values so an incomplete details line yields None\n",
    "        # entries instead of an IndexError that aborts the whole crawl.\n",
    "        tag_hash, tag_size, tag_updated = fields + [None] * (3 - len(fields))\n",
    "        tag_records.append({\n",
    "            \"name\": tag.text.strip(),\n",
    "            \"url\": urljoin(base_url, tag['href']),\n",
    "            \"size\": tag_size,\n",
    "            \"hash\": tag_hash,\n",
    "            \"updated\": tag_updated,\n",
    "        })\n",
    "    return tag_records\n",
    "\n",
    "\n",
    "for model in models:\n",
    "    # Printed before the request so a failure can be attributed to the model\n",
    "    # named on the last output line.\n",
    "    print(f\"Model: {model['name']}\")\n",
    "    tags_url = f\"{model['url']}/tags\"\n",
    "    # Assigning (not appending) keeps this cell safe to re-run.\n",
    "    model[\"tags\"] = parse_model_tags(fetch_html(tags_url))\n",
    "\n",
    "save_models(models)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}