{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "#!pip install bertopic\n", "\n", "# bertopicのmodelを作るscript" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/user/miniconda3/envs/ft/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from bertopic import BERTopic" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "streaming=True\n", "dataset_list =[\n", " load_dataset('mc4', 'ja', split='train',streaming=streaming),\n", " load_dataset('oscar', 'unshuffled_deduplicated_ja', split='train',streaming=streaming),\n", " load_dataset('cc100', lang='ja', split='train',streaming=streaming),\n", " load_dataset(\"augmxnt/shisa-pretrain-en-ja-v1\",split=\"train\",streaming=streaming),\n", " load_dataset(\"hpprc/wikipedia-20240101\", split=\"train\",streaming=streaming),\n", "]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "10000it [00:20, 482.63it/s]\n", "10000it [00:19, 524.44it/s]\n", "10000it [00:12, 778.96it/s]\n", "10000it [00:25, 386.40it/s]\n", "10000it [00:58, 171.79it/s]\n" ] } ], "source": [ "from tqdm import tqdm\n", "docs=[]\n", "#prepare data for training model\n", "for dataset in dataset_list:\n", " cnt=0\n", " for record in tqdm(dataset):\n", " text=record[\"text\"]\n", " docs.append(text)\n", " cnt+=1\n", "\n", " if cnt>10000:\n", " break\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-03-12 08:37:19,823 - BERTopic - Embedding - Transforming documents to embeddings.\n", "Batches: 100%|██████████| 1563/1563 [00:50<00:00, 30.79it/s] \n", "2024-03-12 08:38:20,622 - BERTopic - Embedding - Completed ✓\n", "2024-03-12 08:38:20,622 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n", "2024-03-12 08:38:59,566 - BERTopic - Dimensionality - Completed ✓\n", "2024-03-12 08:38:59,567 - BERTopic - Cluster - Start clustering the reduced embeddings\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-03-12 08:37:19,823 - BERTopic - Embedding - Transforming documents to embeddings.\n", "Batches: 100%|██████████| 1563/1563 [00:50<00:00, 30.79it/s] \n", "2024-03-12 08:38:20,622 - BERTopic - Embedding - Completed ✓\n", "2024-03-12 08:38:20,622 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm\n", "2024-03-12 08:38:59,566 - BERTopic - Dimensionality - Completed ✓\n", "2024-03-12 08:38:59,567 - BERTopic - Cluster - Start clustering the reduced embeddings\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"2024-03-12 08:46:25,241 - BERTopic - Cluster - Completed ✓\n", "2024-03-12 08:46:25,242 - BERTopic - Representation - Extracting topics from clusters using representation models.\n", "2024-03-12 08:47:25,876 - BERTopic - Representation - Completed ✓\n", "2024-03-12 08:47:25,952 - BERTopic - Topic reduction - Reducing number of topics\n", "2024-03-12 08:48:28,300 - BERTopic - Topic reduction - Reduced number of topics from 435 to 342\n" ] } ], "source": [ "model_path = \"data/topic_model.bin\"\n", "\n", "# NOTE: nr_topics is passed as the string \"20\" rather than the integer 20, so BERTopic\n", "# appears to fall back to automatic topic reduction (435 -> 342 in the log above) instead\n", "# of reducing to exactly 20 topics; pass an int, or \"auto\", to make the intent explicit.\n", "topic_model = BERTopic(language=\"japanese\", calculate_probabilities=True, verbose=True, nr_topics=\"20\")\n", "topics, probs = topic_model.fit_transform(docs)\n", "\n", "# To reuse a previously trained model instead of refitting:\n", "# topic_model = BERTopic.load(model_path)" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-03-12 08:48:42,599 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.\n", "/home/user/miniconda3/envs/ft/lib/python3.11/site-packages/scipy/sparse/_index.py:143: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.\n", " self._set_arrayXarray(i, j, x)\n" ] } ], "source": [ "topic_model.save(model_path)" ] },
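{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A minimal usage sketch for the saved model: BERTopic.load restores the pickled model\n", "# (the environment must match, per the warning above), get_topic lists the top words of\n", "# a topic, and find_topics returns the topics closest to a free-text query.\n", "loaded_model = BERTopic.load(model_path)\n", "print(loaded_model.get_topic(0))\n", "\n", "similar_topics, similarity = loaded_model.find_topics(\"送料無料\", top_n=5)\n", "print(similar_topics, similarity)" ] },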
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TopicCountNameRepresentationRepresentative_Docs
0-122559-1_the_and_to_of[the, and, to, of, 送料無料, in, 12, 11, 10, また][Створення сайту - Сторінка 419 - Форум\\nЧетве...
1015850_送料無料_サマータイヤ_代引不可_中古[送料無料, サマータイヤ, 代引不可, 中古, ブラック, diy, レディース, 工具,...[上品なスタイル 【5/1(土)クーポン&ワンダフルデー 4本1台分!!】 215/45R1...
2112091_としあき_無念_name_投稿日[としあき, 無念, name, 投稿日, id, 16, 名前, 柳宗理, no, 11][ハニーセレクト日曜昼の部テンプレセット髪型全然使ってなかったけど - ふたろぐばこ−二次元...
328012_ワンピース_5cm_レディース_着丈[ワンピース, 5cm, レディース, 着丈, 肩幅, 素材, 格安通販, シューズ, 袖丈...[非売品 入学式 セレモニー 秋冬 秋 他と被らない 冬 小さいサイズ スカート セット 卒...
437993_ベンジャミン_フランクリン_passion_thee[ベンジャミン, フランクリン, passion, thee, nベンジャミン, 全業種, ...[it's ok with me 意味\\t9\\n英語で「It's okay.(イッツオーケー...
..................
33733611336_abuse_you_counselling_emotional[abuse, you, counselling, emotional, addiction...[スピリチュアルカウンセリングは、魂の向上を目的とした、至高神からのヒーリングで魂を整えて頂...
33833710337_京都の道_snorkeling_その1_中の池[京都の道, snorkeling, その1, 中の池, k7, silfra, 今だけ特別...[オアフ島(ホノルル) 福岡発 ◎今だけ無料で海の見える部屋へアップグレード!◎シェラトン・...
33933810338_実印_いつ使う_件のレビュー例えば_いつ使うは[実印, いつ使う, 件のレビュー例えば, いつ使うは, しっかりした会社, 印鑑, 実印の...[冊子の「契約内容のお知らせ」ページをめくると、登録情報の変更シートがあります。\\n, 今回...
34033910339_galaxy_s7_samsung_edge[galaxy, s7, samsung, edge, i9195i, s8, 3i9200...[ S8 PlusとS9 Plus - bajatyoutube.com\\n2019/0...
34134010340_゚д゚_対価_労働_産業別組合[゚д゚, 対価, 労働, 産業別組合, 工会, 約款, union, 契約書, 規約, 労...[ただし、中小企業の事業主等、労働者以外でも業務の実態や災害の発生状況からみて、労働者に準じ...
\n", "

342 rows × 5 columns

\n", "
" ], "text/plain": [ " Topic Count Name \\\n", "0 -1 22559 -1_the_and_to_of \n", "1 0 1585 0_送料無料_サマータイヤ_代引不可_中古 \n", "2 1 1209 1_としあき_無念_name_投稿日 \n", "3 2 801 2_ワンピース_5cm_レディース_着丈 \n", "4 3 799 3_ベンジャミン_フランクリン_passion_thee \n", ".. ... ... ... \n", "337 336 11 336_abuse_you_counselling_emotional \n", "338 337 10 337_京都の道_snorkeling_その1_中の池 \n", "339 338 10 338_実印_いつ使う_件のレビュー例えば_いつ使うは \n", "340 339 10 339_galaxy_s7_samsung_edge \n", "341 340 10 340_゚д゚_対価_労働_産業別組合 \n", "\n", " Representation \\\n", "0 [the, and, to, of, 送料無料, in, 12, 11, 10, また] \n", "1 [送料無料, サマータイヤ, 代引不可, 中古, ブラック, diy, レディース, 工具,... \n", "2 [としあき, 無念, name, 投稿日, id, 16, 名前, 柳宗理, no, 11] \n", "3 [ワンピース, 5cm, レディース, 着丈, 肩幅, 素材, 格安通販, シューズ, 袖丈... \n", "4 [ベンジャミン, フランクリン, passion, thee, nベンジャミン, 全業種, ... \n", ".. ... \n", "337 [abuse, you, counselling, emotional, addiction... \n", "338 [京都の道, snorkeling, その1, 中の池, k7, silfra, 今だけ特別... \n", "339 [実印, いつ使う, 件のレビュー例えば, いつ使うは, しっかりした会社, 印鑑, 実印の... \n", "340 [galaxy, s7, samsung, edge, i9195i, s8, 3i9200... \n", "341 [゚д゚, 対価, 労働, 産業別組合, 工会, 約款, union, 契約書, 規約, 労... \n", "\n", " Representative_Docs \n", "0 [Створення сайту - Сторінка 419 - Форум\\nЧетве... \n", "1 [上品なスタイル 【5/1(土)クーポン&ワンダフルデー 4本1台分!!】 215/45R1... \n", "2 [ハニーセレクト日曜昼の部テンプレセット髪型全然使ってなかったけど - ふたろぐばこ−二次元... \n", "3 [非売品 入学式 セレモニー 秋冬 秋 他と被らない 冬 小さいサイズ スカート セット 卒... \n", "4 [it's ok with me 意味\\t9\\n英語で「It's okay.(イッツオーケー... \n", ".. ... \n", "337 [スピリチュアルカウンセリングは、魂の向上を目的とした、至高神からのヒーリングで魂を整えて頂... \n", "338 [オアフ島(ホノルル) 福岡発 ◎今だけ無料で海の見える部屋へアップグレード!◎シェラトン・... \n", "339 [冊子の「契約内容のお知らせ」ページをめくると、登録情報の変更シートがあります。\\n, 今回... \n", "340 [ S8 PlusとS9 Plus - bajatyoutube.com\\n2019/0... \n", "341 [ただし、中小企業の事業主等、労働者以外でも業務の実態や災害の発生状況からみて、労働者に準じ... \n", "\n", "[342 rows x 5 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "topic_model.get_topic_info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "ft", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 2 }