asoria HF staff committed on
Commit
e62a0e5
•
1 Parent(s): 93c417c
Files changed (4)
  1. README.md +5 -5
  2. requirements.txt +6 -0
  3. utils/__init__.py +0 -0
  4. utils/notebook_utils.py +184 -0
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
- title: Auto Notebook Creator
- emoji: 💻
- colorFrom: red
- colorTo: red
  sdk: gradio
- sdk_version: 4.42.0
  app_file: app.py
  pinned: false
  ---

  ---
+ title: Auto notebook creator
+ emoji: 📔
+ colorFrom: yellow
+ colorTo: yellow
  sdk: gradio
+ sdk_version: 4.39.0
  app_file: app.py
  pinned: false
  ---
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio_huggingfacehub_search==0.0.7
+ huggingface_hub
+ nbformat
+ httpx
+ outlines
+ python-dotenv
utils/__init__.py ADDED
File without changes
utils/notebook_utils.py ADDED
@@ -0,0 +1,184 @@
+ def replace_wildcards(templates, wildcards, replacements):
+     """Return a copy of the template cells with every wildcard replaced."""
+     if len(wildcards) != len(replacements):
+         raise ValueError(
+             "The number of wildcards must match the number of replacements."
+         )
+ 
+     new_templates = []
+     for tmp in templates:
+         tmp_text = tmp["source"]
+         for wildcard, replacement in zip(wildcards, replacements):
+             tmp_text = tmp_text.replace(wildcard, replacement)
+         new_templates.append({"cell_type": tmp["cell_type"], "source": tmp_text})
+ 
+     return new_templates
+ 
+ 
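For illustration, here is how `replace_wildcards` might be invoked against the `eda_cells` template defined below; the wildcard names come from that template, while the replacement values are hypothetical stand-ins:

```python
# Hypothetical usage: the dataset name, iframe, and loading snippet
# below are illustrative stand-ins, not output of the app.
filled_cells = replace_wildcards(
    eda_cells,
    ["{dataset_name}", "{html_code}", "{first_code}"],
    [
        "imdb",
        "<iframe src='https://example.org/viewer'></iframe>",
        "df = pd.read_csv('data.csv')",
    ],
)
```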
+ rag_cells = [
+     {
+         "cell_type": "markdown",
+         "source": "# Retrieval-Augmented Generation (RAG) System Notebook",
+     },
+     {"cell_type": "code", "source": ""},
+ ]
+ 
+ embeddings_cells = [
+     {
+         "cell_type": "markdown",
+         "source": "# Embeddings Generation Notebook",
+     },
+     {"cell_type": "code", "source": ""},
+ ]
+ 
+ eda_cells = [
+     {
+         "cell_type": "markdown",
+         "source": "# Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset",
+     },
+     {
+         "cell_type": "code",
+         "source": """
+ from IPython.display import HTML
+ display(HTML("{html_code}"))
+ """,
+     },
+     {
+         "cell_type": "code",
+         "source": """
+ # 1. Install and import necessary libraries.
+ !pip install pandas matplotlib seaborn
+ """,
+     },
+     {
+         "cell_type": "code",
+         "source": """
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ """,
+     },
+     {
+         "cell_type": "code",
+         "source": """
+ # 2. Load the dataset as a DataFrame using the provided code
+ {first_code}
+ """,
+     },
+     {
+         "cell_type": "code",
+         "source": """
+ # 3. Understand the dataset structure
+ print(df.head())
+ print(df.info())
+ print(df.describe())
+ """,
+     },
+     {
+         "cell_type": "code",
+         "source": """
+ # 4. Check for missing values
+ print(df.isnull().sum())
+ """,
+     },
+     {
+         "cell_type": "code",
+         "source": """
+ # 5. Identify data types of each column
+ print(df.dtypes)
+ """,
+     },
+     {
+         "cell_type": "code",
+         "source": """
+ # 6. Detect duplicated rows
+ print(df.duplicated().sum())
+ """,
+     },
+     {
+         "cell_type": "code",
+         "source": """
+ # 7. Generate descriptive statistics
+ print(df.describe())
+ """,
+     },
+     {
+         "cell_type": "code",
+         "source": """
+ # 8. Visualize the distribution of each column.
+ # TODO: Add code to visualize the distribution of each column.
+ # 9. Explore relationships between columns.
+ # TODO: Add code to explore relationships between columns.
+ # 10. Perform correlation analysis.
+ # TODO: Add code to perform correlation analysis.
+ """,
+     },
+ ]
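These cell dictionaries only become a runnable notebook once serialized; `nbformat`, pinned in requirements.txt above, can do that. A minimal sketch, assuming a hypothetical helper name and that markdown and code are the only cell types used:

```python
import nbformat as nbf

def cells_to_notebook(cells, path):
    # Hypothetical helper: serialize {cell_type, source} dicts to .ipynb.
    nb = nbf.v4.new_notebook()
    for cell in cells:
        if cell["cell_type"] == "markdown":
            nb.cells.append(nbf.v4.new_markdown_cell(cell["source"]))
        else:
            nb.cells.append(nbf.v4.new_code_cell(cell["source"]))
    with open(path, "w", encoding="utf-8") as f:
        nbf.write(nb, f)
```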
+ 
+ 
+ def generate_embedding_system_prompt():
+     return """You are an expert data scientist tasked with creating a Jupyter notebook to generate embeddings for a specific dataset.
+ Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, and 'faiss-cpu' to create the index.
+ 
+ The notebook should include:
+ 
+ 1. Install necessary libraries with !pip install.
+ 2. Import libraries.
+ 3. Load the dataset as a DataFrame using the provided code.
+ 4. Select the column for generating embeddings.
+ 5. Remove duplicate data.
+ 6. Convert the selected column to a list.
+ 7. Load the sentence-transformers model.
+ 8. Create a FAISS index.
+ 9. Encode a query sample.
+ 10. Search for similar documents using the FAISS index.
+ 
+ Ensure the notebook is well-organized with explanations for each step.
+ The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+ 
+ The user will provide dataset information in the following format:
+ 
+ ## Columns and Data Types
+ 
+ ## Sample Data
+ 
+ ## Loading Data code
+ 
+ Use the provided code to load the dataset; do not use any other method.
+ """
+ 
+ 
+ def generate_rag_system_prompt():
+     return """You are an expert machine learning engineer tasked with creating a Jupyter notebook to demonstrate a Retrieval-Augmented Generation (RAG) system using a specific dataset.
+ The dataset is provided as a pandas DataFrame.
+ 
+ Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index, and 'transformers' for inference.
+ 
+ The RAG notebook should include:
+ 
+ 1. Install necessary libraries.
+ 2. Import libraries.
+ 3. Load the dataset as a DataFrame using the provided code.
+ 4. Select the column for generating embeddings.
+ 5. Remove duplicate data.
+ 6. Convert the selected column to a list.
+ 7. Load the sentence-transformers model.
+ 8. Create a FAISS index.
+ 9. Encode a query sample.
+ 10. Search for similar documents using the FAISS index.
+ 11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
+ 12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
+ 13. Send the prompt to the pipeline and display the answer.
+ 
+ Ensure the notebook is well-organized with explanations for each step.
+ The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+ 
+ The user will provide the dataset information in the following format:
+ 
+ ## Columns and Data Types
+ 
+ ## Sample Data
+ 
+ ## Loading Data code
+ 
+ Use the provided code to load the dataset; do not use any other method.
+ """