asoria (HF staff) committed
Commit: 0b212ec
1 Parent(s): 1c042c7

Adding eda and rag as templates

app.py CHANGED
@@ -32,6 +32,11 @@ client = Client(headers=HEADERS)
 
 logging.basicConfig(level=logging.INFO)
 
+# TODO: Validate notebook templates format
+folder_path = "notebooks"
+notebook_templates = load_json_files_from_folder(folder_path)
+logging.info(f"Available notebooks {notebook_templates.keys()}")
+
 
 def get_compatible_libraries(dataset: str):
     try:
@@ -116,11 +121,6 @@ def _push_to_hub(
         raise
 
 
-folder_path = "notebooks"
-notebook_templates = load_json_files_from_folder(folder_path)
-logging.info(f"Available notebooks {notebook_templates.keys()}")
-
-
 def generate_cells(dataset_id, notebook_title):
     logging.info(f"Generating {notebook_title} notebook for dataset {dataset_id}")
     cells = notebook_templates[notebook_title]["notebook_template"]
@@ -248,7 +248,9 @@ with gr.Blocks(
         gr.Markdown("## 2. Select the type of notebook you want to generate")
         with gr.Row():
             notebook_type = gr.Dropdown(
-                choices=notebook_templates.keys(), label="Notebook type"
+                choices=notebook_templates.keys(),
+                label="Notebook type",
+                value="Text Embeddings",
             )
         generate_button = gr.Button("Generate Notebook", variant="primary")
         contribute_btn = gr.Button(
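
Note: the new module-level load runs at import time and, per the added TODO, the template JSON is not yet validated. A minimal validation sketch (hypothetical; the helper name validate_notebook_templates and the required keys are assumptions, not part of this commit):

# Hypothetical check for the structure the rest of app.py relies on:
# each template must carry a "notebook_template" list of markdown/code cells.
def validate_notebook_templates(templates: dict) -> None:
    for name, template in templates.items():
        cells = template.get("notebook_template")
        if not isinstance(cells, list):
            raise ValueError(f"Template '{name}' is missing a 'notebook_template' list")
        for cell in cells:
            if cell.get("cell_type") not in ("markdown", "code"):
                raise ValueError(f"Template '{name}' contains an invalid cell_type")
            if "source" not in cell:
                raise ValueError(f"Template '{name}' contains a cell without 'source'")

Calling such a check right after load_json_files_from_folder would surface malformed templates at startup rather than when a notebook is generated.
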
notebooks/eda.json CHANGED
@@ -5,7 +5,7 @@
   "notebook_template": [
     {
       "cell_type": "markdown",
-      "source": "\n---\n# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**\n---\n"
+      "source": "---\n# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**\n---"
     },
     {
       "cell_type": "markdown",
@@ -13,15 +13,15 @@
     },
     {
       "cell_type": "code",
-      "source": "\n# Install and import necessary libraries.\n!pip install pandas matplotlib seaborn\n"
+      "source": "# Install and import necessary libraries.\n!pip install pandas matplotlib seaborn"
     },
     {
       "cell_type": "code",
-      "source": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n"
+      "source": "import matplotlib.pyplot as plt\nimport seaborn as sns"
     },
     {
       "cell_type": "code",
-      "source": "\n# Load the dataset as a DataFrame\n{first_code}\n"
+      "source": "# Load the dataset as a DataFrame\n{first_code}"
     },
     {
       "cell_type": "markdown",
@@ -29,28 +29,28 @@
     },
     {
       "cell_type": "code",
-      "source": "\n# First rows of the dataset and info\nprint(df.head())\nprint(df.info())\n"
+      "source": "# First rows of the dataset and info\nprint(df.head())\nprint(df.info())"
     },
     {
       "cell_type": "code",
-      "source": "\n# Check for missing values\nprint(df.isnull().sum())\n"
+      "source": "# Check for missing values\nprint(df.isnull().sum())"
     },
     {
       "cell_type": "code",
-      "source": "\n# Identify data types of each column\nprint(df.dtypes)\n"
+      "source": "# Identify data types of each column\nprint(df.dtypes)"
     },
     {
       "cell_type": "code",
-      "source": "\n# Detect duplicated rows\nprint(df.duplicated().sum())\n"
+      "source": "# Detect duplicated rows\nprint(df.duplicated().sum())"
     },
     {
       "cell_type": "code",
-      "source": "\n# Generate descriptive statistics\nprint(df.describe())\n"
+      "source": "# Generate descriptive statistics\nprint(df.describe())"
     },
     {
       "type": "categoric",
       "cell_type": "code",
-      "source": "\n# Unique values in categorical columns\ndf.select_dtypes(include=['object']).nunique()\n"
+      "source": "# Unique values in categorical columns\ndf.select_dtypes(include=['object']).nunique()"
     },
     {
       "cell_type": "markdown",
@@ -59,22 +59,22 @@
     {
       "type": "numeric",
       "cell_type": "code",
-      "source": "\n# Correlation matrix for numerical columns\ncorr_matrix = df.corr(numeric_only=True)\nplt.figure(figsize=(10, 8))\nsns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)\nplt.title('Correlation Matrix')\nplt.show()\n"
+      "source": "# Correlation matrix for numerical columns\ncorr_matrix = df.corr(numeric_only=True)\nplt.figure(figsize=(10, 8))\nsns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)\nplt.title('Correlation Matrix')\nplt.show()"
     },
     {
       "type": "numeric",
       "cell_type": "code",
-      "source": "\n# Distribution plots for numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.histplot(df[column], kde=True)\n    plt.title(f'Distribution of {column}')\n    plt.xlabel(column)\n    plt.ylabel('Frequency')\n    plt.show()\n"
+      "source": "# Distribution plots for numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.histplot(df[column], kde=True)\n    plt.title(f'Distribution of {column}')\n    plt.xlabel(column)\n    plt.ylabel('Frequency')\n    plt.show()"
     },
     {
       "type": "categoric",
       "cell_type": "code",
-      "source": "\n# Count plots for categorical columns\nfor column in df.select_dtypes(include=['object']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.countplot(x=column, data=df)\n    plt.title(f'Count Plot of {column}')\n    plt.xlabel(column)\n    plt.ylabel('Count')\n    plt.show()\n"
+      "source": "# Count plots for categorical columns\nfor column in df.select_dtypes(include=['object']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.countplot(x=column, data=df)\n    plt.title(f'Count Plot of {column}')\n    plt.xlabel(column)\n    plt.ylabel('Count')\n    plt.show()"
     },
     {
       "type": "numeric",
       "cell_type": "code",
-      "source": "\n# Box plots for detecting outliers in numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.boxplot(df[column])\n    plt.title(f'Box Plot of {column}')\n    plt.xlabel(column)\n    plt.show()\n"
+      "source": "# Box plots for detecting outliers in numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.boxplot(df[column])\n    plt.title(f'Box Plot of {column}')\n    plt.xlabel(column)\n    plt.show()"
     }
   ]
 }
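
The template sources keep placeholders such as {dataset_name} and {first_code} that the app fills in per dataset (replace_wildcards in utils/notebook_utils.py, whose exact signature is not shown in this diff). A minimal sketch of that substitution, assuming a plain string replacement over each cell; fill_wildcards is a hypothetical stand-in:

# Hypothetical wildcard substitution over a list of template cells.
def fill_wildcards(cells, wildcards):
    filled = []
    for cell in cells:
        source = cell["source"]
        for key, value in wildcards.items():
            source = source.replace("{" + key + "}", value)
        filled.append({**cell, "source": source})
    return filled

# Example (illustrative values):
# fill_wildcards(cells, {"dataset_name": "user/dataset", "first_code": "df = pd.read_parquet('...')"})
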
notebooks/finetuning.json DELETED
@@ -1,6 +0,0 @@
-{
-  "notebook_title": "Supervised fine-tuning (SFT)",
-  "notebook_type": "sft",
-  "dataset_type": "numeric",
-  "notebook_template": []
-}
notebooks/rag.json CHANGED
@@ -2,5 +2,86 @@
   "notebook_title": "Retrieval-augmented generation (RAG)",
   "notebook_type": "rag",
   "dataset_type": "text",
-  "notebook_template": []
+  "notebook_template": [
+    {
+      "cell_type": "markdown",
+      "source": "---\n# **Retrieval-Augmented Generation Notebook for {dataset_name} dataset**\n---"
+    },
+    {
+      "cell_type": "markdown",
+      "source": "## 1. Setup necessary libraries and load the dataset"
+    },
+    {
+      "cell_type": "code",
+      "source": "# Install and import necessary libraries.\n!pip install pandas sentence-transformers faiss-cpu transformers torch huggingface_hub"
+    },
+    {
+      "cell_type": "code",
+      "source": "from sentence_transformers import SentenceTransformer\nfrom transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\nfrom huggingface_hub import InferenceClient\nimport pandas as pd\nimport faiss\nimport torch"
+    },
+    {
+      "cell_type": "code",
+      "source": "# Load the dataset as a DataFrame\n{first_code}"
+    },
+    {
+      "cell_type": "code",
+      "source": "# Specify the column name that contains the text data to generate embeddings\ncolumn_to_generate_embeddings = '{longest_col}'"
+    },
+    {
+      "cell_type": "markdown",
+      "source": "## 2. Loading embedding model and creating FAISS index"
+    },
+    {
+      "cell_type": "code",
+      "source": "# Remove duplicate entries based on the specified column\ndf = df.drop_duplicates(subset=column_to_generate_embeddings)"
+    },
+    {
+      "cell_type": "code",
+      "source": "# Convert the column data to a list of text entries\ntext_list = df[column_to_generate_embeddings].tolist()"
+    },
+    {
+      "cell_type": "code",
+      "source": "# Specify the embedding model you want to use\nmodel = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')"
+    },
+    {
+      "cell_type": "code",
+      "source": "vectors = model.encode(text_list)\nvector_dimension = vectors.shape[1]\n\n# Initialize the FAISS index with the appropriate dimension (384 for this model)\nindex = faiss.IndexFlatL2(vector_dimension)\n\n# Encode the text list into embeddings and add them to the FAISS index\nindex.add(vectors)"
+    },
+    {
+      "cell_type": "markdown",
+      "source": "## 3. Perform a text search"
+    },
+    {
+      "cell_type": "code",
+      "source": "# Specify the text you want to search for in the list\nquery = \"How to cook sushi?\"\n\n# Generate the embedding for the search query\nquery_embedding = model.encode([query])"
+    },
+    {
+      "cell_type": "code",
+      "source": "# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)\nD, I = index.search(query_embedding, k=10)\n\n# Print the similar documents found\nprint(f\"Similar documents: {[text_list[i] for i in I[0]]}\")"
+    },
+    {
+      "cell_type": "markdown",
+      "source": "## 4. Load pipeline and perform inference locally"
+    },
+    {
+      "cell_type": "code",
+      "source": "# Adjust model name as needed\ncheckpoint = 'HuggingFaceTB/SmolLM-1.7B-Instruct'\n\ndevice = \"cuda\" if torch.cuda.is_available() else \"cpu\"  # for GPU usage or \"cpu\" for CPU usage\n\ntokenizer = AutoTokenizer.from_pretrained(checkpoint)\nmodel = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)\n\ngenerator = pipeline(\"text-generation\", model=model, tokenizer=tokenizer, device=0 if device == \"cuda\" else -1)"
+    },
+    {
+      "cell_type": "code",
+      "source": "# Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query\nselected_elements = [text_list[i] for i in I[0].tolist()]\ncontext = ','.join(selected_elements)\nmessages = [\n    {\n        \"role\": \"system\",\n        \"content\": f\"You are an intelligent assistant tasked with providing accurate and concise answers based on the following context. Use the information retrieved to construct your response. Context: {context}\",\n    },\n    {\"role\": \"user\", \"content\": query},\n]"
+    },
+    {
+      "cell_type": "code",
+      "source": "# Send the prompt to the pipeline and show the answer\noutput = generator(messages)\nprint(\"Generated result:\")\nprint(output[0]['generated_text'][-1]['content'])  # Print the assistant's response content"
+    },
+    {
+      "cell_type": "markdown",
+      "source": "## 5. Alternatively call the inference client"
+    },
+    {
+      "cell_type": "code",
+      "source": "# Adjust model name as needed\ncheckpoint = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n\n# Change here your Hugging Face API token\ntoken = \"hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\"\n\ninference_client = InferenceClient(checkpoint, token=token)\noutput = inference_client.chat_completion(messages=messages, stream=False)\nprint(\"Generated result:\")\nprint(output.choices[0].message.content)"
+    }
+  ]
 }
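
Each template entry is a plain dict with cell_type and source, so turning a selected template into a downloadable .ipynb is a direct mapping. A sketch of that step, assuming nbformat is used for serialization (this diff does not show how generate_cells actually builds the notebook):

import nbformat

# Hypothetical conversion of template cells into a Jupyter notebook file.
def cells_to_notebook(cells, path):
    nb = nbformat.v4.new_notebook()
    for cell in cells:
        if cell["cell_type"] == "markdown":
            nb.cells.append(nbformat.v4.new_markdown_cell(cell["source"]))
        else:
            nb.cells.append(nbformat.v4.new_code_cell(cell["source"]))
    with open(path, "w") as f:
        nbformat.write(nb, f)
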
utils/notebook_utils.py CHANGED
@@ -24,463 +24,6 @@ def replace_wildcards(
     return new_templates
 
 
-embeddings_cells = [
-    {
-        "cell_type": "markdown",
-        "source": """
----
-# **Embeddings Notebook for {dataset_name} dataset**
----
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 1. Setup necessary libraries and load the dataset",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Install and import necessary libraries.
-!pip install pandas sentence-transformers faiss-cpu
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-import pandas as pd
-from sentence_transformers import SentenceTransformer
-import faiss
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Load the dataset as a DataFrame
-{first_code}
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Specify the column name that contains the text data to generate embeddings
-column_to_generate_embeddings = '{longest_col}'
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 2. Loading embedding model and creating FAISS index",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Remove duplicate entries based on the specified column
-df = df.drop_duplicates(subset=column_to_generate_embeddings)
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Convert the column data to a list of text entries
-text_list = df[column_to_generate_embeddings].tolist()
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Specify the embedding model you want to use
-model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-vectors = model.encode(text_list)
-vector_dimension = vectors.shape[1]
-
-# Initialize the FAISS index with the appropriate dimension (384 for this model)
-index = faiss.IndexFlatL2(vector_dimension)
-
-# Encode the text list into embeddings and add them to the FAISS index
-index.add(vectors)
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 3. Perform a text search",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Specify the text you want to search for in the list
-text_to_search = text_list[0]
-print(f"Text to search: {text_to_search}")
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Generate the embedding for the search query
-query_embedding = model.encode([text_to_search])
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
-D, I = index.search(query_embedding, k=10)
-
-# Print the similar documents found
-print(f"Similar documents: {[text_list[i] for i in I[0]]}")
-""",
-    },
-]
-
-eda_cells = [
-    {
-        "cell_type": "markdown",
-        "source": """
----
-# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**
----
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 1. Setup necessary libraries and load the dataset",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Install and import necessary libraries.
-!pip install pandas matplotlib seaborn
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-import pandas as pd
-import matplotlib.pyplot as plt
-import seaborn as sns
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Load the dataset as a DataFrame
-{first_code}
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 2. Understanding the Dataset",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# First rows of the dataset and info
-print(df.head())
-print(df.info())
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Check for missing values
-print(df.isnull().sum())
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Identify data types of each column
-print(df.dtypes)
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Detect duplicated rows
-print(df.duplicated().sum())
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Generate descriptive statistics
-print(df.describe())
-""",
-    },
-    {
-        "type": "categoric",
-        "cell_type": "code",
-        "source": """
-# Unique values in categorical columns
-df.select_dtypes(include=['object']).nunique()
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 3. Data Visualization",
-    },
-    {
-        "type": "numeric",
-        "cell_type": "code",
-        "source": """
-# Correlation matrix for numerical columns
-corr_matrix = df.corr(numeric_only=True)
-plt.figure(figsize=(10, 8))
-sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
-plt.title('Correlation Matrix')
-plt.show()
-""",
-    },
-    {
-        "type": "numeric",
-        "cell_type": "code",
-        "source": """
-# Distribution plots for numerical columns
-for column in df.select_dtypes(include=['int64', 'float64']).columns:
-    plt.figure(figsize=(8, 4))
-    sns.histplot(df[column], kde=True)
-    plt.title(f'Distribution of {column}')
-    plt.xlabel(column)
-    plt.ylabel('Frequency')
-    plt.show()
-""",
-    },
-    {
-        "type": "categoric",
-        "cell_type": "code",
-        "source": """
-# Count plots for categorical columns
-for column in df.select_dtypes(include=['object']).columns:
-    plt.figure(figsize=(8, 4))
-    sns.countplot(x=column, data=df)
-    plt.title(f'Count Plot of {column}')
-    plt.xlabel(column)
-    plt.ylabel('Count')
-    plt.show()
-""",
-    },
-    {
-        "type": "numeric",
-        "cell_type": "code",
-        "source": """
-# Box plots for detecting outliers in numerical columns
-for column in df.select_dtypes(include=['int64', 'float64']).columns:
-    plt.figure(figsize=(8, 4))
-    sns.boxplot(df[column])
-    plt.title(f'Box Plot of {column}')
-    plt.xlabel(column)
-    plt.show()
-""",
-    },
-]
-
-
-rag_cells = [
-    {
-        "cell_type": "markdown",
-        "source": """
----
-# **Retrieval-Augmented Generation Notebook for {dataset_name} dataset**
----
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 1. Setup necessary libraries and load the dataset",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Install and import necessary libraries.
-!pip install pandas sentence-transformers faiss-cpu transformers torch huggingface_hub
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-from sentence_transformers import SentenceTransformer
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-from huggingface_hub import InferenceClient
-import pandas as pd
-import faiss
-import torch
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Load the dataset as a DataFrame
-{first_code}
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Specify the column name that contains the text data to generate embeddings
-column_to_generate_embeddings = '{longest_col}'
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 2. Loading embedding model and creating FAISS index",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Remove duplicate entries based on the specified column
-df = df.drop_duplicates(subset=column_to_generate_embeddings)
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Convert the column data to a list of text entries
-text_list = df[column_to_generate_embeddings].tolist()
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Specify the embedding model you want to use
-model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-vectors = model.encode(text_list)
-vector_dimension = vectors.shape[1]
-
-# Initialize the FAISS index with the appropriate dimension (384 for this model)
-index = faiss.IndexFlatL2(vector_dimension)
-
-# Encode the text list into embeddings and add them to the FAISS index
-index.add(vectors)
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 3. Perform a text search",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Specify the text you want to search for in the list
-query = "How to cook sushi?"
-
-# Generate the embedding for the search query
-query_embedding = model.encode([query])
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
-D, I = index.search(query_embedding, k=10)
-
-# Print the similar documents found
-print(f"Similar documents: {[text_list[i] for i in I[0]]}")
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 4. Load pipeline and perform inference locally",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Adjust model name as needed
-checkpoint = 'HuggingFaceTB/SmolLM-1.7B-Instruct'
-
-device = "cuda" if torch.cuda.is_available() else "cpu"  # for GPU usage or "cpu" for CPU usage
-
-tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
-
-generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query
-selected_elements = [text_list[i] for i in I[0].tolist()]
-context = ','.join(selected_elements)
-messages = [
-    {
-        "role": "system",
-        "content": f"You are an intelligent assistant tasked with providing accurate and concise answers based on the following context. Use the information retrieved to construct your response. Context: {context}",
-    },
-    {"role": "user", "content": query},
-]
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Send the prompt to the pipeline and show the answer
-output = generator(messages)
-print("Generated result:")
-print(output[0]['generated_text'][-1]['content'])  # Print the assistant's response content
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 5. Alternatively call the inference client",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Adjust model name as needed
-checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"
-
-# Change here your Hugging Face API token
-token = "hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
-
-inference_client = InferenceClient(checkpoint, token=token)
-output = inference_client.chat_completion(messages=messages, stream=False)
-print("Generated result:")
-print(output.choices[0].message.content)
-""",
-    },
-]
-
-
-def generate_rag_system_prompt():
-    """
-
-    1. Install necessary libraries.
-    2. Import libraries.
-    3. Load the dataset as a DataFrame using the provided code.
-    4. Select the column for generating embeddings.
-    5. Remove duplicate data.
-    6. Convert the selected column to a list.
-    7. Load the sentence-transformers model.
-    8. Create a FAISS index.
-    9. Encode a query sample.
-    10. Search for similar documents using the FAISS index.
-    11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
-    12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
-    13. Send the prompt to the pipeline and display the answer.
-
-    Ensure the notebook is well-organized with explanations for each step.
-    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
-
-    The user will provide the dataset information in the following format:
-
-    ## Columns and Data Types
-
-    ## Sample Data
-
-    ## Loading Data code
-
-    Use the provided code to load the dataset; do not use any other method.
-    """
-
-
 def load_json_files_from_folder(folder_path):
     components = {}
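
Only the signature of load_json_files_from_folder and its first line are visible in this diff. A plausible implementation, keyed by notebook_title so the Gradio dropdown can list the titles (an assumption consistent with notebook_templates[notebook_title] in app.py, not confirmed by the diff):

import json
import os

# Hypothetical loader: parse every .json file in the folder and index the
# template by its "notebook_title".
def load_json_files_from_folder(folder_path):
    components = {}
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".json"):
            with open(os.path.join(folder_path, file_name), "r") as f:
                template = json.load(f)
            components[template["notebook_title"]] = template
    return components
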