91 | 91 | "outputs": [],
92 | 92 | "source": [
93 | 93 | "from google.colab import userdata\n",
| 94 | + "import os\n",
94 | 95 | "os.environ['HUGGINGFACE_TOKEN'] = userdata.get('HUGGINGFACE_TOKEN')"
95 | 96 | ]
96 | 97 | },
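The `import os` addition above fixes a genuine bug: the cell assigns to `os.environ` without importing `os` in that cell, so it would fail with a `NameError` unless an earlier cell had already imported it. For running this step outside Colab, where `google.colab.userdata` does not exist, a minimal equivalent (a sketch, not part of the notebook, assuming the same `HUGGINGFACE_TOKEN` name) is to reuse an environment variable or prompt for the token:

```python
import os
from getpass import getpass

# Fallback for non-Colab environments: reuse an existing environment
# variable if present, otherwise prompt without echoing the token.
token = os.environ.get("HUGGINGFACE_TOKEN") or getpass("Hugging Face token: ")
os.environ["HUGGINGFACE_TOKEN"] = token
```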
225 | 226 | "\n",
226 | 227 | "We split this dataset into training and validation sets using a 90/10 ratio.\n",
227 | 228 | "\n",
228 | | - "🔁 _Note_: When this notebook was initially run, the dataset was loaded locally from a file. However, the same dataset is now also available on the Hugging Face Hub here: [zamal/github-meta-data](https://huggingface.co/datasets/zamal/github-meta-data). Feel free to load it directly using `load_dataset(\"zamal/github-meta-data\")` in your workflow.\n"
| 229 | + "🔁 _Note_: When this notebook was initially run, the dataset was loaded locally from a file. However, the same dataset is now also available on the Hugging Face Hub here: [zamal/github-meta-data](https://huggingface.co/datasets/zamal/github-meta-data). Feel free to load it directly using `load_dataset(\"zamal/github-meta-data\")` in your workflow as shown below.\n"
229 | 230 | ]
230 | 231 | },
231 | 232 | {
232 | 233 | "cell_type": "code",
233 | | - "execution_count": null,
234 | | - "metadata": {
235 | | - "id": "NtMlaBADani6"
236 | | - },
| 234 | + "execution_count": 4,
| 235 | + "metadata": {},
237 | 236 | "outputs": [],
238 | 237 | "source": [
239 | | - "from datasets import DatasetDict\n",
| 238 | + "from datasets import load_dataset, DatasetDict\n",
240 | 239 | "\n",
241 | | - "# Load and split local JSONL\n",
242 | | - "import json\n",
243 | | - "from datasets import Dataset\n",
| 240 | + "# Load existing dataset with only a \"train\" split\n",
| 241 | + "dataset = load_dataset(\"zamal/github-meta-data\") # returns DatasetDict\n",
244 | 242 | "\n",
245 | | - "with open(\"/content/t5_formatted_dataset.jsonl\", \"r\", encoding=\"utf-8\") as f:\n",
246 | | - "    data = [json.loads(line) for line in f]\n",
| 243 | + "# Split the train set into train and validation\n",
| 244 | + "split = dataset[\"train\"].train_test_split(test_size=0.1, seed=42)\n",
247 | 245 | "\n",
248 | | - "dataset = Dataset.from_list(data)\n",
249 | | - "\n",
250 | | - "# Train/validation split\n",
251 | | - "splits = dataset.train_test_split(test_size=0.1, seed=42)\n",
| 246 | + "# Wrap into a new DatasetDict\n",
252 | 247 | "dataset_dict = DatasetDict({\n",
253 | | - "    \"train\": splits[\"train\"],\n",
254 | | - "    \"validation\": splits[\"test\"]\n",
| 248 | + "    \"train\": split[\"train\"],\n",
| 249 | + "    \"validation\": split[\"test\"]\n",
255 | 250 | "})\n"
256 | 251 | ]
257 | 252 | },
258 | 253 | {
259 | 254 | "cell_type": "code",
260 | | - "execution_count": null,
| 255 | + "execution_count": 5,
261 | 256 | "metadata": {
262 | 257 | "colab": {
263 | 258 | "base_uri": "https://localhost:8080/"
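For readers who want to run the new cell as a standalone script, here it is assembled from the `+` lines above, plus a final size check confirming the 90/10 split. The closing `print` is added here for illustration and is not part of the notebook; everything else mirrors the diff:

```python
from datasets import load_dataset, DatasetDict

# Load the published dataset; it ships with only a "train" split.
dataset = load_dataset("zamal/github-meta-data")  # returns DatasetDict

# Split the train set into train and validation (90/10, fixed seed).
split = dataset["train"].train_test_split(test_size=0.1, seed=42)

# Wrap into a new DatasetDict.
dataset_dict = DatasetDict({
    "train": split["train"],
    "validation": split["test"],
})

# Illustrative sanity check (not in the notebook): report split sizes.
print({name: len(ds) for name, ds in dataset_dict.items()})
```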
1763 | 1758 | "name": "python3"
1764 | 1759 | },
1765 | 1760 | "language_info": {
1766 | | - "name": "python"
| 1761 | + "codemirror_mode": {
| 1762 | + "name": "ipython",
| 1763 | + "version": 3
| 1764 | + },
| 1765 | + "file_extension": ".py",
| 1766 | + "mimetype": "text/x-python",
| 1767 | + "name": "python",
| 1768 | + "nbconvert_exporter": "python",
| 1769 | + "pygments_lexer": "ipython3",
| 1770 | + "version": "3.11.5"
1767 | 1771 | },
1768 | 1772 | "widgets": {
1769 | 1773 | "application/vnd.jupyter.widget-state+json": {