91 | 91 | "outputs": [],
92 | 92 | "source": [
93 | 93 | "from google.colab import userdata\n",
| 94 | + "import os\n",
94 | 95 | "os.environ['HUGGINGFACE_TOKEN'] = userdata.get('HUGGINGFACE_TOKEN')"
95 | 96 | ]
96 | 97 | },
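The `import os` addition above fixes a genuine bug: the cell assigns to `os.environ` without importing `os` in that cell, so it would fail with a `NameError` unless an earlier cell had already imported it. For running this step outside Colab, where `google.colab.userdata` does not exist, a minimal equivalent (a sketch, not part of the notebook, assuming the same `HUGGINGFACE_TOKEN` name) is to reuse an environment variable or prompt for the token:

```python
import os
from getpass import getpass

# Fallback for non-Colab environments: reuse an existing environment
# variable if present, otherwise prompt without echoing the token.
token = os.environ.get("HUGGINGFACE_TOKEN") or getpass("Hugging Face token: ")
os.environ["HUGGINGFACE_TOKEN"] = token
```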
225 | 226 | "\n",
226 | 227 | "We split this dataset into training and validation sets using a 90/10 ratio.\n",
227 | 228 | "\n",
228 | | - "🔁 _Note_: When this notebook was initially run, the dataset was loaded locally from a file. However, the same dataset is now also available on the Hugging Face Hub here: [zamal/github-meta-data](https://huggingface.co/datasets/zamal/github-meta-data). Feel free to load it directly using `load_dataset(\"zamal/github-meta-data\")` in your workflow.\n"
| 229 | + "🔁 _Note_: When this notebook was initially run, the dataset was loaded locally from a file. However, the same dataset is now also available on the Hugging Face Hub here: [zamal/github-meta-data](https://huggingface.co/datasets/zamal/github-meta-data). Feel free to load it directly using `load_dataset(\"zamal/github-meta-data\")` in your workflow as shown below.\n"
229 | 230 | ]
230 | 231 | },
231 | 232 | {
232 | 233 | "cell_type": "code",
233 | | - "execution_count": null,
234 | | - "metadata": {
235 | | - "id": "NtMlaBADani6"
236 | | - },
| 234 | + "execution_count": 4,
| 235 | + "metadata": {},
237 | 236 | "outputs": [],
238 | 237 | "source": [
239 | | - "from datasets import DatasetDict\n",
| 238 | + "from datasets import load_dataset, DatasetDict\n",
240 | 239 | "\n",
241 | | - "# Load and split local JSONL\n",
242 | | - "import json\n",
243 | | - "from datasets import Dataset\n",
| 240 | + "# Load existing dataset with only a \"train\" split\n",
| 241 | + "dataset = load_dataset(\"zamal/github-meta-data\") # returns DatasetDict\n",
244 | 242 | "\n",
245 | | - "with open(\"/content/t5_formatted_dataset.jsonl\", \"r\", encoding=\"utf-8\") as f:\n",
246 | | - "    data = [json.loads(line) for line in f]\n",
| 243 | + "# Split the train set into train and validation\n",
| 244 | + "split = dataset[\"train\"].train_test_split(test_size=0.1, seed=42)\n",
247 | 245 | "\n",
248 | | - "dataset = Dataset.from_list(data)\n",
249 | | - "\n",
250 | | - "# Train/validation split\n",
251 | | - "splits = dataset.train_test_split(test_size=0.1, seed=42)\n",
| 246 | + "# Wrap into a new DatasetDict\n",
252 | 247 | "dataset_dict = DatasetDict({\n",
253 | | - "    \"train\": splits[\"train\"],\n",
254 | | - "    \"validation\": splits[\"test\"]\n",
| 248 | + "    \"train\": split[\"train\"],\n",
| 249 | + "    \"validation\": split[\"test\"]\n",
255 | 250 | "})\n"
256 | 251 | ]
257 | 252 | },
258 | 253 | {
259 | 254 | "cell_type": "code",
260 | | - "execution_count": null,
| 255 | + "execution_count": 5,
261 | 256 | "metadata": {
262 | 257 | "colab": {
263 | 258 | "base_uri": "https://localhost:8080/"
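For readers who want to run the new cell as a standalone script, here it is assembled from the `+` lines above, plus a final size check confirming the 90/10 split. The closing `print` is added here for illustration and is not part of the notebook; everything else mirrors the diff:

```python
from datasets import load_dataset, DatasetDict

# Load the published dataset; it ships with only a "train" split.
dataset = load_dataset("zamal/github-meta-data")  # returns DatasetDict

# Split the train set into train and validation (90/10, fixed seed).
split = dataset["train"].train_test_split(test_size=0.1, seed=42)

# Wrap into a new DatasetDict.
dataset_dict = DatasetDict({
    "train": split["train"],
    "validation": split["test"],
})

# Illustrative sanity check (not in the notebook): report split sizes.
print({name: len(ds) for name, ds in dataset_dict.items()})
```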
1763 | 1758 | "name": "python3"
1764 | 1759 | },
1765 | 1760 | "language_info": {
1766 | | - "name": "python"
| 1761 | + "codemirror_mode": {
| 1762 | + "name": "ipython",
| 1763 | + "version": 3
| 1764 | + },
| 1765 | + "file_extension": ".py",
| 1766 | + "mimetype": "text/x-python",
| 1767 | + "name": "python",
| 1768 | + "nbconvert_exporter": "python",
| 1769 | + "pygments_lexer": "ipython3",
| 1770 | + "version": "3.11.5"
1767 | 1771 | },
1768 | 1772 | "widgets": {
1769 | 1773 | "application/vnd.jupyter.widget-state+json": {