|
32 | 32 | }, |
33 | 33 | { |
34 | 34 | "cell_type": "code", |
35 | | - "execution_count": null, |
| 35 | + "execution_count": 75, |
36 | 36 | "metadata": {}, |
37 | | - "outputs": [], |
| 37 | + "outputs": [ |
| 38 | + { |
| 39 | + "data": { |
| 40 | + "text/plain": [ |
| 41 | + "['Adelie', 'Chinstrap', 'Gentoo']" |
| 42 | + ] |
| 43 | + }, |
| 44 | + "execution_count": 75, |
| 45 | + "metadata": {}, |
| 46 | + "output_type": "execute_result" |
| 47 | + } |
| 48 | + ], |
38 | 49 | "source": [ |
39 | | - "from palmerpenguins import load_penguins" |
| 50 | + "from palmerpenguins import load_penguins\n", |
| 51 | + "\n", |
| 52 | + "data = load_penguins()\n", |
| 53 | + "\n", |
| 54 | + "data\n", |
| 55 | + "\n", |
| 56 | + "target_names = sorted(data.species.unique())\n", |
| 57 | + "\n", |
| 58 | + "target_names\n" |
40 | 59 | ] |
41 | 60 | }, |
42 | 61 | { |
|
55 | 74 | "source": [ |
56 | 75 | "### Task 2: creating a ``torch.utils.data.Dataset``\n", |
57 | 76 | "\n", |
| 77 | + "The penguin data reading and processing can be encapsulated in a PyTorch dataset class.\n", |
| 78 | + "\n", |
| 79 | + "- This is helpful because...\n", |
| 80 | + "\n", |
58 | 81 | "All PyTorch dataset objects are subclasses of the ``torch.utils.data.Dataset`` class. To make a custom dataset, create a class which inherits from the ``Dataset`` class, implement some methods (the Python magic (or dunder) methods ``__len__`` and ``__getitem__``) and supply some data.\n", |
59 | 82 | "\n", |
60 | | - "Spoiler alert: we've done this for you already in ``src/ml_workshop/_penguins.py``.\n", |
| 83 | + "Spoiler alert: we've done this for you already below (see ``src/ml_workshop/_penguins.py`` for a more sophisticated implementation)\n", |
61 | 84 | "\n", |
62 | 85 | "- Open the file ``src/ml_workshop/_penguins.py``.\n", |
63 | 86 | "- Let's examine, and discuss, each of the methods together.\n", |
|
75 | 98 | " - ``y_tfms``— ..." |
76 | 99 | ] |
77 | 100 | }, |
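|  |  | + { |
|  |  | + "cell_type": "markdown", |
|  |  | + "metadata": {}, |
|  |  | + "source": [ |
|  |  | + "As a warm-up, here is a minimal sketch of that interface, backed by nothing more than a plain Python list. (The class name ``ToyDataset`` and its sample data are illustrative assumptions, not part of the workshop code.)\n" |
|  |  | + ] |
|  |  | + }, |
|  |  | + { |
|  |  | + "cell_type": "code", |
|  |  | + "execution_count": null, |
|  |  | + "metadata": {}, |
|  |  | + "outputs": [], |
|  |  | + "source": [ |
|  |  | + "from torch.utils.data import Dataset\n", |
|  |  | + "\n", |
|  |  | + "\n", |
|  |  | + "class ToyDataset(Dataset):\n", |
|  |  | + "    \"\"\"A bare-bones dataset: any indexable collection can back it.\"\"\"\n", |
|  |  | + "\n", |
|  |  | + "    def __init__(self, samples):\n", |
|  |  | + "        self.samples = samples\n", |
|  |  | + "\n", |
|  |  | + "    def __len__(self):\n", |
|  |  | + "        # The number of items in the dataset.\n", |
|  |  | + "        return len(self.samples)\n", |
|  |  | + "\n", |
|  |  | + "    def __getitem__(self, idx):\n", |
|  |  | + "        # The item at position ``idx``.\n", |
|  |  | + "        return self.samples[idx]\n", |
|  |  | + "\n", |
|  |  | + "\n", |
|  |  | + "toy = ToyDataset([(1.0, \"a\"), (2.0, \"b\")])\n", |
|  |  | + "len(toy), toy[0]" |
|  |  | + ] |
|  |  | + }, |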
| 101 | + { |
| 102 | + "cell_type": "code", |
| 103 | + "execution_count": 108, |
| 104 | + "metadata": {}, |
| 105 | + "outputs": [], |
| 106 | + "source": [ |
| 107 | + "from typing import Optional, List, Dict, Tuple, Any\n", |
| 108 | + "\n", |
| 109 | + "# import pytorch functions necessary for transformations:\n", |
| 110 | + "from torch import tensor, float32, eye\n", |
| 111 | + "\n", |
| 112 | + "from torch.utils.data import Dataset\n", |
| 113 | + "from torchvision.transforms import Compose\n", |
| 114 | + "\n", |
| 115 | + "from pandas import DataFrame\n", |
| 116 | + "\n", |
| 117 | + "from palmerpenguins import load_penguins\n", |
| 118 | + "\n", |
| 119 | + "\n", |
| 120 | + "class PenguinDataset(Dataset):\n", |
| 121 | + " def __init__(\n", |
| 122 | + " self,\n", |
| 123 | + " input_keys: List[str],\n", |
| 124 | + " target_keys: List[str],\n", |
| 125 | + " train: bool,\n", |
| 126 | + " ):\n", |
| 127 | + " \"\"\"Build ``PenguinDataset``.\"\"\"\n", |
| 128 | + " self.input_keys = input_keys\n", |
| 129 | + " self.target_keys = target_keys\n", |
| 130 | + "\n", |
| 131 | + " data = load_penguins()\n", |
| 132 | + " data = (\n", |
| 133 | + " data.loc[~data.isna().any(axis=1)]\n", |
| 134 | + " .sort_values(by=sorted(data.keys()))\n", |
| 135 | + " .reset_index(drop=True)\n", |
| 136 | + " )\n", |
| 137 | + " # Transform the sex field into a float, with male represented by 1.0, female by 0.0\n", |
| 138 | + " data.sex = (data.sex == \"male\").astype(float)\n", |
| 139 | + " self.full_df = data\n", |
| 140 | + "\n", |
| 141 | + " valid_df = self.full_df.groupby(by=[\"species\", \"sex\"]).sample(\n", |
| 142 | + " n=10,\n", |
| 143 | + " random_state=123,\n", |
| 144 | + " )\n", |
| 145 | + " # The training items are simply the items *not* in the valid split\n", |
| 146 | + " train_df = self.full_df.loc[~self.full_df.index.isin(valid_df.index)]\n", |
| 147 | + "\n", |
| 148 | + " self.split = {\"train\": train_df, \"valid\": valid_df}[\"train\" if train is True else \"valid\"]\n", |
| 149 | + "\n", |
| 150 | + "\n", |
| 151 | + " def __len__(self) -> int:\n", |
| 152 | + " return len(self.split)\n", |
| 153 | + " \n", |
| 154 | + " def __getitem__(self, idx: int) -> Tuple[Any, Any]:\n", |
| 155 | + " # get the row index (idx) from the dataframe and \n", |
| 156 | + " # select relevant column features (provided as input_keys)\n", |
| 157 | + " feats = self.split.iloc[idx][self.input_keys]\n", |
| 158 | + "\n", |
| 159 | + " # this gives a 'species' i.e. one of ('Gentoo',), ('Chinstrap',), or ('Adelie',) \n", |
| 160 | + " tgts = self.split.iloc[idx][self.target_keys]\n", |
| 161 | + "\n", |
| 162 | + " # Exercise #1: convert the feats to PyTorch\n", |
| 163 | + " feats = tensor(feats.values, dtype=float32)\n", |
| 164 | + "\n", |
| 165 | + " # Exercise #2: convert this to a 'one-hot vector' \n", |
| 166 | + " target_names = sorted(self.full_df.species.unique())\n", |
| 167 | + " \n", |
| 168 | + " tgts = eye(len(target_names))[target_names.index(tgts.values[0])]\n", |
| 169 | + " \n", |
| 170 | + " return (feats, tgts)" |
| 171 | + ] |
| 172 | + }, |
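|  |  | + { |
|  |  | + "cell_type": "markdown", |
|  |  | + "metadata": {}, |
|  |  | + "source": [ |
|  |  | + "To see the one-hot trick from ``__getitem__`` in isolation: ``eye(3)`` is the 3x3 identity matrix, so indexing one of its rows yields the one-hot vector for that class. (A standalone illustration, with the species names hard-coded for clarity.)\n" |
|  |  | + ] |
|  |  | + }, |
|  |  | + { |
|  |  | + "cell_type": "code", |
|  |  | + "execution_count": null, |
|  |  | + "metadata": {}, |
|  |  | + "outputs": [], |
|  |  | + "source": [ |
|  |  | + "from torch import eye\n", |
|  |  | + "\n", |
|  |  | + "target_names = [\"Adelie\", \"Chinstrap\", \"Gentoo\"]\n", |
|  |  | + "\n", |
|  |  | + "# Row 2 of the 3x3 identity: the one-hot vector for 'Gentoo'.\n", |
|  |  | + "eye(len(target_names))[target_names.index(\"Gentoo\")]" |
|  |  | + ] |
|  |  | + }, |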
78 | 173 | { |
79 | 174 | "cell_type": "markdown", |
80 | 175 | "metadata": {}, |
|
93 | 188 | }, |
94 | 189 | { |
95 | 190 | "cell_type": "code", |
96 | | - "execution_count": null, |
| 191 | + "execution_count": 109, |
97 | 192 | "metadata": {}, |
98 | | - "outputs": [], |
| 193 | + "outputs": [ |
| 194 | + { |
| 195 | + "data": { |
| 196 | + "text/plain": [ |
| 197 | + "(tensor([ 42.9000, 5000.0000]), tensor([0., 0., 1.]))" |
| 198 | + ] |
| 199 | + }, |
| 200 | + "execution_count": 109, |
| 201 | + "metadata": {}, |
| 202 | + "output_type": "execute_result" |
| 203 | + } |
| 204 | + ], |
99 | 205 | "source": [ |
100 | | - "from ml_workshop import PenguinDataset\n", |
| 206 | + "# from ml_workshop import PenguinDataset\n", |
101 | 207 | "\n", |
102 | | - "data_set = PenguinDataset(\n", |
| 208 | + "data_set_1 = PenguinDataset(\n", |
103 | 209 | " input_keys=[\"bill_length_mm\", \"body_mass_g\"],\n", |
104 | 210 | " target_keys=[\"species\"],\n", |
105 | 211 | " train=True,\n", |
106 | 212 | ")\n", |
107 | 213 | "\n", |
108 | 214 | "\n", |
109 | | - "for features, target in data_set:\n", |
110 | | - " # print the features and targets here\n", |
111 | | - " pass" |
| 215 | + "# for features, target in data_set:\n", |
| 216 | + "# # print the features and targets here\n", |
| 217 | + "# print(features, target)\n", |
| 218 | + "\n", |
| 219 | + "\n", |
| 220 | + "data_set_1[0]" |
112 | 221 | ] |
113 | 222 | }, |
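|  |  | + { |
|  |  | + "cell_type": "markdown", |
|  |  | + "metadata": {}, |
|  |  | + "source": [ |
|  |  | + "Because ``PenguinDataset`` is a ``torch.utils.data.Dataset``, it plugs straight into a ``DataLoader``, which handles batching and shuffling for us. A minimal sketch (the batch size of 16 is an arbitrary illustrative choice):\n" |
|  |  | + ] |
|  |  | + }, |
|  |  | + { |
|  |  | + "cell_type": "code", |
|  |  | + "execution_count": null, |
|  |  | + "metadata": {}, |
|  |  | + "outputs": [], |
|  |  | + "source": [ |
|  |  | + "from torch.utils.data import DataLoader\n", |
|  |  | + "\n", |
|  |  | + "# Wrap the dataset to draw shuffled mini-batches of (features, targets).\n", |
|  |  | + "train_loader = DataLoader(data_set_1, batch_size=16, shuffle=True)\n", |
|  |  | + "\n", |
|  |  | + "batch_feats, batch_tgts = next(iter(train_loader))\n", |
|  |  | + "batch_feats.shape, batch_tgts.shape" |
|  |  | + ] |
|  |  | + }, |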
114 | 223 | { |
|
417 | 526 | "name": "python", |
418 | 527 | "nbconvert_exporter": "python", |
419 | 528 | "pygments_lexer": "ipython3", |
420 | | - "version": "3.11.4" |
| 529 | + "version": "3.12.4" |
421 | 530 | } |
422 | 531 | }, |
423 | 532 | "nbformat": 4, |
|