Correct merge issues in exercise 01 (penguin classification notebook)

ma595 · ma595 · commit 3bc9fb699f72 · 2025-07-08T15:27:10.000+01:00
diff --git a/exercises/01_penguin_classification.ipynb b/exercises/01_penguin_classification.ipynb
@@ -20,7 +20,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Setup\n",
+    "### Colab Setup\n",
     "Run the following cell to install the code and dependencies from github."
    ]
   },
@@ -37,7 +37,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Task 1: look at the data\n",
+    "### Task 1 -- Part (a): look at the data\n",
     "In the following code block, we import the ``load_penguins`` function from the ``palmerpenguins`` package.\n",
     "\n",
     "- Call this function, which returns a single object, and assign it to the variable ``data``.\n",
@@ -296,117 +296,6 @@
     "        return feats, tgt"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from typing import List, Tuple, Any\n",
-    "\n",
-    "# import some useful functions here, see https://pytorch.org/docs/stable/torch.html\n",
-    "# where `tensor` and `eye` are used for constructing tensors,\n",
-    "# and using a lower-precision float32 is advised for performance\n",
-    "# Task 4: add imports here\n",
-    "# from torch import tensor, eye, float32\n",
-    "\n",
-    "from torch.utils.data import Dataset\n",
-    "\n",
-    "from palmerpenguins import load_penguins\n",
-    "\n",
-    "\n",
-    "class PenguinDataset(Dataset):\n",
-    "    \"\"\"Penguin dataset class.\n",
-    "\n",
-    "    Parameters\n",
-    "    ----------\n",
-    "    input_keys : List[str]\n",
-    "        The column titles to use in the input feature vectors.\n",
-    "    target_keys : List[str]\n",
-    "        The column titles to use in the target feature vectors.\n",
-    "    train : bool\n",
-    "        If ``True``, this object will serve as the training set, and if\n",
-    "        ``False``, the validation set.\n",
-    "\n",
-    "    Notes\n",
-    "    -----\n",
-    "    The validation split contains 10 male and 10 female penguins of each\n",
-    "    species.\n",
-    "\n",
-    "    \"\"\"\n",
-    "\n",
-    "    def __init__(\n",
-    "        self,\n",
-    "        input_keys: List[str],\n",
-    "        target_keys: List[str],\n",
-    "        train: bool,\n",
-    "    ):\n",
-    "        \"\"\"Build ``PenguinDataset``.\"\"\"\n",
-    "        self.input_keys = input_keys\n",
-    "        self.target_keys = target_keys\n",
-    "\n",
-    "        data = load_penguins()\n",
-    "        data = (\n",
-    "            data.loc[~data.isna().any(axis=1)]\n",
-    "            .sort_values(by=sorted(data.keys()))\n",
-    "            .reset_index(drop=True)\n",
-    "        )\n",
-    "        # Transform the sex field into a float, with male represented by 1.0, female by 0.0\n",
-    "        data.sex = (data.sex == \"male\").astype(float)\n",
-    "        self.full_df = data\n",
-    "\n",
-    "        valid_df = self.full_df.groupby(by=[\"species\", \"sex\"]).sample(\n",
-    "            n=10,\n",
-    "            random_state=123,\n",
-    "        )\n",
-    "        # The training items are simply the items *not* in the valid split\n",
-    "        train_df = self.full_df.loc[~self.full_df.index.isin(valid_df.index)]\n",
-    "\n",
-    "        self.split = {\"train\": train_df, \"valid\": valid_df}[\n",
-    "            \"train\" if train is True else \"valid\"\n",
-    "        ]\n",
-    "\n",
-    "    def __len__(self) -> int:\n",
-    "        \"\"\"Return the length of requested split.\n",
-    "\n",
-    "        Returns\n",
-    "        -------\n",
-    "        int\n",
-    "            The number of items in the dataset.\n",
-    "\n",
-    "        \"\"\"\n",
-    "        return len(self.split)\n",
-    "\n",
-    "    def __getitem__(self, idx: int) -> Tuple[Any, Any]:\n",
-    "        \"\"\"Return an input-target pair.\n",
-    "\n",
-    "        Parameters\n",
-    "        ----------\n",
-    "        idx : int\n",
-    "            Index of the input-target pair to return.\n",
-    "\n",
-    "        Returns\n",
-    "        -------\n",
-    "        in_feats : Any\n",
-    "            Inputs.\n",
-    "        target : Any\n",
-    "            Targets.\n",
-    "\n",
-    "        \"\"\"\n",
-    "        # get the row index (idx) from the dataframe and\n",
-    "        # select relevant column features (provided as input_keys)\n",
-    "        feats = tuple(self.split.iloc[idx][self.input_keys])\n",
-    "\n",
-    "        # this gives a 'species' i.e. one of ('Gentoo',), ('Chinstrap',), or ('Adelie',)\n",
-    "        tgts = tuple(self.split.iloc[idx][self.target_keys])\n",
-    "\n",
-    "        # Task 4 - Exercise #1: convert the features to PyTorch Tensors\n",
-    "\n",
-    "        # Task 4 - Exercise #2: convert target to a 'one-hot' vector.\n",
-    "\n",
-    "        return feats, tgts"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -518,28 +407,6 @@
     "Instantiate the `torchvision.transforms.Compose` transformations and pass them to the `PenguinDataset` in [src/ml_workshop/_penguins.py](../src/ml_workshop/_penguins.py), instead of hardcoding them as above. "
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Apply transforms to the data. See Task 4 exercise comments above.\n",
-    "\n",
-    "# Create train_set\n",
-    "\n",
-    "# Create valid_set\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### (Optional) Task 4b: \n",
-    "\n",
-    "Apply the `torchvision.transforms.Compose` transformations instead of hardcoding as above. "
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,