From 724a7e1e8d872b7fa887d835f6c848019c554bb8 Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Thu, 4 Jul 2024 16:47:28 +0100 Subject: [PATCH 1/9] Update the classification solutions notebook to utilise the PenguinDataset defined within the notebook itself --- .../01_penguin_classification_solutions.ipynb | 361 ++++++------------ 1 file changed, 117 insertions(+), 244 deletions(-) diff --git a/worked-solutions/01_penguin_classification_solutions.ipynb b/worked-solutions/01_penguin_classification_solutions.ipynb index fcab877..bfb814d 100644 --- a/worked-solutions/01_penguin_classification_solutions.ipynb +++ b/worked-solutions/01_penguin_classification_solutions.ipynb @@ -32,38 +32,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " bill_length_mm bill_depth_mm flipper_length_mm body_mass_g \\\n", - "count 342.000000 342.000000 342.000000 342.000000 \n", - "mean 43.921930 17.151170 200.915205 4201.754386 \n", - "std 5.459584 1.974793 14.061714 801.954536 \n", - "min 32.100000 13.100000 172.000000 2700.000000 \n", - "25% 39.225000 15.600000 190.000000 3550.000000 \n", - "50% 44.450000 17.300000 197.000000 4050.000000 \n", - "75% 48.500000 18.700000 213.000000 4750.000000 \n", - "max 59.600000 21.500000 231.000000 6300.000000 \n", - "\n", - " year \n", - "count 344.000000 \n", - "mean 2008.029070 \n", - "std 0.818356 \n", - "min 2007.000000 \n", - "25% 2007.000000 \n", - "50% 2008.000000 \n", - "75% 2009.000000 \n", - "max 2009.000000 \n", - "Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',\n", - " 'flipper_length_mm', 'body_mass_g', 'sex', 'year'],\n", - " dtype='object')\n" - ] - } - ], + "outputs": [], "source": [ "from palmerpenguins import load_penguins\n", "\n", @@ -108,6 +79,14 @@ "source": [ "### Task 2: creating a ``torch.utils.data.Dataset``\n", "\n", + "The penguin data reading and processing can be encapsulated in a 
PyTorch dataset class.\n", + "\n", + "- Why is this class representation helpful?\n", + " - Modularity - Separation of concerns makes the code easier to understand, maintain and test.\n", + " - Maintainability - Changes are localised, therefore we only need to change a single file to update. \n", + " - Abstraction - Users do not need to know how the data is read or processed, they only need to know how to interact with the class. \n", + "\n", + "\n", "All PyTorch dataset objects are subclasses of the ``torch.utils.data.Dataset`` class. To make a custom dataset, create a class which inherits from the ``Dataset`` class, implement some methods (the Python magic (or dunder) methods ``__len__`` and ``__getitem__``) and supply some data.\n", "\n", "Spoiler alert: we've done this for you already in ``src/ml_workshop/_penguins.py``.\n", @@ -123,9 +102,79 @@ "- Review and discuss the class arguments.\n", " - ``input_keys``— A sequence of strings telling the data set which objects to return as inputs to the model.\n", " - ``target_keys``— Same as ``input_keys`` but specifying the targets.\n", - " - ``train``— A boolean variable determining if the model returns the training or validation split (``True`` for training).\n", - " - ``x_tfms``— A ``Compose`` object with functions which will convert the raw input to a tensor. This argument is _optional_.\n", - " - ``y_tfms``— A ``Compose`` object with functions which will convert the raw target to a tensor. This argument is _optional_." + " - ``train``— A boolean variable determining if the model returns the training or validation split (``True`` for training)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Optional, List, Dict, Tuple, Any\n", + "\n", + "# import pytorch functions necessary for transformations:\n", + "from torch import tensor, float32, eye\n", + "\n", + "from torch.utils.data import Dataset\n", + "from torchvision.transforms import Compose\n", + "\n", + "from pandas import DataFrame\n", + "\n", + "from palmerpenguins import load_penguins\n", + "\n", + "\n", + "class PenguinDataset(Dataset):\n", + " def __init__(\n", + " self,\n", + " input_keys: List[str],\n", + " target_keys: List[str],\n", + " train: bool,\n", + " ):\n", + " \"\"\"Build ``PenguinDataset``.\"\"\"\n", + " self.input_keys = input_keys\n", + " self.target_keys = target_keys\n", + "\n", + " data = load_penguins()\n", + " data = (\n", + " data.loc[~data.isna().any(axis=1)]\n", + " .sort_values(by=sorted(data.keys()))\n", + " .reset_index(drop=True)\n", + " )\n", + " # Transform the sex field into a float, with male represented by 1.0, female by 0.0\n", + " data.sex = (data.sex == \"male\").astype(float)\n", + " self.full_df = data\n", + "\n", + " valid_df = self.full_df.groupby(by=[\"species\", \"sex\"]).sample(\n", + " n=10,\n", + " random_state=123,\n", + " )\n", + " # The training items are simply the items *not* in the valid split\n", + " train_df = self.full_df.loc[~self.full_df.index.isin(valid_df.index)]\n", + "\n", + " self.split = {\"train\": train_df, \"valid\": valid_df}[\n", + " \"train\" if train is True else \"valid\"\n", + " ]\n", + "\n", + " def __len__(self) -> int:\n", + " return len(self.split)\n", + "\n", + " def __getitem__(self, idx: int) -> Tuple[Any, Any]:\n", + " # get the row index (idx) from the dataframe and\n", + " # select relevant column features (provided as input_keys)\n", + " feats = tuple(self.split.iloc[idx][self.input_keys])\n", + "\n", + " # this gives a 'species' i.e. 
one of ('Gentoo',), ('Chinstrap',), or ('Adelie',)\n", + " tgts = tuple(self.split.iloc[idx][self.target_keys])\n", + "\n", + " # Exercise #1: convert the feats (Series) to PyTorch Tensors\n", + " feats = tensor(feats, dtype=float32)\n", + "\n", + " # Exercise #2: convert target to a 'one-hot' vector.\n", + " target_names = sorted(self.full_df.species.unique())\n", + " tgts = eye(len(target_names))[target_names.index(tgts[0])]\n", + "\n", + " return feats, tgts" ] }, { @@ -146,39 +195,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(42.9, 13.1, 5000.0, 215.0, 0.0) ('Gentoo',)\n", - "(46.1, 13.2, 4500.0, 211.0, 0.0) ('Gentoo',)\n", - "(44.9, 13.3, 5100.0, 213.0, 0.0) ('Gentoo',)\n", - "(43.3, 13.4, 4400.0, 209.0, 0.0) ('Gentoo',)\n", - "(42.0, 13.5, 4150.0, 210.0, 0.0) ('Gentoo',)\n", - "(46.5, 13.5, 4550.0, 210.0, 0.0) ('Gentoo',)\n", - "(44.0, 13.6, 4350.0, 208.0, 0.0) ('Gentoo',)\n", - "(40.9, 13.7, 4650.0, 214.0, 0.0) ('Gentoo',)\n", - "(42.6, 13.7, 4950.0, 213.0, 0.0) ('Gentoo',)\n", - "(42.7, 13.7, 3950.0, 208.0, 0.0) ('Gentoo',)\n", - "(45.3, 13.7, 4300.0, 210.0, 0.0) ('Gentoo',)\n", - "(47.2, 13.7, 4925.0, 214.0, 0.0) ('Gentoo',)\n", - "(45.2, 13.8, 4750.0, 215.0, 0.0) ('Gentoo',)\n", - "(43.6, 13.9, 4900.0, 217.0, 0.0) ('Gentoo',)\n", - "(43.8, 13.9, 4300.0, 208.0, 0.0) ('Gentoo',)\n", - "(45.5, 13.9, 4200.0, 210.0, 0.0) ('Gentoo',)\n", - "(45.7, 13.9, 4400.0, 214.0, 0.0) ('Gentoo',)\n", - "(43.3, 14.0, 4575.0, 208.0, 0.0) ('Gentoo',)\n", - "(47.5, 14.0, 4875.0, 212.0, 0.0) ('Gentoo',)\n", - "(46.2, 14.1, 4375.0, 217.0, 0.0) ('Gentoo',)\n" - ] - } - ], + "outputs": [], "source": [ - "from ml_workshop import PenguinDataset\n", - "\n", "features = [\n", " \"bill_length_mm\",\n", " \"bill_depth_mm\",\n", @@ -196,16 +216,21 @@ ")\n", "\n", "for _, (input_feats, target) in zip(range(20), data_set):\n", - " print(input_feats, 
target)" + " print(input_feats, target)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- Can we give these items to a neural network, or do they need to be transformed first?\n", - " - Short answer: no, we can't just pass tuples of numbers or strings to a neural network.\n", - " - We must represent these data as ``torch.Tensor``s." + "* Can we give these items to a neural network, or do they need to be transformed first?\n", + " + Short answer: no, we can't just pass tuples of numbers or strings to a neural network.\n", + " - We must represent these data as ``torch.Tensor``s. This is the fundamental data abstraction used by PyTorch. See [pytorch tensors documentation](https://pytorch.org/tutorials/beginner/introyt/tensors_deeper_tutorial.html)\n", + " + The targets are tuples of strings i.e. ('Gentoo', )\n", + " - One idea is to represent as ordinal values i.e. [1] or [2] or [3]. But this implies that the class encoded by value 1 is closer to 2 than 1 is to 3. This is not desirable for categorical data. One-hot encoding avoids this by representing each species independently.\\\n", + " \"A\" — [1, 0, 0]\\\n", + " \"B\" — [0, 1, 0]\\\n", + " \"C\" — [0, 0, 1]\n" ] }, { @@ -214,33 +239,22 @@ "source": [ "### Task 4: Applying transforms to the data\n", "\n", - "A common way of transforming inputs to neural networks is to apply a series of transforms using ``torchvision.transforms.Compose``. The [``Compose``](https://pytorch.org/vision/stable/generated/torchvision.transforms.Compose.html) object takes a list of callable objects and applies them to the incoming data.\n", - "\n", - "These transforms can be very useful for mapping between file paths and tensors of images, etc.\n", - "\n", - "- Note: here we create a training and validation set.\n", + "- Here we create a training and validation set.\n", " - We allow the model to learn directly from the training set—i.e. 
we fit the function to these data.\n", " - During training, we monitor the model's performance on the validation set in order to check how it's doing on unseen data. Normally, people use the validation performance to determine when to stop the training process.\n", - "- For the validation set, we choose ten males and ten females of each species. This means the validation set is less likely to be biased by sex and species, and is potentially a more reliable measure of performance. You should always be _very_ careful when choosing metrics and splitting data." + "- For the validation set, we choose ten males and ten females of each species. This means the validation set is less likely to be biased by sex and species, and is potentially a more reliable measure of performance. You should always be _very_ careful when choosing metrics and splitting data.\n", + "\n", + "\n", + "Note: A common way of transforming inputs to neural networks is to apply a series of transforms using ``torchvision.transforms.Compose``. The [``Compose``](https://pytorch.org/vision/stable/generated/torchvision.transforms.Compose.html) object takes a list of callable objects and applies them to the incoming data. See how this is done more generally in the `src/ml_workshop/_penguins.py` file. \n", + "\n", + "These transforms can be very useful for mapping between file paths and tensors of images, etc." 
] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([ 42.9000, 13.1000, 5000.0000, 215.0000, 0.0000]) tensor([0., 0., 1.])\n", - "tensor([ 46.1000, 13.2000, 4500.0000, 211.0000, 0.0000]) tensor([0., 0., 1.])\n", - "tensor([ 44.9000, 13.3000, 5100.0000, 213.0000, 0.0000]) tensor([0., 0., 1.])\n", - "tensor([ 43.3000, 13.4000, 4400.0000, 209.0000, 0.0000]) tensor([0., 0., 1.])\n", - "tensor([ 42.0000, 13.5000, 4150.0000, 210.0000, 0.0000]) tensor([0., 0., 1.])\n" - ] - } - ], + "outputs": [], "source": [ "from torchvision.transforms import Compose\n", "\n", @@ -249,58 +263,10 @@ "# and using a lower-precision float32 is advised for performance\n", "from torch import tensor, float32, eye\n", "\n", - "\n", - "# Apply the transforms we need to the PenguinDataset to get out input\n", - "# targets as Tensors.\n", - "\n", - "\n", - "def get_input_transforms() -> Compose:\n", - " \"\"\"Return transforms which map from raw inputs to tensors.\n", - "\n", - " Returns\n", - " -------\n", - " Compose\n", - " A composition of transforms (callable functions) to map the tuple\n", - " of input features (``Tuple[float, ...]``) to a ``torch.Tensor``.\n", - "\n", - " Notes\n", - " -----\n", - " To create a ``torch.Tensor`` we can use ``torch.tensor([1.0, 2.0, ...])``\n", - "\n", - " \"\"\"\n", - " return Compose([lambda x: tensor(x, dtype=float32)])\n", - "\n", - "\n", - "def get_target_tfms() -> Compose:\n", - " \"\"\"Return transforms which map from the raw target strings to tensor.\n", - " Returns\n", - " -------\n", - " Compose\n", - " A composition of transforms (callable functions) to map the tuple\n", - " of input features (``Tuple[str]``) to a ``torch.Tensor``.\n", - "\n", - " Notes\n", - " -----\n", - " Suppose we have three labels, \"A\", \"B\" and \"C\". We want to encoder each\n", - " distinct label as a one-hot-encoded vector. 
A natural way to do this is:\n", - " - \"A\" — [1, 0, 0]\n", - " - \"B\" — [0, 1, 0]\n", - " - \"C\" — [0, 0, 1]\n", - "\n", - " The transforms this function produces will return these vectors as tensors.\n", - " Note also, in the example we have just given, A's vector was the first row\n", - " in the identity matrix, B's the second, etc.\n", - "\n", - " \"\"\"\n", - " return Compose([lambda x: eye(len(target_names))[target_names.index(x[0])]])\n", - "\n", - "\n", "train_set = PenguinDataset(\n", " input_keys=features,\n", " target_keys=[\"species\"],\n", " train=True,\n", - " x_tfms=get_input_transforms(),\n", - " y_tfms=get_target_tfms(),\n", ")\n", "\n", "\n", @@ -308,8 +274,6 @@ " input_keys=features,\n", " target_keys=[\"species\"],\n", " train=False,\n", - " x_tfms=get_input_transforms(),\n", - " y_tfms=get_target_tfms(),\n", ")\n", "\n", "\n", @@ -336,20 +300,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([16, 5]) torch.Size([16, 3])\n", - "torch.Size([16, 5]) torch.Size([16, 3])\n", - "torch.Size([16, 5]) torch.Size([16, 3])\n", - "torch.Size([12, 5]) torch.Size([12, 3])\n" - ] - } - ], + "outputs": [], "source": [ "from torch.utils.data import DataLoader\n", "\n", @@ -387,30 +340,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "FCNet(\n", - " (_fwd_seq): Sequential(\n", - " (0): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (1): Linear(in_features=5, out_features=16, bias=True)\n", - " (2): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (3): Dropout(p=0.1, inplace=False)\n", - " (4): LeakyReLU(negative_slope=0.1)\n", - " (5): Linear(in_features=16, out_features=16, bias=True)\n", - " (6): BatchNorm1d(16, 
eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (7): Dropout(p=0.1, inplace=False)\n", - " (8): LeakyReLU(negative_slope=0.1)\n", - " (9): Linear(in_features=16, out_features=3, bias=True)\n", - " )\n", - ")\n" - ] - } - ], + "outputs": [], "source": [ "from torch.nn import Module\n", "from torch.nn import BatchNorm1d, Linear, LeakyReLU, Dropout, Sequential\n", @@ -485,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -507,7 +439,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -535,7 +467,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -692,39 +624,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 0-25 time: 1.934173 seconds\n", - "Epoch 25-50 time: 1.844448 seconds\n", - "Epoch 50-75 time: 1.831056 seconds\n", - "Epoch 75-100 time: 1.817979 seconds\n", - "Epoch 100-125 time: 1.822820 seconds\n", - "Epoch 125-150 time: 1.842434 seconds\n", - "Epoch 150-175 time: 1.967782 seconds\n", - "\n", - "\n", - " loss_train accuracy_train loss_valid accuracy_valid\n", - "0 0.578070 0.496324 0.586362 0.484375\n", - "1 0.490388 0.742647 0.495531 0.750000\n", - "2 0.417000 0.819853 0.406423 0.781250\n", - "3 0.371912 0.841912 0.356070 0.828125\n", - "4 0.325209 0.871324 0.310226 0.890625\n", - ".. ... ... ... 
...\n", - "195 0.019916 0.988971 0.026766 0.984375\n", - "196 0.021192 0.988971 0.023146 0.984375\n", - "197 0.022928 0.988971 0.024764 0.984375\n", - "198 0.023786 0.985294 0.026085 0.984375\n", - "199 0.023932 0.981618 0.031793 0.984375\n", - "\n", - "[200 rows x 4 columns]\n" - ] - } - ], + "outputs": [], "source": [ "from time import perf_counter\n", "\n", @@ -774,20 +676,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "from numpy import linspace\n", @@ -832,27 +723,9 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Raw input:\n", - "tensor([[4.2900e+01, 1.3100e+01, 5.0000e+03, 2.1500e+02, 0.0000e+00],\n", - " [3.3600e+01, 1.1300e+01, 2.0000e+03, 2.1100e+02, 1.0000e+00]])\n", - "\n", - "Raw output:\n", - "tensor([[2.4082e-05, 4.3393e-06, 9.9997e-01],\n", - " [8.5355e-01, 6.9033e-06, 1.4644e-01]])\n", - "\n", - "Predicted species:\n", - "['Gentoo', 'Adelie']\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "from torch import no_grad\n", "\n", @@ -894,7 +767,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.2" } }, "nbformat": 4, From c07528baf5c07e65074de03b7ac6a55c78dbd3ce Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Thu, 4 Jul 2024 17:20:28 +0100 Subject: [PATCH 2/9] Add solution outputs --- .../01_penguin_classification_solutions.ipynb | 199 ++++++++++++++++-- 1 file changed, 179 insertions(+), 20 deletions(-) diff --git a/worked-solutions/01_penguin_classification_solutions.ipynb b/worked-solutions/01_penguin_classification_solutions.ipynb index bfb814d..ffee52f 100644 --- a/worked-solutions/01_penguin_classification_solutions.ipynb +++ b/worked-solutions/01_penguin_classification_solutions.ipynb @@ -32,9 +32,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " bill_length_mm bill_depth_mm flipper_length_mm body_mass_g \\\n", + "count 342.000000 342.000000 342.000000 342.000000 \n", + "mean 43.921930 17.151170 200.915205 4201.754386 \n", + "std 5.459584 1.974793 14.061714 801.954536 \n", + 
"min 32.100000 13.100000 172.000000 2700.000000 \n", + "25% 39.225000 15.600000 190.000000 3550.000000 \n", + "50% 44.450000 17.300000 197.000000 4050.000000 \n", + "75% 48.500000 18.700000 213.000000 4750.000000 \n", + "max 59.600000 21.500000 231.000000 6300.000000 \n", + "\n", + " year \n", + "count 344.000000 \n", + "mean 2008.029070 \n", + "std 0.818356 \n", + "min 2007.000000 \n", + "25% 2007.000000 \n", + "50% 2008.000000 \n", + "75% 2009.000000 \n", + "max 2009.000000 \n", + "Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',\n", + " 'flipper_length_mm', 'body_mass_g', 'sex', 'year'],\n", + " dtype='object')\n" + ] + } + ], "source": [ "from palmerpenguins import load_penguins\n", "\n", @@ -107,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -195,9 +224,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 62, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 42.9000, 13.1000, 5000.0000, 215.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 46.1000, 13.2000, 4500.0000, 211.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 44.9000, 13.3000, 5100.0000, 213.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 43.3000, 13.4000, 4400.0000, 209.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 42.0000, 13.5000, 4150.0000, 210.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 46.5000, 13.5000, 4550.0000, 210.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 44.0000, 13.6000, 4350.0000, 208.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 40.9000, 13.7000, 4650.0000, 214.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 42.6000, 13.7000, 4950.0000, 213.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 42.7000, 13.7000, 3950.0000, 208.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 45.3000, 13.7000, 4300.0000, 210.0000, 0.0000]) tensor([0., 0., 
1.])\n", + "tensor([ 47.2000, 13.7000, 4925.0000, 214.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 45.2000, 13.8000, 4750.0000, 215.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 43.6000, 13.9000, 4900.0000, 217.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 43.8000, 13.9000, 4300.0000, 208.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 45.5000, 13.9000, 4200.0000, 210.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 45.7000, 13.9000, 4400.0000, 214.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 43.3000, 14.0000, 4575.0000, 208.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 47.5000, 14.0000, 4875.0000, 212.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 46.2000, 14.1000, 4375.0000, 217.0000, 0.0000]) tensor([0., 0., 1.])\n" + ] + } + ], "source": [ "features = [\n", " \"bill_length_mm\",\n", @@ -252,9 +308,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 63, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 42.9000, 13.1000, 5000.0000, 215.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 46.1000, 13.2000, 4500.0000, 211.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 44.9000, 13.3000, 5100.0000, 213.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 43.3000, 13.4000, 4400.0000, 209.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 42.0000, 13.5000, 4150.0000, 210.0000, 0.0000]) tensor([0., 0., 1.])\n" + ] + } + ], "source": [ "from torchvision.transforms import Compose\n", "\n", @@ -300,9 +368,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 64, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([16, 5]) torch.Size([16, 3])\n", + "torch.Size([16, 5]) torch.Size([16, 3])\n", + "torch.Size([16, 5]) torch.Size([16, 3])\n", + "torch.Size([12, 5]) torch.Size([12, 3])\n" + ] + } + ], "source": [ "from torch.utils.data 
import DataLoader\n", "\n", @@ -340,9 +419,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 65, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FCNet(\n", + " (_fwd_seq): Sequential(\n", + " (0): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (1): Linear(in_features=5, out_features=16, bias=True)\n", + " (2): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (3): Dropout(p=0.1, inplace=False)\n", + " (4): LeakyReLU(negative_slope=0.1)\n", + " (5): Linear(in_features=16, out_features=16, bias=True)\n", + " (6): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (7): Dropout(p=0.1, inplace=False)\n", + " (8): LeakyReLU(negative_slope=0.1)\n", + " (9): Linear(in_features=16, out_features=3, bias=True)\n", + " )\n", + ")\n" + ] + } + ], "source": [ "from torch.nn import Module\n", "from torch.nn import BatchNorm1d, Linear, LeakyReLU, Dropout, Sequential\n", @@ -417,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -439,7 +539,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -467,7 +567,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -624,9 +724,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 69, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 0-25 time: 11.470797 seconds\n", + "Epoch 25-50 time: 10.322494 seconds\n", + "Epoch 50-75 time: 8.206659 seconds\n", + "Epoch 75-100 time: 9.680337 seconds\n", + "Epoch 100-125 time: 9.795119 seconds\n", + "Epoch 125-150 time: 10.032203 seconds\n", + "Epoch 150-175 time: 
9.441857 seconds\n", + "\n", + "\n", + " loss_train accuracy_train loss_valid accuracy_valid\n", + "0 0.630636 0.507353 0.607762 0.500000\n", + "1 0.535187 0.742647 0.533367 0.640625\n", + "2 0.464604 0.790441 0.462660 0.671875\n", + "3 0.409597 0.797794 0.404585 0.687500\n", + "4 0.352028 0.808824 0.362779 0.687500\n", + ".. ... ... ... ...\n", + "195 0.017349 0.992647 0.023540 0.984375\n", + "196 0.018951 0.988971 0.020961 0.984375\n", + "197 0.037160 0.985294 0.022133 0.984375\n", + "198 0.032350 0.988971 0.027387 0.984375\n", + "199 0.016947 0.992647 0.025869 0.984375\n", + "\n", + "[200 rows x 4 columns]\n" + ] + } + ], "source": [ "from time import perf_counter\n", "\n", @@ -676,9 +806,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 70, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import matplotlib.pyplot as plt\n", "from numpy import linspace\n", @@ -723,9 +864,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 71, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Raw input:\n", + "tensor([[4.2900e+01, 1.3100e+01, 5.0000e+03, 2.1500e+02, 0.0000e+00],\n", + " [3.3600e+01, 1.1300e+01, 2.0000e+03, 2.1100e+02, 1.0000e+00]])\n", + "\n", + "Raw output:\n", + "tensor([[7.7734e-06, 3.7739e-08, 9.9999e-01],\n", + " [9.9173e-01, 2.2448e-06, 8.2675e-03]])\n", + "\n", + "Predicted species:\n", + "['Gentoo', 'Adelie']\n", + "\n" + ] + } + ], "source": [ "from torch import no_grad\n", "\n", From 43b80d6cde9e46be644fe82db017d552629caa6e Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Thu, 4 Jul 2024 17:24:11 +0100 Subject: [PATCH 3/9] Add 'task 4' to exercises --- worked-solutions/01_penguin_classification_solutions.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/worked-solutions/01_penguin_classification_solutions.ipynb b/worked-solutions/01_penguin_classification_solutions.ipynb index ffee52f..354aa95 100644 --- a/worked-solutions/01_penguin_classification_solutions.ipynb +++ b/worked-solutions/01_penguin_classification_solutions.ipynb @@ -196,10 +196,10 @@ " # this gives a 'species' i.e. 
one of ('Gentoo',), ('Chinstrap',), or ('Adelie',)\n", " tgts = tuple(self.split.iloc[idx][self.target_keys])\n", "\n", - " # Exercise #1: convert the feats (Series) to PyTorch Tensors\n", + " # Task 4 - Exercise #1: convert the feats (Series) to PyTorch Tensors\n", " feats = tensor(feats, dtype=float32)\n", "\n", - " # Exercise #2: convert target to a 'one-hot' vector.\n", + " # Task 4 - Exercise #2: convert target to a 'one-hot' vector.\n", " target_names = sorted(self.full_df.species.unique())\n", " tgts = eye(len(target_names))[target_names.index(tgts[0])]\n", "\n", From 27d046429573395946cac804630288a368882bdd Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Thu, 4 Jul 2024 18:38:19 +0100 Subject: [PATCH 4/9] Add docstrings to class and re-add the Compose object approach as an optional exercise --- exercises/01_penguin_classification.ipynb | 175 ++++++++++++- .../01_penguin_classification_solutions.ipynb | 241 ++++++++++++++---- 2 files changed, 357 insertions(+), 59 deletions(-) diff --git a/exercises/01_penguin_classification.ipynb b/exercises/01_penguin_classification.ipynb index 95eb953..e219ac6 100644 --- a/exercises/01_penguin_classification.ipynb +++ b/exercises/01_penguin_classification.ipynb @@ -55,9 +55,13 @@ "source": [ "### Task 2: creating a ``torch.utils.data.Dataset``\n", "\n", + "The penguin data reading and processing can be encapsulated in a PyTorch dataset class.\n", + "\n", + "- Why is this helpful?\n", + "\n", "All PyTorch dataset objects are subclasses of the ``torch.utils.data.Dataset`` class. 
To make a custom dataset, create a class which inherits from the ``Dataset`` class, implement some methods (the Python magic (or dunder) methods ``__len__`` and ``__getitem__``) and supply some data.\n", "\n", - "Spoiler alert: we've done this for you already in ``src/ml_workshop/_penguins.py``.\n", + "Spoiler alert: we've done this for you already below (see ``src/ml_workshop/_penguins.py`` for a more sophisticated implementation)\n", "\n", "- Open the file ``src/ml_workshop/_penguins.py``.\n", "- Let's examine, and discuss, each of the methods together.\n", @@ -75,6 +79,117 @@ " - ``y_tfms``— ..." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List, Tuple, Any\n", + "\n", + "# import some useful functions here, see https://pytorch.org/docs/stable/torch.html\n", + "# where `tensor` and `eye` are used for constructing tensors,\n", + "# and using a lower-precision float32 is advised for performance\n", + "# Task 4: add imports here\n", + "# from torch import tensor, eye, float32\n", + "\n", + "from torch.utils.data import Dataset\n", + "\n", + "from palmerpenguins import load_penguins\n", + "\n", + "\n", + "class PenguinDataset(Dataset):\n", + " \"\"\"Penguin dataset class.\n", + "\n", + " Parameters\n", + " ----------\n", + " input_keys : List[str]\n", + " The column titles to use in the input feature vectors.\n", + " target_keys : List[str]\n", + " The column titles to use in the target feature vectors.\n", + " train : bool\n", + " If ``True``, this object will serve as the training set, and if\n", + " ``False``, the validation set.\n", + "\n", + " Notes\n", + " -----\n", + " The validation split contains 10 male and 10 female penguins of each\n", + " species.\n", + "\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " input_keys: List[str],\n", + " target_keys: List[str],\n", + " train: bool,\n", + " ):\n", + " \"\"\"Build ``PenguinDataset``.\"\"\"\n", + " 
self.input_keys = input_keys\n", + " self.target_keys = target_keys\n", + "\n", + " data = load_penguins()\n", + " data = (\n", + " data.loc[~data.isna().any(axis=1)]\n", + " .sort_values(by=sorted(data.keys()))\n", + " .reset_index(drop=True)\n", + " )\n", + " # Transform the sex field into a float, with male represented by 1.0, female by 0.0\n", + " data.sex = (data.sex == \"male\").astype(float)\n", + " self.full_df = data\n", + "\n", + " valid_df = self.full_df.groupby(by=[\"species\", \"sex\"]).sample(\n", + " n=10,\n", + " random_state=123,\n", + " )\n", + " # The training items are simply the items *not* in the valid split\n", + " train_df = self.full_df.loc[~self.full_df.index.isin(valid_df.index)]\n", + "\n", + " self.split = {\"train\": train_df, \"valid\": valid_df}[\n", + " \"train\" if train is True else \"valid\"\n", + " ]\n", + "\n", + " def __len__(self) -> int:\n", + " \"\"\"Return the length of requested split.\n", + "\n", + " Returns\n", + " -------\n", + " int\n", + " The number of items in the dataset.\n", + "\n", + " \"\"\"\n", + " return len(self.split)\n", + "\n", + " def __getitem__(self, idx: int) -> Tuple[Any, Any]:\n", + " \"\"\"Return an input-target pair.\n", + "\n", + " Parameters\n", + " ----------\n", + " idx : int\n", + " Index of the input-target pair to return.\n", + "\n", + " Returns\n", + " -------\n", + " in_feats : Any\n", + " Inputs.\n", + " target : Any\n", + " Targets.\n", + "\n", + " \"\"\"\n", + " # get the row index (idx) from the dataframe and\n", + " # select relevant column features (provided as input_keys)\n", + " feats = tuple(self.split.iloc[idx][self.input_keys])\n", + "\n", + " # this gives a 'species' i.e. 
one of ('Gentoo',), ('Chinstrap',), or ('Adelie',)\n", + " tgts = tuple(self.split.iloc[idx][self.target_keys])\n", + "\n", + " # Task 4 - Exercise #1: convert the features to PyTorch Tensors\n", + "\n", + " # Task 4 - Exercise #2: convert target to a 'one-hot' vector.\n", + "\n", + " return feats, tgts" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -97,8 +212,6 @@ "metadata": {}, "outputs": [], "source": [ - "from ml_workshop import PenguinDataset\n", - "\n", "data_set = PenguinDataset(\n", " input_keys=[\"bill_length_mm\", \"body_mass_g\"],\n", " target_keys=[\"species\"],\n", @@ -107,7 +220,7 @@ "\n", "\n", "for features, target in data_set:\n", - " # print the features and targets here\n", + " # print the features and targets her\n", " pass" ] }, @@ -117,7 +230,12 @@ "source": [ "- Can we give these items to a neural network, or do they need to be transformed first?\n", " - Short answer: no, we can't just pass tuples of numbers or strings to a neural network.\n", - " - We must represent these data as ``torch.Tensor``s." + " - We must represent these data as ``torch.Tensor``s. This is the fundamental data abstraction used by PyTorch. See [pytorch tensors documentation](https://pytorch.org/tutorials/beginner/introyt/tensors_deeper_tutorial.html).\n", + " - The targets are tuples of strings i.e. ('Gentoo', )\n", + " - One idea is to represent as ordinal values i.e. [1] or [2] or [3]. But this implies that the class encoded by value 1 is closer to 2 than 1 is to 3. This is not desirable for categorical data. One-hot encoding avoids this by representing each species independently.\\\n", + " \"A\" — [1, 0, 0]\\\n", + " \"B\" — [0, 1, 0]\\\n", + " \"C\" — [0, 0, 1]" ] }, { @@ -126,19 +244,50 @@ "source": [ "### Task 4: Applying transforms to the data\n", "\n", - "A common way of transforming inputs to neural networks is to apply a series of transforms using ``torchvision.transforms.Compose``. 
The [``Compose``](https://pytorch.org/vision/stable/generated/torchvision.transforms.Compose.html) object takes a list of callable objects (i.e., functions) and applies them to the incoming data.\n", + "Modify the `PenguinDataset` class above so that the tuples of numbers are converted to PyTorch `torch.Tensor` s and the string targets are converted to one-hot vectors.\n", + "\n", + "- Begin by importing relevant PyTorch functions.\n", + "- Apply transformations inside `__getitem__()` function above.\n", + "\n", + "Then create a training and validation set.\n", + "\n", + " - We allow the model to learn directly from the training set—i.e. we fit the function to these data.\n", + " - During training, we monitor the model's performance on the validation set in order to check how it's doing on unseen data. Normally, people use the validation performance to determine when to stop the training process.\n", + " \n", + "For the validation set, we choose ten males and ten females of each species. This means the validation set is less likely to be biased by sex and species, and is potentially a more reliable measure of performance. You should always be _very_ careful when choosing metrics and splitting data.\n", + "\n", + "- Is this solution general?\n", "\n", - "These transforms can be very useful for mapping between file paths and tensors of images, etc.\n", + "A common way of transforming inputs to neural networks is to apply a series of transforms using `torchvision.transforms.Compose`. The [ `Compose` ](https://pytorch.org/vision/stable/generated/torchvision.transforms.Compose.html) object takes a list of callable objects and applies them to the incoming data. See how this is done more generally in the `src/ml_workshop/_penguins.py` file. \n", "\n", - "- Note: here we create a training and validation set.\n", - " - We allow the model to learn directly from the training set — i.e. 
we fit the function to these data.\n", - " - During training, we monitor the model's performance on the validation set in order to check how it's doing on unseen data. Normally, people use the validation performance to determine when to stop the training process.\n", - "- For the validation set, we choose ten males and ten females of each species. This means the validation set is less likely to be biased by sex and species, and is potentially a more reliable measure of performance. You should always be _very_ careful when choosing metrics and splitting data." + "These transforms can be very useful for mapping between file paths and tensors of images, etc.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Apply transforms to the data. See Task 4 exercise comments above.\n", + "\n", + "# Create train_set\n", + "\n", + "# Create valid_set\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### (Optional) Task 4b: \n", + "\n", + "Apply the `torchvision.transforms.Compose` transformations instead of hardcoding as above. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -417,7 +566,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/worked-solutions/01_penguin_classification_solutions.ipynb b/worked-solutions/01_penguin_classification_solutions.ipynb index 354aa95..50fc762 100644 --- a/worked-solutions/01_penguin_classification_solutions.ipynb +++ b/worked-solutions/01_penguin_classification_solutions.ipynb @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -136,24 +136,42 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "from typing import Optional, List, Dict, Tuple, Any\n", + "from typing import List, Tuple, Any\n", "\n", - "# import pytorch functions necessary for transformations:\n", + "# import some useful functions here, see https://pytorch.org/docs/stable/torch.html\n", + "# where `tensor` and `eye` are used for constructing tensors,\n", + "# and using a lower-precision float32 is advised for performance\n", "from torch import tensor, float32, eye\n", "\n", "from torch.utils.data import Dataset\n", - "from torchvision.transforms import Compose\n", - "\n", - "from pandas import DataFrame\n", "\n", "from palmerpenguins import load_penguins\n", "\n", "\n", "class PenguinDataset(Dataset):\n", + " \"\"\"Penguin dataset class.\n", + "\n", + " Parameters\n", + " ----------\n", + " input_keys : List[str]\n", + " The column titles to use in the input feature vectors.\n", + " target_keys : List[str]\n", + " The column titles to use in the target feature vectors.\n", + " train : bool\n", + " If ``True``, this object will serve as the training set, and if\n", + " ``False``, the validation set.\n", + "\n", + " Notes\n", + " -----\n", + " The validation split contains 10 male 
and 10 female penguins of each\n", + " species.\n", + "\n", + " \"\"\"\n", + "\n", " def __init__(\n", " self,\n", " input_keys: List[str],\n", @@ -186,9 +204,32 @@ " ]\n", "\n", " def __len__(self) -> int:\n", + " \"\"\"Return the length of requested split.\n", + "\n", + " Returns\n", + " -------\n", + " int\n", + " The number of items in the dataset.\n", + "\n", + " \"\"\"\n", " return len(self.split)\n", "\n", " def __getitem__(self, idx: int) -> Tuple[Any, Any]:\n", + " \"\"\"Return an input-target pair.\n", + "\n", + " Parameters\n", + " ----------\n", + " idx : int\n", + " Index of the input-target pair to return.\n", + "\n", + " Returns\n", + " -------\n", + " in_feats : Any\n", + " Inputs.\n", + " target : Any\n", + " Targets.\n", + "\n", + " \"\"\"\n", " # get the row index (idx) from the dataframe and\n", " # select relevant column features (provided as input_keys)\n", " feats = tuple(self.split.iloc[idx][self.input_keys])\n", @@ -196,7 +237,7 @@ " # this gives a 'species' i.e. one of ('Gentoo',), ('Chinstrap',), or ('Adelie',)\n", " tgts = tuple(self.split.iloc[idx][self.target_keys])\n", "\n", - " # Task 4 - Exercise #1: convert the feats (Series) to PyTorch Tensors\n", + " # Task 4 - Exercise #1: convert the features to PyTorch Tensors\n", " feats = tensor(feats, dtype=float32)\n", "\n", " # Task 4 - Exercise #2: convert target to a 'one-hot' vector.\n", @@ -224,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -281,9 +322,9 @@ "source": [ "* Can we give these items to a neural network, or do they need to be transformed first?\n", " + Short answer: no, we can't just pass tuples of numbers or strings to a neural network.\n", - " - We must represent these data as ``torch.Tensor``s. This is the fundamental data abstraction used by PyTorch. 
See [pytorch tensors documentation](https://pytorch.org/tutorials/beginner/introyt/tensors_deeper_tutorial.html)\n", - " + The targets are tuples of strings i.e. ('Gentoo', )\n", - " - One idea is to represent as ordinal values i.e. [1] or [2] or [3]. But this implies that the class encoded by value 1 is closer to 2 than 1 is to 3. This is not desirable for categorical data. One-hot encoding avoids this by representing each species independently.\\\n", + " - We must represent these data as ``torch.Tensor``s. This is the fundamental data abstraction used by PyTorch. See [pytorch tensors documentation](https://pytorch.org/tutorials/beginner/introyt/tensors_deeper_tutorial.html) . \n", + " - The targets are tuples of strings i.e. ('Gentoo', )\n", + " - One idea is to represent as ordinal values i.e. [1] or [2] or [3]. But this implies that the class encoded by value 1 is closer to 2 than 1 is to 3. This is not desirable for categorical data. One-hot encoding avoids this by representing each species independently.\\\n", " \"A\" — [1, 0, 0]\\\n", " \"B\" — [0, 1, 0]\\\n", " \"C\" — [0, 0, 1]\n" @@ -295,20 +336,76 @@ "source": [ "### Task 4: Applying transforms to the data\n", "\n", - "- Here we create a training and validation set.\n", - " - We allow the model to learn directly from the training set—i.e. we fit the function to these data.\n", - " - During training, we monitor the model's performance on the validation set in order to check how it's doing on unseen data. Normally, people use the validation performance to determine when to stop the training process.\n", - "- For the validation set, we choose ten males and ten females of each species. This means the validation set is less likely to be biased by sex and species, and is potentially a more reliable measure of performance. 
You should always be _very_ careful when choosing metrics and splitting data.\n", + "Modify the `PenguinDataset` class above so that the tuples of numbers are converted to PyTorch `torch.Tensor` s and the string targets are converted to one-hot vectors.\n", + "\n", + "- Begin by importing relevant PyTorch functions.\n", + "- Apply transformations inside `__getitem__()` function above.\n", + "\n", + "Then create a training and validation set.\n", "\n", + " - We allow the model to learn directly from the training set—i.e. we fit the function to these data.\n", + " - During training, we monitor the model's performance on the validation set in order to check how it's doing on unseen data. Normally, people use the validation performance to determine when to stop the training process.\n", + " \n", + "For the validation set, we choose ten males and ten females of each species. This means the validation set is less likely to be biased by sex and species, and is potentially a more reliable measure of performance. You should always be _very_ careful when choosing metrics and splitting data.\n", "\n", - "Note: A common way of transforming inputs to neural networks is to apply a series of transforms using ``torchvision.transforms.Compose``. The [``Compose``](https://pytorch.org/vision/stable/generated/torchvision.transforms.Compose.html) object takes a list of callable objects and applies them to the incoming data. See how this is done more generally in the `src/ml_workshop/_penguins.py` file. \n", + "- Is this solution general?\n", + " - No. The transformations have been hardcoded. A more flexible way of transforming inputs to neural networks is to apply a series of transforms using `torchvision.transforms.Compose`. The [ `Compose` ](https://pytorch.org/vision/stable/generated/torchvision.transforms.Compose.html) object takes a list of callable objects and applies them to the incoming data. See how this is done more generally in the `src/ml_workshop/_penguins.py` file. 
\n", "\n", "These transforms can be very useful for mapping between file paths and tensors of images, etc." ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 42.9000, 13.1000, 5000.0000, 215.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 46.1000, 13.2000, 4500.0000, 211.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 44.9000, 13.3000, 5100.0000, 213.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 43.3000, 13.4000, 4400.0000, 209.0000, 0.0000]) tensor([0., 0., 1.])\n", + "tensor([ 42.0000, 13.5000, 4150.0000, 210.0000, 0.0000]) tensor([0., 0., 1.])\n" + ] + } + ], + "source": [ + "# Apply the transforms we need to the PenguinDataset to get out input\n", + "# targets as Tensors. See Task 4 exercise comments above.\n", + "\n", + "# Create train_set\n", + "train_set = PenguinDataset(\n", + " input_keys=features,\n", + " target_keys=[\"species\"],\n", + " train=True,\n", + ")\n", + "\n", + "# Create valid_set\n", + "valid_set = PenguinDataset(\n", + " input_keys=features,\n", + " target_keys=[\"species\"],\n", + " train=False,\n", + ")\n", + "\n", + "\n", + "for _, (input_feats, target) in zip(range(5), train_set):\n", + " print(input_feats, target)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### (Optional) Task 4b: \n", + "\n", + "Apply the `torchvision.transforms.Compose` transformations instead of hardcoding as above. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -326,15 +423,65 @@ "source": [ "from torchvision.transforms import Compose\n", "\n", + "from ml_workshop import PenguinDataset\n", + "\n", "# import some useful functions here, see https://pytorch.org/docs/stable/torch.html\n", "# where `tensor` and `eye` are used for constructing tensors,\n", "# and using a lower-precision float32 is advised for performance\n", "from torch import tensor, float32, eye\n", "\n", + "\n", + "# Apply the transforms we need to the PenguinDataset to get out input\n", + "# targets as Tensors.\n", + "\n", + "\n", + "def get_input_transforms() -> Compose:\n", + " \"\"\"Return transforms which map from raw inputs to tensors.\n", + "\n", + " Returns\n", + " -------\n", + " Compose\n", + " A composition of transforms (callable functions) to map the tuple\n", + " of input features (``Tuple[float, ...]``) to a ``torch.Tensor``.\n", + "\n", + " Notes\n", + " -----\n", + " To create a ``torch.Tensor`` we can use ``torch.tensor([1.0, 2.0, ...])``\n", + "\n", + " \"\"\"\n", + " return Compose([lambda x: tensor(x, dtype=float32)])\n", + "\n", + "\n", + "def get_target_tfms() -> Compose:\n", + " \"\"\"Return transforms which map from the raw target strings to tensor.\n", + " Returns\n", + " -------\n", + " Compose\n", + " A composition of transforms (callable functions) to map the tuple\n", + " of input features (``Tuple[str]``) to a ``torch.Tensor``.\n", + "\n", + " Notes\n", + " -----\n", + " Suppose we have three labels, \"A\", \"B\" and \"C\". We want to encoder each\n", + " distinct label as a one-hot-encoded vector. 
A natural way to do this is:\n", + " - \"A\" — [1, 0, 0]\n", + " - \"B\" — [0, 1, 0]\n", + " - \"C\" — [0, 0, 1]\n", + "\n", + " The transforms this function produces will return these vectors as tensors.\n", + " Note also, in the example we have just given, A's vector was the first row\n", + " in the identity matrix, B's the second, etc.\n", + "\n", + " \"\"\"\n", + " return Compose([lambda x: eye(len(target_names))[target_names.index(x[0])]])\n", + "\n", + "\n", "train_set = PenguinDataset(\n", " input_keys=features,\n", " target_keys=[\"species\"],\n", " train=True,\n", + " x_tfms=get_input_transforms(),\n", + " y_tfms=get_target_tfms(),\n", ")\n", "\n", "\n", @@ -342,6 +489,8 @@ " input_keys=features,\n", " target_keys=[\"species\"],\n", " train=False,\n", + " x_tfms=get_input_transforms(),\n", + " y_tfms=get_target_tfms(),\n", ")\n", "\n", "\n", @@ -368,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -419,7 +568,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -517,7 +666,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -539,7 +688,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -567,7 +716,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -724,34 +873,34 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Epoch 0-25 time: 11.470797 seconds\n", - "Epoch 25-50 time: 10.322494 seconds\n", - "Epoch 50-75 time: 8.206659 seconds\n", - "Epoch 75-100 time: 9.680337 seconds\n", - "Epoch 100-125 time: 9.795119 seconds\n", - "Epoch 125-150 time: 10.032203 seconds\n", - "Epoch 150-175 time: 
9.441857 seconds\n", + "Epoch 0-25 time: 8.108920 seconds\n", + "Epoch 25-50 time: 8.245825 seconds\n", + "Epoch 50-75 time: 7.894095 seconds\n", + "Epoch 75-100 time: 8.292500 seconds\n", + "Epoch 100-125 time: 7.116918 seconds\n", + "Epoch 125-150 time: 6.541059 seconds\n", + "Epoch 150-175 time: 7.708282 seconds\n", "\n", "\n", " loss_train accuracy_train loss_valid accuracy_valid\n", - "0 0.630636 0.507353 0.607762 0.500000\n", - "1 0.535187 0.742647 0.533367 0.640625\n", - "2 0.464604 0.790441 0.462660 0.671875\n", - "3 0.409597 0.797794 0.404585 0.687500\n", - "4 0.352028 0.808824 0.362779 0.687500\n", + "0 0.614220 0.452206 0.668509 0.375000\n", + "1 0.524949 0.698529 0.527703 0.703125\n", + "2 0.460917 0.786765 0.463121 0.781250\n", + "3 0.380868 0.886029 0.396204 0.828125\n", + "4 0.347903 0.878676 0.337664 0.859375\n", ".. ... ... ... ...\n", - "195 0.017349 0.992647 0.023540 0.984375\n", - "196 0.018951 0.988971 0.020961 0.984375\n", - "197 0.037160 0.985294 0.022133 0.984375\n", - "198 0.032350 0.988971 0.027387 0.984375\n", - "199 0.016947 0.992647 0.025869 0.984375\n", + "195 0.050222 0.966912 0.013005 0.984375\n", + "196 0.036788 0.985294 0.012601 1.000000\n", + "197 0.033748 0.970588 0.011316 1.000000\n", + "198 0.038716 0.988971 0.020271 0.984375\n", + "199 0.015950 0.988971 0.019603 0.984375\n", "\n", "[200 rows x 4 columns]\n" ] @@ -806,12 +955,12 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -864,7 +1013,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -876,8 +1025,8 @@ " [3.3600e+01, 1.1300e+01, 2.0000e+03, 2.1100e+02, 1.0000e+00]])\n", "\n", "Raw output:\n", - "tensor([[7.7734e-06, 3.7739e-08, 9.9999e-01],\n", - " [9.9173e-01, 2.2448e-06, 8.2675e-03]])\n", + "tensor([[8.2419e-07, 8.8322e-09, 1.0000e+00],\n", + " [6.8586e-01, 4.3171e-06, 3.1413e-01]])\n", "\n", "Predicted species:\n", "['Gentoo', 'Adelie']\n", From c50409cbbab3165f9104d7570ea16a10f53346a0 Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Thu, 4 Jul 2024 18:42:12 +0100 Subject: [PATCH 5/9] Add e --- exercises/01_penguin_classification.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/exercises/01_penguin_classification.ipynb b/exercises/01_penguin_classification.ipynb index e219ac6..5adb10c 100644 --- a/exercises/01_penguin_classification.ipynb +++ b/exercises/01_penguin_classification.ipynb @@ -220,7 +220,7 @@ "\n", "\n", "for features, target in data_set:\n", - " # print the features and targets her\n", + " # print the features and targets here\n", " pass" ] }, @@ -265,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -287,7 +287,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ From da3e1aebbed772f8a512295b5046da288e36c8d9 Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Fri, 5 Jul 2024 09:38:23 +0100 Subject: [PATCH 6/9] Improve task 2 text --- exercises/01_penguin_classification.ipynb | 2 +- worked-solutions/01_penguin_classification_solutions.ipynb | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/exercises/01_penguin_classification.ipynb b/exercises/01_penguin_classification.ipynb index 5adb10c..ea9f855 100644 --- a/exercises/01_penguin_classification.ipynb +++ 
b/exercises/01_penguin_classification.ipynb @@ -57,7 +57,7 @@ "\n", "The penguin data reading and processing can be encapsulated in a PyTorch dataset class.\n", "\n", - "- Why is this helpful?\n", + "- Why is a class representation helpful?\n", "\n", "All PyTorch dataset objects are subclasses of the ``torch.utils.data.Dataset`` class. To make a custom dataset, create a class which inherits from the ``Dataset`` class, implement some methods (the Python magic (or dunder) methods ``__len__`` and ``__getitem__``) and supply some data.\n", "\n", diff --git a/worked-solutions/01_penguin_classification_solutions.ipynb b/worked-solutions/01_penguin_classification_solutions.ipynb index 50fc762..17735f2 100644 --- a/worked-solutions/01_penguin_classification_solutions.ipynb +++ b/worked-solutions/01_penguin_classification_solutions.ipynb @@ -110,15 +110,14 @@ "\n", "The penguin data reading and processing can be encapsulated in a PyTorch dataset class.\n", "\n", - "- Why is this class representation helpful?\n", + "- Why is a class representation helpful?\n", " - Modularity - Separation of concerns makes the cde easier to understand, maintain and test.\n", " - Maintainability - Changes are localised, therefore we only need to change a single file to update. \n", " - Abstraction - Users do not need to know how the data is read or processed, they only need to know how to interact with the class. \n", "\n", - "\n", "All PyTorch dataset objects are subclasses of the ``torch.utils.data.Dataset`` class. 
To make a custom dataset, create a class which inherits from the ``Dataset`` class, implement some methods (the Python magic (or dunder) methods ``__len__`` and ``__getitem__``) and supply some data.\n", "\n", - "Spoiler alert: we've done this for you already in ``src/ml_workshop/_penguins.py``.\n", + "Spoiler alert: we've done this for you already below (see ``src/ml_workshop/_penguins.py`` for a more sophisticated implementation)\n", "\n", "- Open the file ``src/ml_workshop/_penguins.py``.\n", "- Let's examine, and discuss, each of the methods together.\n", From 51d17eb91e0fb04550bf52fed16d46b5f7591d48 Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Fri, 5 Jul 2024 15:26:35 +0100 Subject: [PATCH 7/9] Improve task 3 text and remove x_tfms and y_tfms comments --- exercises/01_penguin_classification.ipynb | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/exercises/01_penguin_classification.ipynb b/exercises/01_penguin_classification.ipynb index ea9f855..2e83dc8 100644 --- a/exercises/01_penguin_classification.ipynb +++ b/exercises/01_penguin_classification.ipynb @@ -74,9 +74,7 @@ "- Review and discuss the class arguments.\n", " - ``input_keys``— ...\n", " - ``target_keys``— ...\n", - " - ``train``— ...\n", - " - ``x_tfms``— ...\n", - " - ``y_tfms``— ..." + " - ``train``— ..." ] }, { @@ -230,7 +228,7 @@ "source": [ "- Can we give these items to a neural network, or do they need to be transformed first?\n", " - Short answer: no, we can't just pass tuples of numbers or strings to a neural network.\n", - " - We must represent these data as ``torch.Tensor``s. This is the fundamental data abstraction used by PyTorch. See [pytorch tensors documentation](https://pytorch.org/tutorials/beginner/introyt/tensors_deeper_tutorial.html).\n", + " - We must represent these data as ``torch.Tensor``s. This is the fundamental data abstraction used by PyTorch; they are the PyTorch equivalent to Numpy arrays, while also providing support for GPU acceleration. 
See [pytorch tensors documentation](https://pytorch.org/tutorials/beginner/introyt/tensors_deeper_tutorial.html).\n", " - The targets are tuples of strings i.e. ('Gentoo', )\n", " - One idea is to represent as ordinal values i.e. [1] or [2] or [3]. But this implies that the class encoded by value 1 is closer to 2 than 1 is to 3. This is not desirable for categorical data. One-hot encoding avoids this by representing each species independently.\\\n", " \"A\" — [1, 0, 0]\\\n", From 7c96347cce5a54b0ef84e2f710c5be903d4d4eda Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Sat, 6 Jul 2024 16:24:54 +0100 Subject: [PATCH 8/9] Fix Task 4 comment --- exercises/01_penguin_classification.ipynb | 3 ++- worked-solutions/01_penguin_classification_solutions.ipynb | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/exercises/01_penguin_classification.ipynb b/exercises/01_penguin_classification.ipynb index 2e83dc8..cf532cb 100644 --- a/exercises/01_penguin_classification.ipynb +++ b/exercises/01_penguin_classification.ipynb @@ -267,7 +267,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Apply transforms to the data. See Task 4 exercise comments above.\n", + "# Apply transforms we need to PenguinDataset to convert input data and target class to tensors. \n", + "# See Task 4 exercise comments above.\n", "\n", "# Create train_set\n", "\n", diff --git a/worked-solutions/01_penguin_classification_solutions.ipynb b/worked-solutions/01_penguin_classification_solutions.ipynb index 17735f2..489bffc 100644 --- a/worked-solutions/01_penguin_classification_solutions.ipynb +++ b/worked-solutions/01_penguin_classification_solutions.ipynb @@ -371,8 +371,9 @@ } ], "source": [ - "# Apply the transforms we need to the PenguinDataset to get out input\n", - "# targets as Tensors. See Task 4 exercise comments above.\n", + "# Apply transforms we need to PenguinDataset to convert input data and target class to tensors. 
\n", + "# See Task 4 exercise comments above.\n", + "\n", "\n", "# Create train_set\n", "train_set = PenguinDataset(\n", From db26564ca5738786ddc3f4df17d91ce6b902bedd Mon Sep 17 00:00:00 2001 From: Matt Archer Date: Sat, 6 Jul 2024 17:42:51 +0100 Subject: [PATCH 9/9] Fix Task 3 comment with more on PyTorch Tensors --- worked-solutions/01_penguin_classification_solutions.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worked-solutions/01_penguin_classification_solutions.ipynb b/worked-solutions/01_penguin_classification_solutions.ipynb index 489bffc..25b6f49 100644 --- a/worked-solutions/01_penguin_classification_solutions.ipynb +++ b/worked-solutions/01_penguin_classification_solutions.ipynb @@ -321,7 +321,7 @@ "source": [ "* Can we give these items to a neural network, or do they need to be transformed first?\n", " + Short answer: no, we can't just pass tuples of numbers or strings to a neural network.\n", - " - We must represent these data as ``torch.Tensor``s. This is the fundamental data abstraction used by PyTorch. See [pytorch tensors documentation](https://pytorch.org/tutorials/beginner/introyt/tensors_deeper_tutorial.html) . \n", + " - We must represent these data as ``torch.Tensor``s. This is the fundamental data abstraction used by PyTorch; they are the PyTorch equivalent to Numpy arrays. See [pytorch tensors documentation](https://pytorch.org/tutorials/beginner/introyt/tensors_deeper_tutorial.html) . \n", " - The targets are tuples of strings i.e. ('Gentoo', )\n", " - One idea is to represent as ordinal values i.e. [1] or [2] or [3]. But this implies that the class encoded by value 1 is closer to 2 than 1 is to 3. This is not desirable for categorical data. One-hot encoding avoids this by representing each species independently.\\\n", " \"A\" — [1, 0, 0]\\\n",