
Commit 4a2eabf

Dynamically select which backend to train

1 parent 3f3e5a3 commit 4a2eabf

File tree

9 files changed: +401 −168 lines changed

minitorch/__init__.py

Lines changed: 6 additions & 4 deletions

@@ -10,13 +10,15 @@
 
 from .autodiff import *  # noqa: F401,F403
 from .backends.cuda_ops import *  # noqa: F401,F403
-from .datasets import dummy_datasets  # noqa: F401,F403
+from .datasets import dummy_datasets, mnist  # noqa: F401,F403
 from .backends.fast_conv import *  # noqa: F401,F403
-
 from .backends.fast_ops import *  # noqa: F401,F403
-from .nn.module import *  # noqa: F401,F403
 
+from .nn.module import *  # noqa: F401,F403
 from .nn.nn import *  # noqa: F401,F403
 from .nn.optim import *  # noqa: F401,F403
+from .nn.layers import *
+from .nn.loss import nll_loss, bce_loss, cross_entropy_loss, mse_loss
+from .dataloader import DataLoader
 
-version = "0.4"
+version = "1.0"
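
With these re-exports, the whole training surface is reachable from the top-level package. A minimal sketch of the expanded API, assuming TensorBackend and FastOps are among the names star-imported from the backend modules above:

    import minitorch

    BACKEND = minitorch.TensorBackend(minitorch.FastOps)  # assumed constructor name

    x = minitorch.tensor([[1.0, 2.0]], backend=BACKEND)
    layer = minitorch.Linear(2, 4, backend=BACKEND)
    print(minitorch.version)            # "1.0"
    print(layer.forward(x).shape)       # (1, 4)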

minitorch/dataloader.py

Lines changed: 29 additions & 0 deletions (new file)

import numpy as np
from .tensor.functions import tensor


class DataLoader:
    def __init__(self, dataset, backend, batch_size=1, shuffle=False):
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.backend = backend

    def __len__(self):
        return int(np.ceil(len(self.dataset) / self.batch_size))

    def __iter__(self):
        indices = np.arange(len(self.dataset))
        if self.shuffle:
            np.random.shuffle(indices)

        for i in range(0, len(indices), self.batch_size):
            batch_indices = indices[i:i + self.batch_size]
            batch_data = [self.dataset[j] for j in batch_indices]

            inputs, labels = zip(*batch_data)

            inputs_tensor = tensor(list(inputs), backend=self.backend)
            labels_tensor = tensor(list(labels), backend=self.backend)

            yield inputs_tensor, labels_tensor
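
A usage sketch with an invented toy dataset of (input, label) pairs. Note there is no drop_last option: when len(dataset) is not a multiple of batch_size, the final batch is simply shorter.

    import minitorch

    BACKEND = minitorch.TensorBackend(minitorch.FastOps)  # assumed constructor name

    # Toy dataset: any indexable sequence of (input, label) pairs works.
    data = [([0.0, 0.0], 0.0), ([0.0, 1.0], 1.0),
            ([1.0, 0.0], 1.0), ([1.0, 1.0], 0.0)]

    loader = minitorch.DataLoader(data, backend=BACKEND, batch_size=2, shuffle=True)
    print(len(loader))  # 2 batches
    for inputs, labels in loader:
        print(inputs.shape, labels.shape)  # (2, 2) and (2,)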

minitorch/datasets/mnist.py

Lines changed: 98 additions & 0 deletions (new file)

import os
import gzip
import shutil
import urllib.request
import numpy as np


class MNISTDataset:
    @staticmethod
    def load_mnist_img(path):
        try:
            with open(path, "rb") as fi:
                _ = int.from_bytes(fi.read(4), "big")  # magic number
                n_images = int.from_bytes(fi.read(4), "big")
                h = int.from_bytes(fi.read(4), "big")
                w = int.from_bytes(fi.read(4), "big")
                buffer = fi.read()
                images = np.frombuffer(buffer, dtype=np.uint8).reshape(n_images, h, w)
        except Exception as e:
            print(f"Could not read MNIST image file at {path}")
            print(e)
            exit(1)
        return images

    @staticmethod
    def load_mnist_lbl(path):
        try:
            with open(path, "rb") as fi:
                _ = int.from_bytes(fi.read(4), "big")
                n_labels = int.from_bytes(fi.read(4), "big")
                buffer = fi.read()
                labels = np.frombuffer(buffer, dtype=np.uint8)
        except Exception as e:
            print(f"Could not read MNIST label file at {path}")
            print(e)
            exit(1)
        return labels

    @staticmethod
    def _download_and_extract(root):
        """
        Downloads and extracts the MNIST dataset files if they don't exist.
        """
        mnist_path = os.path.join(root, "MNIST")
        os.makedirs(mnist_path, exist_ok=True)

        urls = [
            "https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz",
            "https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz",
            "https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz",
            "https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz",
        ]

        for url in urls:
            filename = url.split("/")[-1]
            gz_path = os.path.join(mnist_path, filename)
            uncompressed_path = os.path.join(mnist_path, filename[:-3])

            if not os.path.exists(uncompressed_path):
                print(f"Downloading {url}")
                urllib.request.urlretrieve(url, gz_path)

                print(f"Extracting {gz_path}")
                with gzip.open(gz_path, 'rb') as f_in:
                    with open(uncompressed_path, 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                os.remove(gz_path)

    '''
    dataset_dir
    ├── MNIST
        ├── train-images-idx3-ubyte (train images file)
        ├── train-labels-idx1-ubyte
        ├── t10k-images-idx3-ubyte (val images file)
        ├── t10k-labels-idx1-ubyte
    '''

    def __init__(self, root, download=True, train=True):
        if download and not os.path.exists(os.path.join(root, "MNIST")):
            self._download_and_extract(root)

        if train:
            img_dir = os.path.join(root, "MNIST", "train-images-idx3-ubyte")
            lbl_dir = os.path.join(root, "MNIST", "train-labels-idx1-ubyte")
        else:
            img_dir = os.path.join(root, "MNIST", "t10k-images-idx3-ubyte")
            lbl_dir = os.path.join(root, "MNIST", "t10k-labels-idx1-ubyte")

        images = self.load_mnist_img(img_dir)
        labels = self.load_mnist_lbl(lbl_dir)

        self.data = [(image, label) for image, label in zip(images, labels)]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]
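
Putting the dataset and the loader together. The raw images come back as uint8 numpy arrays in [0, 255] with labels 0-9; the minitorch tensor constructor is assumed to accept numpy rows, and the batch size here is just illustrative:

    import minitorch
    from minitorch.datasets.mnist import MNISTDataset

    BACKEND = minitorch.TensorBackend(minitorch.FastOps)  # assumed constructor name

    train_set = MNISTDataset("data", download=True, train=True)   # 60,000 pairs
    val_set = MNISTDataset("data", download=True, train=False)    # 10,000 pairs

    image, label = train_set[0]
    print(image.shape, int(label))  # (28, 28) and a digit in 0-9

    train_loader = minitorch.DataLoader(train_set, backend=BACKEND,
                                        batch_size=64, shuffle=True)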

minitorch/nn/init.py

Lines changed: 0 additions & 1 deletion

@@ -1,5 +1,4 @@
 import math
-from ..tensor import tensor
 
 
 def kaiming_uniform(tensor, fan_in, **kwargs):
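
The removed import was shadowed by the function's own tensor parameter, so dropping it is a no-op. The body of kaiming_uniform is not shown in this diff; for orientation only, the usual Kaiming-uniform bound looks like this (an assumption about the scheme, not this repo's code):

    import math

    def kaiming_uniform_bound(fan_in, gain=math.sqrt(2.0)):
        # Weights are drawn from U(-bound, bound) with
        # bound = gain * sqrt(3 / fan_in); gain = sqrt(2) is the
        # usual ReLU choice, giving bound = sqrt(6 / fan_in).
        return gain * math.sqrt(3.0 / fan_in)

    print(round(kaiming_uniform_bound(784), 4))  # ~0.0875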

minitorch/nn/layers.py

Lines changed: 9 additions & 9 deletions

@@ -5,11 +5,11 @@
 
 
 class Linear(Module):
-    def __init__(self, in_size, out_size, initializer=init.kaiming_uniform):
+    def __init__(self, in_size, out_size, backend, initializer=init.kaiming_uniform):
         super().__init__()
-        self.weights = Parameter(rand((in_size, out_size)))
+        self.weights = Parameter(rand((in_size, out_size), backend=backend))
         initializer(self.weights.value, in_size)
-        self.bias = Parameter(zeros((out_size,)))
+        self.bias = Parameter(zeros((out_size,), backend=backend))
         self.out_size = out_size
 
     def forward(self, x):
@@ -20,25 +20,25 @@ def forward(self, x):
 
 
 class Conv1d(Module):
-    def __init__(self, in_channels, out_channels, kernel_width, initializer=init.kaiming_uniform):
+    def __init__(self, in_channels, out_channels, kernel_width, backend, initializer=init.kaiming_uniform):
         super().__init__()
-        self.weights = Parameter(rand((out_channels, in_channels, kernel_width)))
+        self.weights = Parameter(rand((out_channels, in_channels, kernel_width), backend=backend))
         fan_in = in_channels * kernel_width
         initializer(self.weights.value, fan_in)
-        self.bias = Parameter(zeros((1, out_channels, 1)))
+        self.bias = Parameter(zeros((1, out_channels, 1), backend=backend))
 
     def forward(self, input):
         out = fast_conv.conv1d(input, self.weights.value) + self.bias.value
         return out
 
 
 class Conv2d(Module):
-    def __init__(self, in_channels, out_channels, kh, kw, initializer=init.kaiming_uniform):
+    def __init__(self, in_channels, out_channels, kh, kw, backend, initializer=init.kaiming_uniform):
         super().__init__()
-        self.weights = Parameter(rand((out_channels, in_channels, kh, kw)))
+        self.weights = Parameter(rand((out_channels, in_channels, kh, kw), backend=backend))
         fan_in = in_channels * kh * kw
         initializer(self.weights.value, fan_in)
-        self.bias = Parameter(zeros((out_channels, 1, 1)))
+        self.bias = Parameter(zeros((out_channels, 1, 1), backend=backend))
 
     def forward(self, input):
         out = fast_conv.conv2d(input, self.weights.value) + self.bias.value
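
This is the heart of the commit: layers no longer assume a global default backend, the caller threads one through, so the same model code can run on either ops implementation. A selection sketch (the TensorBackend/FastOps/CudaOps names are assumptions based on the backend modules re-exported in __init__.py):

    import minitorch

    def make_backend(use_gpu):
        # Assumed constructors; pick the ops implementation at run time.
        ops = minitorch.CudaOps if use_gpu else minitorch.FastOps
        return minitorch.TensorBackend(ops)

    BACKEND = make_backend(use_gpu=False)

    fc = minitorch.Linear(784, 10, backend=BACKEND)
    conv = minitorch.Conv2d(1, 8, 3, 3, backend=BACKEND)  # in_ch, out_ch, kh, kw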

minitorch/nn/loss.py

Lines changed: 71 additions & 0 deletions (new file)

from ..tensor import tensor
from .nn import logsoftmax


def mse_loss(y_pred, y_true):
    """
    Mean Squared Error Loss.

    Args:
        y_pred (Tensor): Predicted values, shape (batch_size, 1).
        y_true (Tensor): True values, shape (batch_size, 1).

    Returns:
        Tensor: The mean squared error loss.
    """
    diff = y_pred - y_true
    return (diff * diff).sum() / y_pred.shape[0]


def nll_loss(y_pred_log_probs, y_true):
    """
    Negative Log-Likelihood Loss.

    Args:
        y_pred_log_probs (Tensor): Log-probabilities of predictions, shape (batch_size, num_classes).
        y_true (Tensor): True class indices, shape (batch_size,).

    Returns:
        Tensor: The negative log-likelihood loss.
    """
    batch_size, num_classes = y_pred_log_probs.shape

    # Create one-hot encoded tensor for y_true
    y_one_hot = y_pred_log_probs.zeros(y_pred_log_probs.shape)
    y_one_hot.requires_grad_(False)

    for i in range(batch_size):
        y_one_hot[i, int(y_true[i].item())] = 1

    loss = -(y_pred_log_probs * y_one_hot).sum()
    return loss / batch_size


def cross_entropy_loss(y_pred_logits, y_true):
    """
    Cross-Entropy Loss.

    Args:
        y_pred_logits (Tensor): Raw logits from the model, shape (batch_size, num_classes).
        y_true (Tensor): True class indices, shape (batch_size,).

    Returns:
        Tensor: The cross-entropy loss.
    """
    log_probs = logsoftmax(y_pred_logits, dim=1)
    return nll_loss(log_probs, y_true)


def bce_loss(y_pred, y_true):
    """
    Binary Cross-Entropy Loss.

    Args:
        y_pred (Tensor): Predicted probabilities, shape (batch_size, 1).
        y_true (Tensor): True labels (0 or 1), shape (batch_size, 1).

    Returns:
        Tensor: The binary cross-entropy loss.
    """
    loss = -(y_true * y_pred.log() + (1 - y_true) * (1 - y_pred).log())
    return loss.sum() / y_pred.shape[0]
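
cross_entropy_loss is just logsoftmax followed by nll_loss, so a quick numeric check is easy. A sketch (backend constructor assumed as above); note also that nll_loss builds its one-hot mask with a per-row Python loop, which is fine for small batches but not vectorized:

    import minitorch

    BACKEND = minitorch.TensorBackend(minitorch.FastOps)  # assumed constructor name

    logits = minitorch.tensor([[2.0, 0.5, 0.1]], backend=BACKEND)
    target = minitorch.tensor([0.0], backend=BACKEND)

    loss = minitorch.cross_entropy_loss(logits, target)
    # logsumexp([2.0, 0.5, 0.1]) ~ 2.317, so logsoftmax at class 0
    # is ~ -0.317 and the batch loss is ~ 0.317.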

minitorch/nn/module.py

Lines changed: 10 additions & 0 deletions

@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 from typing import Any, Dict, Optional, Sequence, Tuple
+import minitorch
+import numpy as np
 
 
 class Module:
@@ -121,6 +123,14 @@ def _addindent(s_: str, numSpaces: int) -> str:
         main_str += ")"
         return main_str
 
+    def save_weights(self, path: str) -> None:
+        weights = {name: p.value.to_numpy() for name, p in self.named_parameters()}
+        np.savez(path, **weights)
+
+    def load_weights(self, path: str) -> None:
+        weights = np.load(path)
+        for name, p in self.named_parameters():
+            p.update(minitorch.tensor(weights[name].tolist()))
 
 class Parameter:
     """

minitorch/nn/optim.py

Lines changed: 71 additions & 7 deletions

@@ -1,4 +1,5 @@
 from typing import Sequence
+import math
 
 from .module import Parameter
 from ..scalar.scalar import Scalar
@@ -10,9 +11,11 @@ def __init__(self, parameters: Sequence[Parameter]):
 
 
 class SGD(Optimizer):
-    def __init__(self, parameters: Sequence[Parameter], lr: float = 1.0):
+    def __init__(self, parameters: Sequence[Parameter], lr: float = 1.0, momentum: float = 0.0):
         super().__init__(parameters)
         self.lr = lr
+        self.momentum = momentum
+        self.velocities = {}
 
     def zero_grad(self) -> None:
         for p in self.parameters:
@@ -29,9 +32,70 @@ def step(self) -> None:
         for p in self.parameters:
             if p.value is None:
                 continue
-            if hasattr(p.value, "derivative"):
-                if p.value.derivative is not None:
-                    p.update(Scalar(p.value.data - self.lr * p.value.derivative))
-            elif hasattr(p.value, "grad"):
-                if p.value.grad is not None:
-                    p.update(p.value - self.lr * p.value.grad)
+
+            is_scalar = hasattr(p.value, "derivative")
+
+            grad = p.value.derivative if is_scalar and p.value.derivative is not None else p.value.grad
+
+            if grad is None:
+                continue
+
+            if self.momentum == 0.0:
+                # Standard SGD
+                update_val = self.lr * grad
+            else:
+                # SGD with momentum
+                if p not in self.velocities:
+                    self.velocities[p] = 0.0 if is_scalar else grad * 0.0
+
+                v = self.velocities[p]
+                v_new = self.momentum * v + grad
+                self.velocities[p] = v_new
+                update_val = self.lr * v_new
+
+            if is_scalar:
+                p.update(Scalar(p.value.data - update_val))
+            else:
+                p.update(p.value - update_val)
+
+
+class RMSProp(Optimizer):
+    def __init__(self, parameters: Sequence[Parameter], lr: float = 1e-2, decay_rate: float = 0.9, eps: float = 1e-8):
+        super().__init__(parameters)
+        self.lr = lr
+        self.decay_rate = decay_rate
+        self.eps = eps
+        self.s_vals = {}
+
+    def step(self) -> None:
+        for p in self.parameters:
+            if p.value is None:
+                continue
+
+            is_scalar = hasattr(p.value, "derivative")
+
+            grad = p.value.derivative if is_scalar and p.value.derivative is not None else p.value.grad
+
+            if grad is None:
+                continue
+
+            if p not in self.s_vals:
+                if is_scalar:
+                    self.s_vals[p] = 0.0
+                else:
+                    self.s_vals[p] = grad * 0.0
+
+            s = self.s_vals[p]
+
+            s_new = self.decay_rate * s + (1 - self.decay_rate) * (grad * grad)
+            self.s_vals[p] = s_new
+
+            if is_scalar:
+                update_val = self.lr * grad / (math.sqrt(s_new) + self.eps)
+            else:
+                update_val = self.lr * grad / (s_new.sqrt() + self.eps)
+
+            if is_scalar:
+                p.update(Scalar(p.value.data - update_val))
+            else:
+                p.update(p.value - update_val)
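
In update-rule form: SGD with momentum keeps v <- momentum * v + g and steps theta <- theta - lr * v, while RMSProp keeps a running second moment s <- decay_rate * s + (1 - decay_rate) * g^2 and steps theta <- theta - lr * g / (sqrt(s) + eps). A training-loop sketch tying the commit together (backend, loader, and normalization as assumed in the earlier sketches; the diff shows zero_grad only on SGD, so SGD is used here):

    model = minitorch.Linear(784, 10, backend=BACKEND)
    optim = minitorch.SGD(model.parameters(), lr=0.1, momentum=0.9)

    for epoch in range(5):
        for images, labels in train_loader:
            batch = images.view(images.shape[0], 784) / 255.0  # flatten and scale
            optim.zero_grad()
            out = model.forward(batch)
            loss = minitorch.cross_entropy_loss(out, labels)
            loss.backward()
            optim.step()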
