Commit 3f3e5a3: Decouple weight initialization

1 parent: ecda33c

6 files changed: 48 additions, 30 deletions


minitorch/datasets/mnist.py

Whitespace-only changes.

minitorch/nn/init.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+import math
+from ..tensor import tensor
+
+
+def kaiming_uniform(tensor, fan_in, **kwargs):
+    bound = math.sqrt(6 / fan_in)
+    tensor.uniform_(-bound, bound)
+
+
+def glorot_uniform(tensor, fan_in, fan_out):
+    bound = math.sqrt(6 / (fan_in + fan_out))
+    tensor.uniform_(-bound, bound)
+
+
+def lecun_uniform(tensor, fan_in, **kwargs):
+    bound = math.sqrt(3 / fan_in)
+    tensor.uniform_(-bound, bound)
+
+
+def zero(tensor, **kwargs):
+    tensor.fill_(0.0)
+
+
+def one(tensor, **kwargs):
+    tensor.fill_(1.0)
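For reference, these are the standard uniform variants: Kaiming uses a bound of sqrt(6 / fan_in) (gain sqrt(2) for ReLU times sqrt(3 / fan_in)), Glorot uses sqrt(6 / (fan_in + fan_out)), and LeCun uses sqrt(3 / fan_in). A minimal sketch of the same math in plain NumPy (function name and shapes here are illustrative, not part of the commit):

    import math
    import numpy as np

    def kaiming_uniform_np(shape, fan_in):
        # bound = sqrt(2) * sqrt(3 / fan_in) = sqrt(6 / fan_in)
        bound = math.sqrt(6 / fan_in)
        return np.random.uniform(-bound, bound, size=shape)

    w = kaiming_uniform_np((64, 32), fan_in=64)
    assert np.abs(w).max() <= math.sqrt(6 / 64)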

minitorch/nn/layers.py

Lines changed: 13 additions & 25 deletions
@@ -1,19 +1,15 @@
-from ..tensor.operators import TensorBackend
 from ..tensor.functions import rand, zeros
 from .module import Module, Parameter
 from ..backends import fast_conv, fast_ops
-
-BACKEND = TensorBackend(fast_ops.FastOps)
+from . import init


 class Linear(Module):
-    def __init__(self, in_size, out_size):
+    def __init__(self, in_size, out_size, initializer=init.kaiming_uniform):
         super().__init__()
-
-        # He initialization
-        scale = (2.0 / in_size) ** 0.5
-        self.weights = Parameter(scale * rand((in_size, out_size), backend=BACKEND))
-        self.bias = Parameter(zeros((out_size,), backend=BACKEND))
+        self.weights = Parameter(rand((in_size, out_size)))
+        initializer(self.weights.value, in_size)
+        self.bias = Parameter(zeros((out_size,)))
         self.out_size = out_size

     def forward(self, x):
@@ -24,33 +20,25 @@ def forward(self, x):


 class Conv1d(Module):
-    def __init__(self, in_channels, out_channels, kernel_width):
+    def __init__(self, in_channels, out_channels, kernel_width, initializer=init.kaiming_uniform):
         super().__init__()
-
-        # He initialization
+        self.weights = Parameter(rand((out_channels, in_channels, kernel_width)))
         fan_in = in_channels * kernel_width
-        scale = (2.0 / fan_in) ** 0.5
-        self.weights = Parameter(
-            scale * rand((out_channels, in_channels, kernel_width), backend=BACKEND)
-        )
-        self.bias = Parameter(zeros((1, out_channels, 1), backend=BACKEND))
+        initializer(self.weights.value, fan_in)
+        self.bias = Parameter(zeros((1, out_channels, 1)))

     def forward(self, input):
         out = fast_conv.conv1d(input, self.weights.value) + self.bias.value
         return out


 class Conv2d(Module):
-    def __init__(self, in_channels, out_channels, kh, kw):
+    def __init__(self, in_channels, out_channels, kh, kw, initializer=init.kaiming_uniform):
         super().__init__()
-
-        # He initialization
+        self.weights = Parameter(rand((out_channels, in_channels, kh, kw)))
         fan_in = in_channels * kh * kw
-        scale = (2.0 / fan_in) ** 0.5
-        self.weights = Parameter(
-            scale * rand((out_channels, in_channels, kh, kw), backend=BACKEND)
-        )
-        self.bias = Parameter(zeros((out_channels, 1, 1), backend=BACKEND))
+        initializer(self.weights.value, fan_in)
+        self.bias = Parameter(zeros((out_channels, 1, 1)))

     def forward(self, input):
         out = fast_conv.conv2d(input, self.weights.value) + self.bias.value
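With initialization decoupled, any callable taking (tensor, fan_in) can be passed per layer; each layer computes its own fan_in and applies the initializer to the freshly allocated weights. A hypothetical usage sketch (module paths inferred from the file layout, sizes illustrative):

    from minitorch.nn import init
    from minitorch.nn.layers import Linear, Conv2d

    fc = Linear(64, 32)                                         # default: init.kaiming_uniform
    conv = Conv2d(3, 16, 3, 3, initializer=init.lecun_uniform)  # swap the scheme per layer

Note that glorot_uniform also expects fan_out, which these layers do not supply, so it would need a small wrapper providing fan_out before it could be passed as the initializer argument.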

minitorch/nn/nn.py

Lines changed: 4 additions & 4 deletions
@@ -121,10 +121,10 @@ def logsoftmax(input: Tensor, dim: int) -> Tensor:
     Returns:
         log of softmax tensor
     """
-    exp_input = input.exp()
+    m = max(input, dim)
+    exp_input = (input - m).exp()
     sum_exp = exp_input.sum(dim)
-    log_sum_exp = sum_exp.log()
-    return input - m - sum_exp.log()
+    return input - m - sum_exp.log()


 def maxpool2d(input: Tensor, kernel: Tuple[int, int]) -> Tensor:
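This is the usual log-sum-exp stabilization: subtracting the per-dim maximum m leaves logsoftmax(x) = x - m - log(sum(exp(x - m))) mathematically unchanged, but keeps exp from overflowing on large inputs. A quick NumPy illustration of the identity (not part of the commit):

    import numpy as np

    x = np.array([1000.0, 1001.0, 1002.0])

    # naive form: exp(1000) overflows to inf, every entry collapses to -inf
    naive = x - np.log(np.sum(np.exp(x)))

    # stabilized form: subtract the max first
    m = x.max()
    stable = x - m - np.log(np.sum(np.exp(x - m)))

    print(naive)   # [-inf -inf -inf] (with overflow warnings)
    print(stable)  # [-2.4076... -1.4076... -0.4076...]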
@@ -161,5 +161,5 @@ def dropout(input: Tensor, rate: float, ignore: bool = False) -> Tensor:
     if rate == 1.0:
         return input * 0
     p_keep = 1.0 - rate
-    mask = rand(input.shape) > rate
+    mask = tensor([1.0]) - (rand(input.shape) < rate)
     return input * mask * (1.0 / p_keep)
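Both the old and the new expressions build a {0, 1} keep mask in which each unit survives with probability p_keep = 1 - rate; surviving activations are then scaled by 1 / p_keep so the expected value matches the no-dropout case (inverted dropout). A small NumPy sketch of the same idea (illustrative only):

    import numpy as np

    def dropout_np(x, rate):
        if rate == 1.0:
            return np.zeros_like(x)
        p_keep = 1.0 - rate
        # keep mask: 1 with probability p_keep, 0 with probability rate
        mask = (np.random.uniform(size=x.shape) >= rate).astype(x.dtype)
        return x * mask / p_keep

    x = np.ones((100000,))
    print(dropout_np(x, 0.3).mean())  # close to 1.0 in expectation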

minitorch/tensor/functions.py

Lines changed: 1 addition & 1 deletion
@@ -308,7 +308,7 @@ def rand(
     Returns:
         :class:`Tensor` : new tensor
     """
-    vals = [np.random.randn() for _ in range(int(common_operators.prod(shape)))]
+    vals = [np.random.uniform() for _ in range(int(common_operators.prod(shape)))]
    tensor = minitorch.Tensor.make(vals, shape, backend=backend)
     tensor.requires_grad_(requires_grad)
     return tensor
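With this change rand matches its name: it draws from U[0, 1) rather than a standard normal, which is what the dropout mask above (rand(input.shape) < rate) relies on for its probability to equal rate. A quick check (module path inferred from the file layout; the storage access mirrors how the commit's own uniform_ touches it):

    from minitorch.tensor.functions import rand

    t = rand((2, 3))
    assert all(0.0 <= v < 1.0 for v in t._tensor._storage)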

minitorch/tensor/tensor.py

Lines changed: 5 additions & 0 deletions
@@ -372,3 +372,8 @@ def zero_grad_(self) -> None:  # pragma: no cover
         Reset the derivative on this variable.
         """
         self.grad = None
+
+    def uniform_(self, low=0.0, high=1.0):
+        self._tensor._storage[:] = np.random.uniform(
+            low, high, size=len(self._tensor._storage)
+        )
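The new uniform_ overwrites the tensor's existing storage in place with U[low, high) samples; it is the hook the initializers in init.py call, alongside the fill_ used by init.zero and init.one. A short usage sketch (illustrative):

    from minitorch.tensor.functions import rand

    w = rand((4, 4))
    w.uniform_(-0.1, 0.1)  # refill storage in place; no autograd history is recorded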
