
Commit 6a8b05c

Refactor LitModule and improve logging structure
1 parent 1bcf902 commit 6a8b05c

File tree

17 files changed: +120 additions, -146 deletions

README.md

Lines changed: 14 additions & 17 deletions
@@ -1,12 +1,12 @@
 # ***[TmPL](): [T]()e[mpl]()ate for [P]()ytorch [L]()ightning***
 
-![](https://img.shields.io/badge/Python-3.8%2B-blue)
-![](https://img.shields.io/badge/PyTorch-1.11%2B-red)
-![](https://img.shields.io/badge/Lightning-2.0-blue)
+![](https://img.shields.io/badge/Python-3.9%2B-blue)
+![](https://img.shields.io/badge/PyTorch-2.1%2B-red)
+![](https://img.shields.io/badge/Lightning-2.5-blue)
 ![](https://img.shields.io/badge/Hydra-1.3-lightgrey)
 
 [![](https://img.shields.io/github/license/npurson/tmpl)](LICENSE)
-![](https://img.shields.io/badge/version-v2.0-blue)
+![](https://img.shields.io/badge/version-v2.1-blue)
 
 [Lightning Docs](https://lightning.ai/docs/pytorch/stable/) 
 [Installation](#installation) 
@@ -15,44 +15,41 @@
 [Contributing](#contributing) 
 [License](#license)
 
-A template for rapid & flexible DL experimentation development, built upon [Lightning](https://lightning.ai/) & [Hydra](https://hydra.cc/) with best practice.
+A template for rapid & flexible DL experimentation development, powered by [Lightning](https://lightning.ai/) & [Hydra](https://hydra.cc/) following best practice.
 
-## What's New
-
-***v2.0*** was released on Sep 5 '23.
+<div align="center">
+  <img src="assets/meme.png" width="256" height="256">
+</div>
 
 ## Installation
 
 ```
 pip install -r requirements.txt
 ```
 
-It is recommended to manually install PyTorch and Torchvision before running the installation command, referring to the official PyTorch website for [instructions](https://pytorch.org/get-started/locally/).
-
 ## Usage
 
 0. **Setup**
 
-   ```shell
+   ```bash
    export PYTHONPATH=`pwd`:$PYTHONPATH
    ```
 
 1. **Training**
 
-   ```shell
+   ```bash
    python tools/train.py [--config-name config[.yaml]] [trainer.devices=4] [data.loader.batch_size=16]
    ```
 
-   * Override the default config file with `--config-name`.
-   * You can also override any value in the loaded config from the command line, refer to the following for more infomation.
-     * https://hydra.cc/docs/tutorials/basic/your_first_app/config_file/
+   * Specify the configuration file using `--config-name`.
+   * Refer to the following for detailed information on Hydra's override syntax.
     * https://hydra.cc/docs/advanced/hydra-command-line-flags/
     * https://hydra.cc/docs/advanced/override_grammar/basic/
 
 2. **Tips for Further Development**
 
-   The code is designed to be flexible and customizable to meet your specific needs. \
-   Useful comments can be found in the source code.
+   This framework is designed for flexibility and easy customization to meet users' specific needs.
+   Useful comments and details on extending the framework can be found within the source code files.
 
 ## Reference
 
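The training command in the README above leans on Hydra's dotted override grammar (e.g. `trainer.devices=4`). As a rough, stdlib-only sketch of the idea — `apply_override` is a hypothetical helper, not part of TmPL or Hydra — each dotted key walks into the nested config before assigning the value:

```python
# Sketch of how a dotted Hydra-style override maps onto a nested config dict.
# `apply_override` is a hypothetical helper, not TmPL's or Hydra's actual code.
def apply_override(cfg: dict, override: str) -> None:
    key, _, value = override.partition('=')
    *parents, leaf = key.split('.')
    node = cfg
    for p in parents:
        node = node.setdefault(p, {})
    # Hydra parses many value types; we only handle ints here for brevity.
    node[leaf] = int(value) if value.isdigit() else value

cfg = {'trainer': {'devices': 1}, 'data': {'loader': {'batch_size': 32}}}
for ov in ['trainer.devices=4', 'data.loader.batch_size=16']:
    apply_override(cfg, ov)
print(cfg['trainer']['devices'], cfg['data']['loader']['batch_size'])  # 4 16
```

Real Hydra additionally validates overrides against the composed config and supports appending (`+key=value`) and deleting (`~key`) entries.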

assets/meme.png

1.65 MB

configs/config.yaml

Lines changed: 6 additions & 19 deletions
@@ -1,32 +1,19 @@
 # Refer to Hydra documentation for more information about config group defaults.
 # - https://hydra.cc/docs/tutorials/basic/your_first_app/defaults/
 # - https://hydra.cc/docs/patterns/configuring_experiments/
-
 defaults:
   - datasets: cifar10
   - models: resnet50
   - schedules: 10e
   - _self_
 
 hydra:
-  mode: MULTIRUN # refer to https://github.com/Lightning-AI/lightning/pull/11617
-  sweep:
-    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M}
+  run:
+    dir: outputs/${now:%Y%m%d-%H%M%S}
 
-trainer:
-  # num_nodes: 1
+trainer: # refer to https://lightning.ai/docs/pytorch/stable/common/trainer.html
   devices: 4
-  accelerator: gpu
-  strategy: ddp
-  sync_batchnorm: True
   precision: 16-mixed
-
-# Refer to https://lightning.ai/docs/pytorch/latest/common/trainer.html for more infomation.
-# check_val_every_n_epoch: 1
-# log_every_n_steps: 50
-# enable_progress_bar: False
-# profiler: simple # profiling measures the time consuming of all components
-
-# TODO: Build callbacks before passed to trainer.__init__().
-# callbacks:
-#   GradientAccumulationScheduler(scheduling={4: 2})
+  sync_batchnorm: True
+  # enable_progress_bar: False # will log to console and save to file (customized in build_callbacks())
+  # profiler: simple
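The new `hydra.run.dir` uses Hydra's built-in `now` resolver, which formats the launch timestamp with strftime codes. A stdlib-only sketch of the directory name it produces (assuming the resolver behaves like `datetime.strftime`, with a fixed example timestamp):

```python
from datetime import datetime

# Sketch: ${now:%Y%m%d-%H%M%S} formats the launch time with strftime codes,
# yielding a run directory like outputs/20240905-141530.
launch_time = datetime(2024, 9, 5, 14, 15, 30)  # example timestamp
run_dir = f"outputs/{launch_time.strftime('%Y%m%d-%H%M%S')}"
print(run_dir)  # outputs/20240905-141530
```

Switching from `sweep.dir` under `mode: MULTIRUN` to a plain `run.dir` means each single-run launch writes to its own timestamped directory.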

configs/datasets/cifar10.yaml

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ data:
     root: data/cifar-10
     download: True
   loader:
-    batch_size: 32 # batch size per GPU
+    batch_size: 32
     num_workers: 4
 
 model:

configs/datasets/cifar100.yaml

Lines changed: 0 additions & 23 deletions
This file was deleted.

configs/datasets/mnist.yaml

Lines changed: 2 additions & 2 deletions
@@ -5,10 +5,10 @@ data:
   type: MNIST
   splits:
     train:
-      download: True
+      train: True
     test:
       train: False
-  root: datasets/mnist
+  root: data/mnist
   download: True
   loader:
     batch_size: 32

configs/schedules/10e.yaml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 optimizer:
   type: AdamW
   lr: 1.0e-3
-  weight_decay: 1.0e-5
+  weight_decay: 1.0e-4
 
 scheduler:
   type: CosineAnnealingLR
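A `type`-keyed block like the optimizer config above is typically instantiated by looking the name up and passing the remaining keys as keyword arguments. A plain-dict sketch of that pattern, using a toy class registry instead of `torch.optim` (TmPL's actual `build_from_configs` may differ):

```python
# Sketch of instantiating a `type`-keyed config block. The toy AdamW class and
# REGISTRY stand in for torch.optim; TmPL's real builder may differ.
class AdamW:
    def __init__(self, lr, weight_decay):
        self.lr, self.weight_decay = lr, weight_decay

REGISTRY = {'AdamW': AdamW}

cfg = {'type': 'AdamW', 'lr': 1.0e-3, 'weight_decay': 1.0e-4}
kwargs = {k: v for k, v in cfg.items() if k != 'type'}  # everything but `type`
opt = REGISTRY[cfg['type']](**kwargs)
print(type(opt).__name__, opt.weight_decay)  # AdamW 0.0001
```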

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -2,4 +2,5 @@ torch
 torchvision
 lightning
 hydra-core
+rich
 tensorboard

tmpl/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
-from .utils import build_from_configs, pre_build_callbacks
+from .utils import build_callbacks, build_from_configs  # It has to be imported first
 from .data import build_data_loaders
 from .engine import LitModule

tmpl/data/build.py

Lines changed: 11 additions & 9 deletions
@@ -1,4 +1,4 @@
-from omegaconf import DictConfig, OmegaConf
+from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict
 from torch.utils.data import DataLoader
 from torchvision import transforms as T
 
@@ -7,16 +7,18 @@
 
 
 def build_data_loaders(cfg: DictConfig):
-    cfg = cfg.copy()
     if isinstance(cfg, DictConfig):
-        OmegaConf.set_struct(cfg, False)
-        split_cfgs = cfg.datasets.pop('splits')
+        with open_dict(cfg):
+            split_cfgs = cfg.datasets.pop('splits')
+        if isinstance(split_cfgs, ListConfig):
+            split_cfgs = {split: {'split': split} for split in split_cfgs}
 
-    if cfg.datasets.type in ('CIFAR10', 'CIFAR100', 'MNIST'):
-        print(f'It seems you are using the {cfg.datasets.type} dataset from our demo config, '
-              'we automatically add ToTensor() to the pipeline only for demo. It is usually '
-              'unnecessary for your own dataset and you can modify this part as per your '
-              'requirements.')
+    if cfg.datasets.type in ('CIFAR10', 'MNIST'):
+        print(
+            f'NOTE: For demonstration using standard torchvision.datasets like {cfg.datasets.type}, '
+            'we are manually adding ToTensor() here to ensure the pipeline is runnable. '
+            'In a typical use case with your own dataset, you should generally handle it elsewhere.'
+        )
     split_cfgs = OmegaConf.to_container(split_cfgs)
     for s in split_cfgs:
        split_cfgs[s]['transform'] = T.ToTensor()
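The new `ListConfig` branch lets a dataset config declare its splits as a plain list (e.g. `splits: [train, val]`) instead of a per-split mapping, normalizing each entry into a `{'split': name}` kwargs dict. A stdlib sketch of that normalization, with plain Python containers standing in for OmegaConf's `ListConfig`/`DictConfig`:

```python
# Sketch of the splits normalization in build_data_loaders: a bare list of
# split names becomes a mapping of per-split kwargs. Plain list/dict stand in
# for OmegaConf containers here.
split_cfgs = ['train', 'val']  # list form of `splits` in the dataset config
if isinstance(split_cfgs, list):
    split_cfgs = {split: {'split': split} for split in split_cfgs}
print(split_cfgs)  # {'train': {'split': 'train'}, 'val': {'split': 'val'}}
```

The `with open_dict(cfg):` block is how OmegaConf temporarily lifts struct mode so that `pop('splits')` is allowed on an otherwise locked config.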
