Commit 069c5e6

add fast_mpd
1 parent 67820b3 commit 069c5e6

5 files changed: +760, -2 lines changed

configs/nsf_hifigan_fast.yaml

Lines changed: 162 additions & 0 deletions
@@ -0,0 +1,162 @@
# preprocessing
base_config:
  - configs/base_hifi.yaml

data_input_path: []
data_out_path: []
val_num: 5

pe: 'parselmouth' # 'parselmouth' or 'harvest'
f0_min: 65
f0_max: 1100

aug_min: 0.9
aug_max: 1.4
aug_num: 1
key_aug: false
key_aug_prob: 0.5

pc_aug: false # pc-nsf training method
pc_aug_rate: 0.4
pc_aug_key: 12

use_stftloss: true
loss_fft_sizes: [2048, 2048, 4096, 1024, 512, 256, 128, 1024, 2048, 512]
loss_hop_sizes: [512, 240, 480, 100, 50, 25, 12, 120, 240, 50]
loss_win_lengths: [2048, 1200, 2400, 480, 240, 120, 60, 600, 1200, 240]
lab_aux_melloss: 45
lab_aux_stftloss: 2.5
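
With use_stftloss enabled, the three lists above define ten analysis resolutions (one FFT size, hop size, and window length per entry), and lab_aux_melloss / lab_aux_stftloss weight the mel and STFT terms of the auxiliary loss. The loss module the training task calls is not part of this commit; the snippet below is only a rough sketch, assuming the usual spectral-convergence plus log-magnitude formulation of a multi-resolution STFT loss.

# Hedged sketch of a multi-resolution STFT loss driven by the lists above.
# The repo's actual implementation is not shown in this commit.
import torch
import torch.nn.functional as F

def multi_resolution_stft_loss(pred, target, fft_sizes, hop_sizes, win_lengths):
    # pred, target: (batch, samples) waveforms
    total = 0.0
    for n_fft, hop, win in zip(fft_sizes, hop_sizes, win_lengths):
        window = torch.hann_window(win, device=pred.device)
        p = torch.stft(pred, n_fft, hop, win, window=window, return_complex=True).abs()
        t = torch.stft(target, n_fft, hop, win, window=window, return_complex=True).abs()
        sc = torch.norm(t - p, p="fro") / torch.norm(t, p="fro").clamp(min=1e-8)     # spectral convergence
        mag = F.l1_loss(torch.log(p.clamp(min=1e-7)), torch.log(t.clamp(min=1e-7)))  # log-magnitude L1
        total = total + sc + mag
    return total / len(fft_sizes)
    # e.g. multi_resolution_stft_loss(fake_wav, real_wav, loss_fft_sizes, loss_hop_sizes, loss_win_lengths)

With the values above this would run ten STFT analyses per step, scaled by lab_aux_stftloss relative to the mel term weighted by lab_aux_melloss.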

raw_data_dir: []
binary_data_dir: null
binarization_args:
  num_workers: 8
  shuffle: true

DataIndexPath: data
valid_set_name: valid
train_set_name: train


volume_aug: true
volume_aug_prob: 0.5


mel_vmin: -6. #-6.
mel_vmax: 1.5


audio_sample_rate: 44100
audio_num_mel_bins: 128
hop_size: 512 # Hop size.
fft_size: 2048 # FFT size.
win_size: 2048 # Window size.
fmin: 40
fmax: 16000
fmax_for_loss: null
crop_mel_frames: 32



# global constants


# neural networks


#model_cls: training.nsf_HiFigan_task.nsf_HiFigan
model_args:
  mini_nsf: true
  noise_sigma: 0.0
  upsample_rates: [ 8, 8, 2, 2, 2 ]
  upsample_kernel_sizes: [ 16, 16, 4, 4, 4 ]
  upsample_initial_channel: 512
  resblock_kernel_sizes: [ 3, 7, 11 ]
  resblock_dilation_sizes: [ [ 1, 3, 5 ], [ 1, 3, 5 ], [ 1, 3, 5 ] ]
  discriminator_periods: [ 2, 3, 5, 7, 11 ]
  fast_mpd_strides: [4, 4, 4]
  fast_mpd_kernel_size: 11
  resblock: "1"
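
discriminator_periods, fast_mpd_strides and fast_mpd_kernel_size correspond to the constructor arguments of the new FastMPD discriminator added in modules/fast_D/discriminator.py further down this commit. A minimal sketch of that mapping (how training.nsf_HiFigan_fast_task actually reads model_args is not shown here):

# Hedged sketch: building the new discriminator from the model_args above.
from modules.fast_D.discriminator import FastMPD

mpd = FastMPD(
    periods=[2, 3, 5, 7, 11],   # discriminator_periods
    strides=[4, 4, 4],          # fast_mpd_strides
    kernel_size=11,             # fast_mpd_kernel_size
)                               # init_channel keeps its default of 8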

# training

task_cls: training.nsf_HiFigan_fast_task.nsf_HiFigan


#sort_by_len: true
#optimizer_args:
#  optimizer_cls: torch.optim.AdamW
#  lr: 0.0001
#  beta1: 0.9
#  beta2: 0.98
#  weight_decay: 0
#lab_aux_loss: 0.5
discriminate_optimizer_args:
  optimizer_cls: modules.optimizer.muon.Muon_AdamW
  lr: 0.0002
  muon_args:
    weight_decay: 0.03
  adamw_args:
    weight_decay: 0.0
  verbose: false

generater_optimizer_args:
  optimizer_cls: modules.optimizer.muon.Muon_AdamW
  lr: 0.0002
  muon_args:
    weight_decay: 0.03
  adamw_args:
    weight_decay: 0.0
  verbose: false

lr_scheduler_args:
  scheduler_cls: lr_scheduler.scheduler.WarmupLR
  warmup_steps: 5000
  min_lr: 0.00001
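
lr_scheduler.scheduler.WarmupLR itself is not included in this commit, so its exact schedule cannot be read off here; a plausible reading of warmup_steps and min_lr is a linear warm-up to the base learning rate with min_lr acting as a floor, sketched below purely as an assumption.

# Assumed behaviour only -- the real WarmupLR class is not part of this commit.
def assumed_warmup_lr(step, base_lr=0.0002, warmup_steps=5000, min_lr=0.00001):
    if step < warmup_steps:
        lr = base_lr * (step + 1) / warmup_steps   # linear ramp up to base_lr
    else:
        lr = base_lr                               # hold after warm-up
    return max(lr, min_lr)                         # never drop below min_lr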

clip_grad_norm: 1
accumulate_grad_batches: 1
sampler_frame_count_grid: 6
ds_workers: 4
dataloader_prefetch_factor: 2

batch_size: 16



num_valid_plots: 100
log_interval: 100
num_sanity_val_steps: 2 # steps of validation at the beginning
val_check_interval: 2000
num_ckpt_keep: 5
max_updates: 1000000
permanent_ckpt_start: 200000
permanent_ckpt_interval: 40000

###########
# pytorch lightning
# Read https://lightning.ai/docs/pytorch/stable/common/trainer.html#trainer-class-api for possible values
###########
pl_trainer_accelerator: 'auto'
pl_trainer_devices: 'auto'
pl_trainer_precision: '32-true'
#pl_trainer_precision: 'bf16' # please do not use bf16
pl_trainer_num_nodes: 1
pl_trainer_strategy:
  name: auto
  process_group_backend: nccl
  find_unused_parameters: true
  nccl_p2p: true
seed: 114514

###########
# finetune
###########

finetune_enabled: false
finetune_ckpt_path: ''
finetune_ignored_params: []
finetune_strict_shapes: true

freezing_enabled: false
frozen_params: []

configs/nsf_hifigan_mrd.yaml

Lines changed: 8 additions & 2 deletions
@@ -92,13 +92,19 @@ task_cls: training.nsf_HiFigan_mrd_task.nsf_HiFigan
 discriminate_optimizer_args:
   optimizer_cls: modules.optimizer.muon.Muon_AdamW
   lr: 0.0002
-  weight_decay: 0
+  muon_args:
+    weight_decay: 0.03
+  adamw_args:
+    weight_decay: 0.0
   verbose: false

 generater_optimizer_args:
   optimizer_cls: modules.optimizer.muon.Muon_AdamW
   lr: 0.0002
-  weight_decay: 0
+  muon_args:
+    weight_decay: 0.03
+  adamw_args:
+    weight_decay: 0.0
   verbose: false

 lr_scheduler_args:
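
The same change is made in both configs: weight_decay moves from a single top-level value into separate muon_args and adamw_args blocks. Muon_AdamW itself is not part of this commit; Muon-style hybrid optimizers conventionally update 2-D weight matrices with Muon and everything else (biases, norm scales, embeddings) with AdamW, so a hedged sketch of how the two decay values might be routed is:

# Hedged sketch -- modules.optimizer.muon.Muon_AdamW's real constructor is not shown in this commit.
import torch.nn as nn

def assumed_param_groups(model: nn.Module, muon_weight_decay=0.03, adamw_weight_decay=0.0):
    muon_params = [p for p in model.parameters() if p.ndim >= 2]   # weight matrices -> Muon side
    adamw_params = [p for p in model.parameters() if p.ndim < 2]   # biases, norm scales -> AdamW side
    return (
        {"params": muon_params, "weight_decay": muon_weight_decay},
        {"params": adamw_params, "weight_decay": adamw_weight_decay},
    )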

modules/fast_D/__init__.py

Whitespace-only changes.

modules/fast_D/discriminator.py

Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,160 @@
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn


def combine_frames(x, n):
    B, L, C = x.shape
    num_groups = L // n
    if num_groups == 0:
        return torch.empty(B, 0, n * C, device=x.device, dtype=x.dtype)
    x = x[:, :num_groups * n, :].reshape(B, num_groups, n * C)
    return x
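
combine_frames folds every n consecutive frames into the channel axis, so sequence length shrinks by n while width grows by n; FastPD below builds all of its downsampling from this reshape. A quick shape check with made-up sizes:

# Illustrative shape check: 10 frames of 8 channels, grouped 4 at a time.
x = torch.randn(2, 10, 8)
y = combine_frames(x, 4)
print(y.shape)   # torch.Size([2, 2, 32]); the 2 leftover frames are dropped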


class Transpose(nn.Module):
    def __init__(self, dims):
        super().__init__()
        assert len(dims) == 2, 'dims must be a tuple of two dimensions'
        self.dims = dims

    def forward(self, x):
        return x.transpose(*self.dims)


class LeakyHardFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, min_val, max_val, leak_slope):
        if not (min_val < max_val):
            raise ValueError("min_val must be < max_val")
        if leak_slope < 0:
            raise ValueError("leak_slope must be >= 0")
        ctx.min_val = min_val
        ctx.max_val = max_val
        ctx.leak_slope = leak_slope
        below_mask = x < min_val
        any_below = torch.any(below_mask)
        if any_below:
            x[below_mask] = leak_slope * x[below_mask] + (1 - leak_slope) * min_val
        above_mask = x > max_val
        any_above = torch.any(above_mask)
        if any_above:
            x[above_mask] = leak_slope * x[above_mask] + (1 - leak_slope) * max_val
        if any_below or any_above:
            ctx.save_for_backward(below_mask | above_mask)
        return x

    @staticmethod
    def backward(ctx, grad_output):
        if len(ctx.saved_tensors) > 0:
            mask, = ctx.saved_tensors
            grad_output[mask] *= ctx.leak_slope
        return grad_output, None, None, None
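
LeakyHardFunction is a leaky hard clip: values inside [min_val, max_val] pass through, values outside are pulled toward the nearest bound with slope leak_slope, and the backward pass scales gradients at the clipped positions by that same slope. Note that forward modifies its input in place, so the forward-only check below (with made-up numbers) passes a clone:

# Forward-only check with illustrative values; clone() because forward mutates its input.
x = torch.tensor([-150.0, 0.0, 50.0, 150.0])
y = LeakyHardFunction.apply(x.clone(), -100, 100, 0.01)
print(y)   # tensor([-100.5000, 0.0000, 50.0000, 100.5000])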


class ATanGLU(nn.Module):
    # ArcTan-gated linear unit (GLU with an atan gate).
    def __init__(self, dim=-1, hard_limit=False):
        super().__init__()
        self.dim = dim
        self.hard_limit = hard_limit

    def forward(self, x):
        if self.hard_limit:
            x = LeakyHardFunction.apply(x, -100, 100, 0.01)
        # out, gate = x.chunk(2, dim=self.dim)
        # Using torch.split instead of chunk for ONNX export compatibility.
        out, gate = torch.split(x, x.size(self.dim) // 2, dim=self.dim)
        return out * torch.atan(gate)
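
ATanGLU halves the chosen dimension and gates one half with the arctangent of the other, a bounded alternative to sigmoid gating; with hard_limit=True the pre-activation is first clipped by the leaky hard clip above. A minimal shape example with made-up sizes:

# Illustrative sizes: the gated output has half the input channels.
glu = ATanGLU(dim=-1)
h = torch.randn(2, 10, 64)   # (batch, frames, channels)
print(glu(h).shape)          # torch.Size([2, 10, 32])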


class LYNXNet2Block(nn.Module):
    def __init__(self, dim, expansion_factor, kernel_size=31, dropout=0.):
        super().__init__()
        inner_dim = int(dim * expansion_factor)
        if float(dropout) > 0.:
            _dropout = nn.Dropout(dropout)
        else:
            _dropout = nn.Identity()
        self.net = nn.Sequential(
            Transpose((1, 2)),
            nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=kernel_size // 2, groups=dim),
            Transpose((1, 2)),
            nn.Linear(dim, inner_dim * 2),
            ATanGLU(),
            nn.Linear(inner_dim, inner_dim * 2),
            ATanGLU(hard_limit=True),
            nn.Linear(inner_dim, dim),
            _dropout
        )

    def forward(self, x):
        norm_x = F.rms_norm(x, (x.size(-1), ))
        x = x + self.net(norm_x)
        return x, norm_x
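
LYNXNet2Block is a shape-preserving residual block (depthwise Conv1d followed by two ATanGLU-gated linear layers) that also returns the RMS-normalised input it operated on; FastPD below collects those normalised tensors as feature maps. A small example with illustrative sizes (F.rms_norm needs a recent PyTorch release):

# Illustrative sizes; both outputs keep the input shape.
block = LYNXNet2Block(dim=32, expansion_factor=1, kernel_size=11)
h = torch.randn(2, 50, 32)   # (batch, frames, channels)
out, norm_h = block(h)
print(out.shape, norm_h.shape)   # torch.Size([2, 50, 32]) torch.Size([2, 50, 32])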


class FastPD(torch.nn.Module):
    def __init__(self, period, init_channel=8, strides=[4, 4, 4], kernel_size=11):
        super(FastPD, self).__init__()
        self.period = period
        self.strides = strides
        self.pre = nn.Linear(1, init_channel)
        self.residual_layers = nn.ModuleList(
            [
                LYNXNet2Block(
                    dim=init_channel * np.prod(strides[: i + 1]),
                    expansion_factor=1,
                    kernel_size=kernel_size,
                    dropout=0
                )
                for i in range(len(strides))
            ]
        )
        self.post = nn.Linear(init_channel * np.prod(strides), 1)

    def forward(self, x):
        fmap = []

        # 1d to 2d
        b, _, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, 1, t // self.period, self.period)
        x = x.permute(0, 3, 2, 1).reshape(b * self.period, t // self.period, 1)

        x = self.pre(x)
        for i, layer in enumerate(self.residual_layers):
            if self.strides[i] > 1:
                x = combine_frames(x, self.strides[i])
            x, norm_x = layer(x)
            if i > 0:
                fmap.append(norm_x.reshape(b, -1))
        x = self.post(F.rms_norm(x, (x.size(-1), )))
        x = x.reshape(b, -1)

        return x, fmap
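
FastPD reshapes the waveform so that every period-th sample forms its own sequence, embeds each sample with a linear layer, then alternates combine_frames downsampling with LYNXNet2Block stages, so the frame axis shrinks by each stride while the channel count grows by the same factor. A shape walkthrough with made-up sizes:

# Illustrative walkthrough: period 2, default strides [4, 4, 4].
pd = FastPD(period=2, init_channel=8, strides=[4, 4, 4], kernel_size=11)
wav = torch.randn(1, 1, 8192)   # (batch, 1, samples)
score, fmap = pd(wav)
# per-phase frames: 8192 / 2 = 4096 -> 1024 -> 256 -> 64; channels: 8 -> 32 -> 128 -> 512
print(score.shape, [f.shape for f in fmap])
# torch.Size([1, 128]) [torch.Size([1, 65536]), torch.Size([1, 65536])]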


class FastMPD(torch.nn.Module):
    def __init__(self, periods=None, init_channel=8, strides=[1, 2, 4, 4, 2], kernel_size=31):
        super(FastMPD, self).__init__()
        self.periods = periods if periods is not None else [2, 3, 5, 7, 11]
        self.discriminators = nn.ModuleList()
        for period in self.periods:
            self.discriminators.append(
                FastPD(period, init_channel=init_channel, strides=strides, kernel_size=kernel_size))

    def forward(self, y):
        y_d_rs = []
        fmap_rs = []

        for i, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)

        return y_d_rs, fmap_rs
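
FastMPD simply runs one FastPD per period and returns the per-period score tensors alongside the feature maps. How training.nsf_HiFigan_fast_task consumes these outputs is not part of this commit; the sketch below only shows the common HiFi-GAN-style pattern (least-squares discriminator loss plus feature matching), with the hyper-parameters taken from configs/nsf_hifigan_fast.yaml and random tensors standing in for real and generated audio.

# Hedged usage sketch -- not the repo's training code.
mpd = FastMPD(periods=[2, 3, 5, 7, 11], strides=[4, 4, 4], kernel_size=11)
real = torch.randn(1, 1, 8192)   # stand-ins for real / generated waveforms
fake = torch.randn(1, 1, 8192)   # (the generated batch would normally be detached for the D step)

real_scores, real_fmaps = mpd(real)
fake_scores, fake_fmaps = mpd(fake)

# Least-squares discriminator loss summed over all periods.
d_loss = sum(((1 - r) ** 2).mean() + (f ** 2).mean()
             for r, f in zip(real_scores, fake_scores))

# Feature-matching term for the generator, comparing the collected norm_x maps.
fm_loss = sum(F.l1_loss(ff, rf.detach())
              for real_f, fake_f in zip(real_fmaps, fake_fmaps)
              for rf, ff in zip(real_f, fake_f))
print(d_loss.item(), fm_loss.item())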
