add GED metrics; add GMM method

hummerichsander · hummerichsander · commit 58da8380f800 · 2026-01-26T17:20:09.000+01:00
diff --git a/split_flows/models/split_flow.py b/split_flows/models/split_flow.py
@@ -14,14 +14,17 @@
 import torch.nn as nn
 from torch.distributions import Distribution
 
-from egnn_pytorch import EGNN_Network
+import mdtraj as md
+from moleculekit.molecule import Molecule
+
+from egnn_pytorch import EGNN_Network, EGNN
 from sklearn.mixture import GaussianMixture
 from hydrantic.model import Model, ModelHparams
-from moleculekit.molecule import Molecule
 
 from split_flows.utils.interpolant import Interpolation, Interpolant
 from split_flows.mixins.continuous_flow import ContinuousFlowMixin
 from split_flows.utils.utils import to_one_hot, sum_except_batch, match_dims
+from split_flows.utils.metrics import graph_edit_distance
 
 
 logging.basicConfig(level=logging.INFO)
@@ -72,9 +75,9 @@ def augment_gmm(self, R: Tensor, gmm: GaussianMixture) -> Tensor:
         :return: Full set of coordinates with noise."""
 
         z = torch.empty((R.shape[0], self.num_particles, R.shape[2]), device=R.device)
-        z_gmm = torch.tensor(
-            gmm.sample(R.shape[0])[0], dtype=R.dtype, device=R.device
-        ).view(R.shape[0], -1, 3)
+        z_gmm = torch.tensor(gmm.sample(R.shape[0])[0], dtype=R.dtype, device=R.device).view(
+            R.shape[0], -1, 3
+        )
 
         start_idx = 0
         for i, (cg_idx, noise_idx) in enumerate(self.latent_groupings):
@@ -115,9 +118,7 @@ def log_prob(self, value: Tensor) -> Tensor:
             R_cg = value[:, cg_idx, :]
             R_noise = value[:, noise_idx, :]
             exponential_term = (
-                -0.5
-                * sum_except_batch((R_noise - R_cg[:, None, :]) ** 2)
-                / self.scale**2
+                -0.5 * sum_except_batch((R_noise - R_cg[:, None, :]) ** 2) / self.scale**2
             )
             normalization_term = -torch.log(Z) * R_noise.shape[1]
             log_prob += exponential_term + normalization_term
@@ -150,13 +151,56 @@ def __init__(
         self.atom_embedding = nn.Linear(atom_types.size(-1), self.dim)
         self.bead_embedding = nn.Linear(bead_types.size(-1), self.dim)
 
+        self._init_weights()
+
     def forward(self, x: Tensor, t: Tensor) -> Tensor:
         atom_embeddings = self.atom_embedding(self.atom_types).repeat(x.size(0), 1, 1)
         bead_embeddings = self.bead_embedding(self.bead_types).repeat(x.size(0), 1, 1)
         t = match_dims(t, x).repeat(1, x.shape[1], 1)
         h = torch.cat([atom_embeddings, bead_embeddings, t], dim=-1)
         return self.net(h + torch.randn_like(h), x)[1]
 
+    def _init_weights(self) -> None:
+        """Re-initialize the embedding layers and every EGNN block found in ``self.net``.
+
+        - Embeddings and message MLPs (``edge_mlp``, ``node_mlp``): Xavier uniform
+        - Coordinate MLP (``coors_mlp``): Xavier uniform, last layer scaled down by 0.01
+        - Biases: zero initialization
+
+        EGNN blocks are collected via ``self.net.modules()`` so that blocks nested
+        inside container modules are found, not only direct children of ``layers``.
+        """
+
+        def init_linear(linear: nn.Linear, gain: float = 1.0) -> None:
+            nn.init.xavier_uniform_(linear.weight, gain=gain)
+            if linear.bias is not None:
+                nn.init.zeros_(linear.bias)
+
+        def linears_of(module: nn.Module | None) -> list[nn.Linear]:
+            # A sub-MLP may be absent (None) depending on how the block was configured.
+            if module is None:
+                return []
+            return [m for m in module.modules() if isinstance(m, nn.Linear)]
+
+        init_linear(self.atom_embedding)
+        init_linear(self.bead_embedding)
+
+        for block in self.net.modules():
+            if not isinstance(block, EGNN):
+                continue
+
+            # Message MLPs (phi_e, phi_h)
+            for name in ("edge_mlp", "node_mlp"):
+                for linear in linears_of(getattr(block, name, None)):
+                    init_linear(linear)
+
+            # Coordinate MLP: a small last layer keeps the initial coordinate updates small.
+            coord_linears = linears_of(getattr(block, "coors_mlp", None))
+            for linear in coord_linears[:-1]:
+                init_linear(linear)
+            for linear in coord_linears[-1:]:
+                init_linear(linear, gain=0.01)
+
 
 class SplitFlowHparams(ModelHparams):
     aa_topology_path: str
@@ -181,12 +225,14 @@ class SplitFlowHparams(ModelHparams):
 class SplitFlow(Model[SplitFlowHparams], ContinuousFlowMixin):
     hparams_schema = SplitFlowHparams
 
-    def __init__(self, thparams: SplitFlowHparams):
-        super(SplitFlow, self).__init__(thparams)
+    def __init__(self, hparams: SplitFlowHparams):
+        super(SplitFlow, self).__init__(hparams)
 
         # Load the all-atom and coarse-grained topologies
         self.mol_aa = Molecule(self.thparams.aa_topology_path)
         self.mol_cg = Molecule(self.thparams.cg_topology_path)
+        self.top_aa = md.load_topology(self.thparams.aa_topology_path)
+        self.top_cg = md.load_topology(self.thparams.cg_topology_path)
 
         # Define the CG mapping
         if not hasattr(self.thparams, "cg_map_matrix_path"):
@@ -266,9 +312,7 @@ def velocity(self, xt: Tensor, t: Tensor) -> Tensor:
 
         return self.velo_net(xt, t)
 
-    def compute_metrics(
-        self, batch: tuple[Tensor, ...], batch_idx: int
-    ) -> dict[str, Tensor]:
+    def compute_metrics(self, batch: tuple[Tensor, ...], batch_idx: int) -> dict[str, Tensor]:
         """Compute training/validation metrics.
 
         :param batch: Batch data tuple, expecting (r,) where r is a Tensor.
@@ -291,6 +335,11 @@ def compute_metrics(
         metrics["loss_fm"] = sum_except_batch(torch.pow(vt_hat - vt, 2)).mean()
         metrics["loss"] += metrics["loss_fm"]
 
+        if not self.training:
+            x1 = self.compute_flow(x0, return_intermediate=False, verbose=False)
+            traj = md.Trajectory(x1.cpu().numpy(), self.top_aa)
+            metrics["ged"] = torch.mean(torch.tensor(graph_edit_distance(traj=traj, verbose=False)))
+
         return metrics
 
     @property
@@ -322,3 +371,36 @@ def indices_split(self) -> tuple[Tensor, Tensor]:
         noise_indices = torch.tensor(noise_list, device=self.device, dtype=torch.long)
 
         return cg_indices, noise_indices
+
+    def fit_latent_gmm(
+        self,
+        r: Tensor,
+        n_components: int,
+        chunk_size: int | None = None,
+        verbose: bool = False,
+        *args,
+        **kwargs,
+    ) -> GaussianMixture:
+        """Fit a Gaussian Mixture Model to the latent representations of the fine-grained data.
+
+        The configurations are mapped through the reverse flow, converted with
+        ``self.noise.to_standard_normal`` and the GMM is fitted on the flattened noise particles.
+
+        :param r: fine-grained configurations
+        :param n_components: number of GMM components
+        :param chunk_size: chunk size for processing data in batches
+        :param verbose: whether to display a progress bar
+        :param args: additional arguments for GaussianMixture
+        :param kwargs: additional keyword arguments for GaussianMixture
+        :return: fitted GMM"""
+
+        with torch.no_grad():
+            x1 = r.to(self.device)
+            x0 = self.compute_flow(x1, reverse=True, chunk_size=chunk_size, verbose=verbose).cpu()
+            noise_idx = self.indices_split[1].cpu()
+            eps_sn = self.noise.to_standard_normal(x0)[:, noise_idx, :].view(x0.shape[0], -1)
+
+        # n_components is positional: *args after a keyword would rebind it and raise TypeError.
+        gmm = GaussianMixture(n_components, *args, **kwargs)
+        gmm.fit(eps_sn.numpy())
+        return gmm
diff --git a/split_flows/utils/metrics.py b/split_flows/utils/metrics.py
@@ -0,0 +1,209 @@
+from tqdm import tqdm
+
+import mdtraj as md
+import numpy as np
+import torch
+
+
+def compute_cg_rmsd(
+    traj1: md.Trajectory, traj2: md.Trajectory, indices: list[int], splits: int = 5
+) -> list[float]:
+    """Average the per-frame coarse-grained RMSD between two trajectories over several splits.
+
+    Frame ``j`` of ``traj2`` is compared with frame ``j`` of ``traj1`` using only the atoms in
+    ``indices``. The frames are divided into ``splits`` consecutive chunks and the last chunk
+    also takes the remainder frames.
+
+    :param traj1: first trajectory (its frame count defines the splits)
+    :param traj2: second trajectory
+    :param indices: indices of atoms to retain in the coarse-grained representation
+    :param splits: number of splits to average over
+    :return: list with the mean RMSD of each split"""
+
+    frames_per_split = traj1.n_frames // splits
+    bounds = [k * frames_per_split for k in range(splits)] + [traj1.n_frames]
+
+    mean_rmsds = []
+    for first, last in zip(bounds[:-1], bounds[1:]):
+        per_frame = [
+            md.rmsd(traj2[j], traj1[j], 0, atom_indices=indices) for j in tqdm(range(first, last))
+        ]
+        mean_rmsds.append(np.mean(per_frame))
+    return mean_rmsds
+
+
+COVCUTOFFTABLE = {  # keyed by atomic number; values in Angstroms, summed pairwise for cutoffs
+    1: 0.23,
+    2: 0.93,
+    3: 0.68,
+    4: 0.35,
+    5: 0.83,
+    6: 0.68,
+    7: 0.68,
+    8: 0.68,
+    9: 0.64,
+    10: 1.12,
+    11: 0.97,
+    12: 1.1,
+    13: 1.35,
+    14: 1.2,
+    15: 0.75,
+    16: 1.02,
+    17: 0.99,
+    18: 1.57,
+    19: 1.33,
+    20: 0.99,
+    21: 1.44,
+    22: 1.47,
+    23: 1.33,
+    24: 1.35,
+    25: 1.35,
+    26: 1.34,
+    27: 1.33,
+    28: 1.5,
+    29: 1.52,
+    30: 1.45,
+    31: 1.22,
+    32: 1.17,
+    33: 1.21,
+    34: 1.22,
+    35: 1.21,
+    36: 1.91,
+    37: 1.47,
+    38: 1.12,
+    39: 1.78,
+    40: 1.56,
+    41: 1.48,
+    42: 1.47,
+    43: 1.35,
+    44: 1.4,
+    45: 1.45,
+    46: 1.5,
+    47: 1.59,
+    48: 1.69,
+    49: 1.63,
+    50: 1.46,
+    51: 1.46,
+    52: 1.47,
+    53: 1.4,
+    54: 1.98,
+    55: 1.67,
+    56: 1.34,
+    57: 1.87,
+    58: 1.83,
+    59: 1.82,
+    60: 1.81,
+    61: 1.8,
+    62: 1.8,
+    63: 1.99,
+    64: 1.79,
+    65: 1.76,
+    66: 1.75,
+    67: 1.74,
+    68: 1.73,
+    69: 1.72,
+    70: 1.94,
+    71: 1.72,
+    72: 1.57,
+    73: 1.43,
+    74: 1.37,
+    75: 1.35,
+    76: 1.37,
+    77: 1.32,
+    78: 1.5,
+    79: 1.5,
+    80: 1.7,
+    81: 1.55,
+    82: 1.54,
+    83: 1.54,
+    84: 1.68,
+    85: 1.7,
+    86: 2.4,
+    87: 2.0,
+    88: 1.9,
+    89: 1.88,
+    90: 1.79,
+    91: 1.61,
+    92: 1.58,
+    93: 1.55,
+    94: 1.53,
+    95: 1.51,
+    96: 1.5,
+    97: 1.5,
+    98: 1.5,
+    99: 1.5,
+    100: 1.5,
+    101: 1.5,
+    102: 1.5,
+    103: 1.5,
+    104: 1.57,
+    105: 1.49,
+    106: 1.43,
+    107: 1.41,
+}
+
+
+def compute_bond_cutoff_mdtraj(topology, scale=1.3):
+    """Return the pairwise bond cutoff matrix (in nm) for the atoms of an MDTraj topology.
+
+    Entry ``[i, j]`` is ``scale`` times the sum of the COVCUTOFFTABLE values of atoms i and j."""
+    # COVCUTOFFTABLE values are in Angstroms, convert to nanometers for MDTraj
+    radii_nm = torch.Tensor(
+        [COVCUTOFFTABLE[int(atom.element.atomic_number)] / 10.0 for atom in topology.atoms]
+    )
+    return (radii_nm.unsqueeze(0) + radii_nm.unsqueeze(1)) * scale
+
+
+def compute_distance_mat_mdtraj(xyz, device="cpu"):
+    """Return the matrix of pairwise Euclidean distances between the rows of ``xyz``.
+
+    The coordinates are converted to a float tensor on ``device`` first."""
+    coords = torch.Tensor(xyz).to(device)
+    return (coords.unsqueeze(1) - coords.unsqueeze(0)).pow(2).sum(-1).sqrt()
+
+
+def get_bond_graphs_mdtraj(traj, frame_idx=0, device="cpu", scale=1.3):
+    """Build the distance-based bond adjacency matrix of one frame of an MDTraj trajectory.
+
+    Two atoms count as bonded when their distance is below the pairwise cutoff from
+    ``compute_bond_cutoff_mdtraj``. The diagonal is cleared and the result is returned as
+    a ``torch.long`` tensor on the CPU."""
+    pair_dist = compute_distance_mat_mdtraj(traj.xyz[frame_idx], device=device)
+    pair_cutoff = compute_bond_cutoff_mdtraj(traj.topology, scale=scale).to(device)
+    is_bonded = pair_dist < pair_cutoff
+    is_bonded[np.diag_indices(traj.n_atoms)] = 0
+    return is_bonded.to(torch.long).to("cpu")
+
+
+def compare_graph_mdtraj(ref_traj, traj, ref_frame=0, frame=0, scale=1.3):
+    """Count the adjacency-matrix entries that differ between two trajectory frames.
+
+    Both bond graphs come from ``get_bond_graphs_mdtraj``; the count is taken over the
+    full matrix, so entries (i, j) and (j, i) are counted separately."""
+    graph_ref = get_bond_graphs_mdtraj(ref_traj, frame_idx=ref_frame, scale=scale)
+    graph = get_bond_graphs_mdtraj(traj, frame_idx=frame, scale=scale)
+    return (graph != graph_ref).sum().item()
+
+
+def graph_edit_distance(
+    traj: md.Trajectory, scale: float = 1.3, verbose: bool = True
+) -> list[float]:
+    """Compare the distance-based bond graph of every frame to the topology bonds.
+
+    :param traj: trajectory whose topology defines the reference bonds
+    :param scale: scaling of the bond cutoffs, forwarded to ``get_bond_graphs_mdtraj``
+    :param verbose: whether to display a progress bar
+    :return: per frame, the number of differing adjacency entries divided by the number of
+        reference adjacency entries (inf/nan if the topology defines no bonds)"""
+    n_atoms = traj.n_atoms
+    # Create reference adjacency matrix from topology bonds
+    A_ref = np.zeros((n_atoms, n_atoms), dtype=int)
+    for atom_a, atom_b in traj.topology.bonds:
+        A_ref[atom_a.index, atom_b.index] = A_ref[atom_b.index, atom_a.index] = 1
+    n_bonds = A_ref.sum()
+
+    frames = tqdm(range(traj.n_frames), desc="Computing GED") if verbose else range(traj.n_frames)
+    # Take |.| element-wise *before* summing: with a signed sum a missing bond (-1) and a
+    # spurious bond (+1) cancel, so a broken graph could score 0.
+    graphs = (get_bond_graphs_mdtraj(traj, frame_idx=i, scale=scale).numpy() for i in frames)
+    return [np.abs(A - A_ref).sum() / n_bonds for A in graphs]
diff --git a/split_flows/utils/utils.py b/split_flows/utils/utils.py
@@ -80,7 +80,5 @@ def gradient(
 
     if grad_outputs is None:
         grad_outputs = torch.ones_like(output).detach()
-    grad = torch.autograd.grad(
-        output, x, grad_outputs=grad_outputs, create_graph=create_graph
-    )[0]
+    grad = torch.autograd.grad(output, x, grad_outputs=grad_outputs, create_graph=create_graph)[0]
     return grad