Scrapping previous changes/attempts to fix bug. Starting a fix that allows problematic nodes to generate atomExt extensions that do not split the node when the optimization dimension in the regularization dictionary is more specific than the atomtype of the atom being extended. For example, if the atomtype of an atom labeled *5 is [Si, F, Li, N, C, P, S] and the regularization dictionary has an optimization dimension that narrows down these atomtypes (e.g. reg_dim_atm[0] = <N,C>), then we can allow atomExt extensions that change *5's atomtype to [N,C] (rather than just [N] or just [C]). This way, we have an extension that narrows *5 down from [Si, F, Li, N, C, P, S] to <N,C> but still matches all of the training reactions at the node, so the regularization information (reg_dim_atm[1]) is passed to the group.

Nora Khalil · Nora Khalil · commit 7b73d0cb571e · 2025-11-19T12:33:55.000-05:00
diff --git a/rmgpy/data/kinetics/family.py b/rmgpy/data/kinetics/family.py
@@ -2958,7 +2958,6 @@ def get_extension_edge(self, parent, template_rxn_map, obj, T, iter_max=np.inf,
                     elif typ == 'bondExt':
                         reg_dict[(typ, indc)][0].extend(grp2.get_bond(grp2.atoms[indc[0]], grp2.atoms[indc[1]]).order)
 
-
                 elif boo:  # this extension matches all reactions (regularization dim)
                     if typ == 'intNewBondExt' or typ == 'extNewBondExt':
                         # these are bond formation extensions, we want to expand these until we get splits
@@ -2984,11 +2983,10 @@ def get_extension_edge(self, parent, template_rxn_map, obj, T, iter_max=np.inf,
                 reg_val = reg_dict[(typr, indcr)]
 
                 if first_time and parent.children == []:
-                    
-                    #parent
+                    # parent
                     if typr != 'intNewBondExt' and typr != 'extNewBondExt':  # these dimensions should be regularized
                         if typr == 'atomExt':
-                            pass #no longer passing regularization info to the parent here. Doing this instead in `extend_node`
+                            grp.atoms[indcr[0]].reg_dim_atm = list(reg_val)
                         elif typr == 'elExt':
                             grp.atoms[indcr[0]].reg_dim_u = list(reg_val)
                         elif typr == 'ringExt':
@@ -3082,49 +3080,6 @@ def get_extension_edge(self, parent, template_rxn_map, obj, T, iter_max=np.inf,
             out.extend(x)
 
         return out, gave_up_split
-    
-    def get_compliment_reg_dim(self, parent, template_rxn_map, new_ext, comp_ext):
-        """
-        Function takes in a parent node (`parent`), an extension node (`new_ext`) and its compliment (`comp_ext`). 
-        Reactions of the parent node are split to extension and compliment.
-        Iterating over all the reactions that fit the complimentary node, the atomtypes of each labeled atom in each reaction are saved to a dictionary `atom_labeling_in_comp_rxns`, 
-        where the key is the integer of the atom label (i.e. 5 in '*5') and the value is a set of all the atomtypes in all the complimentary reactions with that atom label.  
-
-        Additionally, when iterating over all the reactions that fit the complimentary node, the atomtypes of each unlabeled atom in each reaction are saved to a list `unlabeled_atoms_in_comp_rxns`. 
-        """
-        
-        
-        assert comp_ext is not None, "This extension does not include a complimentary node. Cannot get regularization dimensions of complimentary node."
-        
-        #divide parent reactions into the extension node and its compliment
-        rxns_from_parent = template_rxn_map[parent.label]
-        new_ext_rxns, comp_ext_rxns, _ = self._split_reactions(rxns_from_parent, new_ext)
-        
-        #for saving data
-        atom_labeling_in_comp_rxns = dict()
-        unlabeled_atoms_in_comp_rxns = []
-
-        #iterate through each complimentary rxn
-        for rxn_c in comp_ext_rxns: 
-            for reactant in rxn_c.reactants: 
-                for mol in reactant.molecule:
-                    for atm in mol.atoms:
-                        if atm.label == '':
-                            #this atom was unlabeled 
-                            unlabeled_atmtype = atm.atomtype 
-                            if unlabeled_atmtype not in unlabeled_atoms_in_comp_rxns: 
-                                unlabeled_atoms_in_comp_rxns.append(unlabeled_atmtype)
-                        else: 
-                            #this is a labeled atom
-                            atm_label = int(atm.label.replace('*',''))
-                            if atm_label not in atom_labeling_in_comp_rxns.keys():
-                                atom_labeling_in_comp_rxns[atm_label] = [atm.atomtype]
-                            else: 
-                                existing_atomtypes = atom_labeling_in_comp_rxns[atm_label]
-                                existing_atomtypes.append(atm.atomtype)
-        atom_labeling_in_comp_rxns_set = {k: set(v) for k, v in atom_labeling_in_comp_rxns.items()}
-
-        return atom_labeling_in_comp_rxns_set, unlabeled_atoms_in_comp_rxns
 
     def extend_node(self, parent, template_rxn_map, obj=None, T=1000.0, iter_max=np.inf, iter_item_cap=np.inf):
         """
@@ -3201,36 +3156,9 @@ def extend_node(self, parent, template_rxn_map, obj=None, T=1000.0, iter_max=np.
 
         extname = ext[2]
 
-
         if ext[3] == 'atomExt':
-            ext[0].atoms[ext[4][0]].reg_dim_atm = [ext[0].atoms[ext[4][0]].atomtype, ext[0].atoms[ext[4][0]].atomtype] #passing regularization information to the selected extension node
-            
-            #handling regularization in complement below:
-            atom_labeling_in_comp_rxns_set, unlabeled_atoms_in_comp_rxns = self.get_compliment_reg_dim(parent, template_rxn_map, ext[0], ext[1])
-            
-            #regularize the atom in which the extension was performed on
-            if ext[1].atoms[ext[4][0]].label=='':
-                #extension was performed on an unlabeled atom, so pass in regularization dimensions that are at least limited to the atomtypes of all the unlabeled atoms
-                limited_atomtypes_comp = set(ext[1].atoms[ext[4][0]].atomtype).intersection(set(unlabeled_atoms_in_comp_rxns))
-                ext[1].atoms[ext[4][0]].reg_dim_atm = [ext[1].atoms[ext[4][0]].atomtype, list(limited_atomtypes_comp)]
-            else: 
-                #extension was performed on a labeled atom. For each labeled atom, we know all the atomtypes in the training reactions. Let's limit regularization dimensions to these known atomtypes
-                adjusted_index = int(ext[1].atoms[ext[4][0]].label.replace('*','')) #i.e. ext[4]= (3,), ext[4][0] = 3, ext[0].atoms[3]=<GroupAtom [*5 'N', 'C']>, ext[0].atoms[3].label = '*5'
-                ext[1].atoms[ext[4][0]].reg_dim_atm = [ext[1].atoms[ext[4][0]].atomtype, list(atom_labeling_in_comp_rxns_set[adjusted_index])]
-        
-        #make sure the rest of the atoms in the extension take on the same regularization dimensions as the parent. Ensures subgraph isomorphism. 
-        for i, parent_atm in enumerate(parent.item.atoms): 
-            if i == ext[4][0]: 
-                continue #this is the atom that the extension is focused on, handled above if the extension was an 'atomExt' extension type
-            elif parent_atm.reg_dim_atm[1]==[]:
-                continue #only take on regularization dimensions of parent if there is some
-            else: 
-                ext[0].atoms[i].reg_dim_atm[1] = parent_atm.reg_dim_atm[1] #passing regularization info from parent to the extension
-                if ext[1] is not None: #check if there's a complimentary node
-                    ext[1].atoms[i].reg_dim_atm[1] = parent_atm.reg_dim_atm[1] #passing regularization info from parent to the complimentary extension
-
-
-        if ext[3] == 'elExt':
+            ext[0].atoms[ext[4][0]].reg_dim_atm = [ext[0].atoms[ext[4][0]].atomtype, ext[0].atoms[ext[4][0]].atomtype]
+        elif ext[3] == 'elExt':
             ext[0].atoms[ext[4][0]].reg_dim_u = [ext[0].atoms[ext[4][0]].radical_electrons,
                                                  ext[0].atoms[ext[4][0]].radical_electrons]
 
@@ -3318,7 +3246,6 @@ def extend_node(self, parent, template_rxn_map, obj=None, T=1000.0, iter_max=np.
             template_rxn_map[cextname] = comp_entries
         else:
             template_rxn_map[parent.label] = comp_entries
-
         return True
 
     def generate_tree(self, rxns=None, obj=None, thermo_database=None, T=1000.0, nprocs=1, min_splitable_entry_num=2,
@@ -3850,10 +3777,9 @@ def simple_regularization(self, node, template_rxn_map, test=True):
             self.simple_regularization(child, template_rxn_map)
 
         grp = node.item
-        parent = node.parent.item
         rxns = template_rxn_map[node.label]
 
-        R = ['H', 'C', 'N', 'O', 'Si', 'S', 'Cl', 'F', 'Br', 'Li']  # set of possible R elements/atoms
+        R = ['H', 'C', 'N', 'O', 'Si', 'S', 'Cl', 'F', 'Br']  # set of possible R elements/atoms
         R = [ATOMTYPES[x] for x in R]
 
         RnH = R[:]
@@ -3868,15 +3794,14 @@ def simple_regularization(self, node, template_rxn_map, test=True):
             for i, atm1 in enumerate(grp.atoms):
 
                 skip = False
-                if i <= len(parent.atoms)-1: #if we aren't at an atom definition that the parent node doesn't have (due to this child being an extNewBondExt type)
-                    if node.children == [] and parent.atoms[i].reg_dim_atm[1]==[]:  # if the atoms or bonds are graphically indistinguishable don't regularize
-                        bdpairs = {(atm, tuple(bd.order)) for atm, bd in atm1.bonds.items()}
-                        for atm2 in grp.atoms:
-                            if atm1 is not atm2 and atm1.atomtype == atm2.atomtype and len(atm1.bonds) == len(atm2.bonds):
-                                bdpairs2 = {(atm, tuple(bd.order)) for atm, bd in atm2.bonds.items()}
-                                if bdpairs == bdpairs2:
-                                    skip = True
-                                    indistinguishable.append(i)
+                if node.children == []:  # if the atoms or bonds are graphically indistinguishable don't regularize
+                    bdpairs = {(atm, tuple(bd.order)) for atm, bd in atm1.bonds.items()}
+                    for atm2 in grp.atoms:
+                        if atm1 is not atm2 and atm1.atomtype == atm2.atomtype and len(atm1.bonds) == len(atm2.bonds):
+                            bdpairs2 = {(atm, tuple(bd.order)) for atm, bd in atm2.bonds.items()}
+                            if bdpairs == bdpairs2:
+                                skip = True
+                                indistinguishable.append(i)
 
                 if not skip and atm1.reg_dim_atm[1] != [] and set(atm1.reg_dim_atm[1]) != set(atm1.atomtype):
                     atyp = atm1.atomtype
@@ -3888,14 +3813,14 @@ def simple_regularization(self, node, template_rxn_map, test=True):
 
                         vals = list(set(atyp) & set(atm1.reg_dim_atm[1]))
                         assert vals != [], 'cannot regularize to empty'
-                        #if all([set(child.item.atoms[i].atomtype) <= set(vals) for child in node.children]):
-                        if not test:
-                            atm1.atomtype = vals
-                        else:
-                            oldvals = atm1.atomtype
-                            atm1.atomtype = vals
-                            if not self.rxns_match_node(node, rxns):
-                                atm1.atomtype = oldvals
+                        if all([set(child.item.atoms[i].atomtype) <= set(vals) for child in node.children]):
+                            if not test:
+                                atm1.atomtype = vals
+                            else:
+                                oldvals = atm1.atomtype
+                                atm1.atomtype = vals
+                                if not self.rxns_match_node(node, rxns):
+                                    atm1.atomtype = oldvals
 
                 if not skip and atm1.reg_dim_u[1] != [] and set(atm1.reg_dim_u[1]) != set(atm1.radical_electrons):
                     if len(atm1.radical_electrons) == 1:
diff --git a/rmgpy/molecule/group.py b/rmgpy/molecule/group.py
@@ -1571,7 +1571,7 @@ def get_extensions(self, r=None, r_bonds=None, r_un=None, basename='', atm_ind=N
         """
         cython.declare(atoms=list, atm=GroupAtom, atm2=GroupAtom, bd=GroupBond, i=int, j=int,
                        extents=list, RnH=list, typ=list)
-
+        print('im in')
         extents = []
         if r_bonds is None:
             r_bonds = [1, 1.5, 2, 3, 4]
@@ -1684,6 +1684,7 @@ def get_extensions(self, r=None, r_bonds=None, r_un=None, basename='', atm_ind=N
                     elif typ[0].label == 'R!H':
                         extents.extend(self.specify_atom_extensions(i, basename, list(set(atm.reg_dim_atm[0]) & set(r))))
                 else:
+                    print(set(typ), set(atm.reg_dim_atm[0]), list(set(typ) & set(atm.reg_dim_atm[0])))
                     extents.extend(self.specify_atom_extensions(i, basename, list(set(typ) & set(atm.reg_dim_atm[0]))))
             if atm.reg_dim_u == []:
                 if len(atm.radical_electrons) != 1:
@@ -1726,6 +1727,8 @@ def specify_atom_extensions(self, i, basename, r):
 
         grps = []
         Rset = set(r)
+
+        #consider node splitting        
         for item in r:
             grp = deepcopy(self)
             grpc = deepcopy(self)
@@ -1751,6 +1754,31 @@ def specify_atom_extensions(self, i, basename, r):
             grps.append(
                 (grp, grpc, basename + '_' + str(i + 1) + old_atom_type_str + '->' + item.label, 'atomExt', (i,)))
 
+        #generate an extension without node splitting
+        if len(self.atoms[i].atomtype)>len(Rset):
+            if all(r in self.atoms[i].atomtype for r in Rset): 
+                #that means even if we update the atomtype of the atom to the Rset, it will still be a specification
+                grp = deepcopy(self)
+                grp.atoms[i].atomtype = list(Rset)
+                
+                #rename
+                old_atom_type = grp.atoms[i].atomtype
+
+                if len(old_atom_type) > 1:
+                    labelList = []
+                    old_atom_type_str = ''
+                    for k in old_atom_type:
+                        labelList.append(k.label)
+                    for p in sorted(labelList):
+                        old_atom_type_str += p
+                elif len(old_atom_type) == 0:
+                    old_atom_type_str = ""
+                else:
+                    old_atom_type_str = old_atom_type[0].label
+
+                grps.append(
+                (grp, None, basename + '_' + str(i + 1) + old_atom_type_str + '->' + ''.join(r.label for r in Rset), 'atomExt', (i,)))
+       
         return grps
 
     def specify_ring_extensions(self, i, basename):