"""
Copyright (C), WuLab
Author : Chen
Date : 2021-10-11 18:49:54
Project : skeleton extraction for TeroMOL
Description: python skeleton_extraction.py, RDKit (http://www.rdkit.org/) is needed.
"""

import re
from typing import Optional

from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold

# Number of 'C' characters the selected scaffold SMILES should be closest to,
# keyed by the category names accepted by get_skeleton().
_TARGET_CARBON_COUNTS = {
    'Sesquiterpenoids': 15,
    'Diterpenoids': 20,
    'Triterpenoids': 30,
}

# Bracketed carbon atoms such as "[CH]" or "[CH2]" (at most two characters
# after the "C"); they are collapsed to a plain "C" in the final SMILES.
_BRACKET_CARBON_RE = re.compile(r'\[C.{0,2}]')


def _select_core(scaffold_smiles, cat):
    """Pick the core scaffold SMILES from the candidates for category *cat*.

    The empty string is the baseline candidate and is placed first, so a
    scaffold only wins when it is strictly better than every earlier
    candidate (``min``/``max`` return the first best element).
    """
    candidates = [""] + list(scaffold_smiles)
    if not cat:
        # No category: keep the scaffold with the most 'C' characters.
        return max(candidates, key=lambda s: s.count('C'))
    target = _TARGET_CARBON_COUNTS.get(cat)
    if target is None:
        # Unrecognised category: no scaffold is ever selected.
        return ""
    # Known category: keep the scaffold whose 'C' count is closest to target.
    return min(candidates, key=lambda s: abs(s.count('C') - target))


def get_skeleton(smi: str, cat: Optional[str] = None) -> str:
    """Extract the carbon skeleton of a molecule as a SMILES string.

    Every bond between a carbon atom and a non-carbon atom is removed, each
    resulting fragment is passed through ``MurckoScaffold.MakeScaffoldGeneric``
    and one fragment is selected as the core skeleton.

    Args:
        smi: SMILES string of the input molecule.
        cat: Optional category. ``None`` (or any falsy value) selects the
            fragment with the most carbons. 'Sesquiterpenoids',
            'Diterpenoids' and 'Triterpenoids' select the fragment whose
            carbon count is closest to 15, 20 and 30 respectively. Any other
            value selects nothing.

    Returns:
        The non-isomeric SMILES of the selected scaffold with bracketed
        carbon atoms rewritten as plain ``C``, or ``""`` when no fragment is
        selected.

    Raises:
        ValueError: if *smi* cannot be parsed by RDKit.
    """
    parsed = Chem.MolFromSmiles(smi)
    if parsed is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a clear message instead of an opaque error inside RWMol().
        raise ValueError("Invalid SMILES string: {!r}".format(smi))
    mol = Chem.RWMol(parsed)

    # Delete the aromatic flags so the bonds are explicit single/double.
    Chem.Kekulize(mol, clearAromaticFlags=True)

    # Find every bond between a carbon and a non-carbon atom (C-O, C-N, ...)
    # and break it.
    carbon_hetero_bond = Chem.MolFromSmarts('[#6]~[!#6]')
    for begin_idx, end_idx in mol.GetSubstructMatches(carbon_hetero_bond):
        mol.RemoveBond(begin_idx, end_idx)

    # One generic-scaffold SMILES per fragment, computed once each.
    scaffold_smiles = [
        Chem.MolToSmiles(
            MurckoScaffold.MakeScaffoldGeneric(frag), isomericSmiles=False
        )
        for frag in Chem.GetMolFrags(mol, asMols=True)
    ]

    core_smi = _select_core(scaffold_smiles, cat)
    return _BRACKET_CARBON_RE.sub('C', core_smi)


if __name__ == "__main__":
    smi = "C=C(C)[C@@H]1C/C=C2/C(=O)O[C@@H](C[C@]3(C)CC(=O)[C@](CO)(CC(=O)C1)O3)[C@@H]2O"
    category = "Diterpenoids"
    print(get_skeleton(smi, category))