Source code for chainer_chemistry.dataset.preprocessors.mol_preprocessor

from rdkit import Chem

from chainer_chemistry.dataset.preprocessors.base_preprocessor import BasePreprocessor  # NOQA


class MolPreprocessor(BasePreprocessor):
    """Preprocessor base class specialised for RDKit ``Mol`` instances.

    Args:
        add_Hs (bool): If True, the molecule is passed through
            ``Chem.AddHs`` during preparation.
        kekulize (bool): If True, the molecule is Kekulized during
            preparation.
    """

    def __init__(self, add_Hs=False, kekulize=False):
        super(MolPreprocessor, self).__init__()
        self.add_Hs = add_Hs
        self.kekulize = kekulize

    def prepare_smiles_and_mol(self, mol):
        """Build the `smiles` and `mol` consumed by later preprocessing.

        The parser class calls this method before `get_input_features`.
        Subclasses may override it to customise how `smiles`/`mol` are
        extracted.

        Args:
            mol (mol): mol instance

        Returns (tuple): (`smiles`, `mol`), where `smiles` is the
            non-isomeric canonical SMILES of the input and `mol` is a
            molecule re-parsed from that SMILES string.

        """
        # A molecule can be written as many different SMILES strings, so
        # the canonical form is used to get a single representation.
        canonical_smiles = Chem.MolToSmiles(
            mol, isomericSmiles=False, canonical=True)
        rebuilt = Chem.MolFromSmiles(canonical_smiles)
        if self.add_Hs:
            rebuilt = Chem.AddHs(rebuilt)
        if self.kekulize:
            # Kekulize works in place; its return value is not used.
            Chem.Kekulize(rebuilt)
        return canonical_smiles, rebuilt

    def get_label(self, mol, label_names=None):
        """Collect label values stored as properties on a molecule.

        Every name in ``label_names`` is looked up as a property of
        ``mol``. The values are returned in the same order as
        ``label_names``; a name for which ``mol`` has no property yields
        ``None`` at the corresponding position.

        Args:
            mol (rdkit.Chem.Mol): molecule whose labels are extracted
            label_names (None or iterable): names of the labels to read.

        Returns:
            list of str: label values, one entry per element of
            ``label_names``. An empty list is returned when
            ``label_names`` is ``None``.

        """
        if label_names is None:
            return []

        # TODO(Nakago): Review how missing labels are represented.
        # A placeholder of -1 would work for classification but is a
        # poor choice for regression, so ``None`` is used for now.
        return [
            mol.GetProp(name) if mol.HasProp(name) else None
            for name in label_names
        ]

    def get_input_features(self, mol):
        """Return the feature representation (descriptor) of a molecule.

        This base implementation always raises; each subclass must
        override this method.

        Args:
            mol (Mol): molecule whose features are extracted. `mol` is
                prepared by the method `prepare_smiles_and_mol`.

        Raises:
            NotImplementedError: always, in this base class.

        """
        raise NotImplementedError

    def process(self, filepath):
        """Do nothing; this method is currently unused."""
        pass