Source code for chainer_chemistry.dataset.preprocessors.common

"""Common preprocess method is gethered in this file"""

from collections import Counter
import numpy
from rdkit import Chem
from rdkit.Chem import rdmolops

from chainer_chemistry.config import MAX_ATOMIC_NUM


[docs]class MolFeatureExtractionError(Exception): pass
# --- Type check ---
[docs]def type_check_num_atoms(mol, num_max_atoms=-1): """Check number of atoms in `mol` does not exceed `num_max_atoms` If number of atoms in `mol` exceeds the number `num_max_atoms`, it will raise `MolFeatureExtractionError` exception. Args: mol (Mol): num_max_atoms (int): If negative value is set, not check number of atoms. """ num_atoms = mol.GetNumAtoms() if num_max_atoms >= 0 and num_atoms > num_max_atoms: # Skip extracting feature. ignore this case. raise MolFeatureExtractionError( 'Number of atoms in mol {} exceeds num_max_atoms {}' .format(num_atoms, num_max_atoms))
# --- Atom preprocessing ---
[docs]def construct_atomic_number_array(mol, out_size=-1): """Returns atomic numbers of atoms consisting a molecule. Args: mol (rdkit.Chem.Mol): Input molecule. out_size (int): The size of returned array. If this option is negative, it does not take any effect. Otherwise, it must be larger than the number of atoms in the input molecules. In that case, the tail of the array is padded with zeros. Returns: numpy.ndarray: an array consisting of atomic numbers of atoms in the molecule. """ atom_list = [a.GetAtomicNum() for a in mol.GetAtoms()] n_atom = len(atom_list) if out_size < 0: return numpy.array(atom_list, dtype=numpy.int32) elif out_size >= n_atom: # 'empty' padding for atom_list # 0 represents empty place for atom atom_array = numpy.zeros(out_size, dtype=numpy.int32) atom_array[:n_atom] = numpy.array(atom_list, dtype=numpy.int32) return atom_array else: raise ValueError('`out_size` (={}) must be negative or ' 'larger than or equal to the number ' 'of atoms in the input molecules (={})' '.'.format(out_size, n_atom))
# --- Adjacency matrix preprocessing ---
[docs]def construct_adj_matrix(mol, out_size=-1, self_connection=True): """Returns the adjacent matrix of the given molecule. This function returns the adjacent matrix of the given molecule. Contrary to the specification of :func:`rdkit.Chem.rdmolops.GetAdjacencyMatrix`, The diagonal entries of the returned matrix are all-one. Args: mol (rdkit.Chem.Mol): Input molecule. out_size (int): The size of the returned matrix. If this option is negative, it does not take any effect. Otherwise, it must be larger than the number of atoms in the input molecules. In that case, the adjacent matrix is expanded and zeros are padded to right columns and bottom rows. self_connection (bool): Add self connection or not. If True, diagonal element of adjacency matrix is filled with 1. Returns: adj_array (numpy.ndarray): The adjacent matrix of the input molecule. It is 2-dimensional array with shape (atoms1, atoms2), where atoms1 & atoms2 represent from and to of the edge respectively. If ``out_size`` is non-negative, the returned its size is equal to that value. Otherwise, it is equal to the number of atoms in the the molecule. """ adj = rdmolops.GetAdjacencyMatrix(mol) s0, s1 = adj.shape if s0 != s1: raise ValueError('The adjacent matrix of the input molecule' 'has an invalid shape: ({}, {}). ' 'It must be square.'.format(s0, s1)) if self_connection: adj = adj + numpy.eye(s0) if out_size < 0: adj_array = adj.astype(numpy.float32) elif out_size >= s0: adj_array = numpy.zeros((out_size, out_size), dtype=numpy.float32) adj_array[:s0, :s1] = adj else: raise ValueError( '`out_size` (={}) must be negative or larger than or equal to the ' 'number of atoms in the input molecules (={}).' .format(out_size, s0)) return adj_array
def construct_discrete_edge_matrix(mol, out_size=-1): """Returns the edge-type dependent adjacency matrix of the given molecule. Args: mol (rdkit.Chem.Mol): Input molecule. out_size (int): The size of the returned matrix. If this option is negative, it does not take any effect. Otherwise, it must be larger than the number of atoms in the input molecules. In that case, the adjacent matrix is expanded and zeros are padded to right columns and bottom rows. Returns: adj_array (numpy.ndarray): The adjacent matrix of the input molecule. It is 3-dimensional array with shape (edge_type, atoms1, atoms2), where edge_type represents the bond type, atoms1 & atoms2 represent from and to of the edge respectively. If ``out_size`` is non-negative, its size is equal to that value. Otherwise, it is equal to the number of atoms in the the molecule. """ if mol is None: raise MolFeatureExtractionError('mol is None') N = mol.GetNumAtoms() if out_size < 0: size = N elif out_size >= N: size = out_size else: raise ValueError( 'out_size {} is smaller than number of atoms in mol {}' .format(out_size, N)) adjs = numpy.zeros((4, size, size), dtype=numpy.float32) bond_type_to_channel = { Chem.BondType.SINGLE: 0, Chem.BondType.DOUBLE: 1, Chem.BondType.TRIPLE: 2, Chem.BondType.AROMATIC: 3 } for bond in mol.GetBonds(): bond_type = bond.GetBondType() ch = bond_type_to_channel[bond_type] i = bond.GetBeginAtomIdx() j = bond.GetEndAtomIdx() adjs[ch, i, j] = 1.0 adjs[ch, j, i] = 1.0 return adjs def construct_supernode_feature(mol, atom_array, adjs, largest_atomic_number=MAX_ATOMIC_NUM, out_size=-1): """ Construct an input feature x' for a supernode Args: mol (rdkit.Chem.Mol): Input molecule atom_array (numpy.ndarray) : array of atoms adjs (numpy.ndarray): N by N 2-way array, or |E| by N by N 3-way array where |E| is the number of edgetypes. largest_atomic_number (int) : number of unique atom maximum index out_size (int): not used... Returns: super_node_x (numpy.ndarray); 1-way array, the supernode feature. len(super_node_x) will be 2 + 2 + MAX_ATOMIC_NUM*2 for 2-way adjs, 2 + 4*2 + MAX_ATOMIC_NUM*2 for 3-way adjs """ if mol is None: raise MolFeatureExtractionError('mol is None') N = mol.GetNumAtoms() E = numpy.sum(adjs.flatten()) if E < 1.0: E = 1.0 if out_size < 0: size = N elif out_size >= N: size = out_size else: raise MolFeatureExtractionError('out_size {} is smaller than number ' 'of atoms in mol {}' .format(out_size, N)) # check the size of adjs if adjs.ndim == 2: super_node_x = numpy.zeros(2 + 2 + largest_atomic_number*2) elif adjs.ndim == 3: super_node_x = numpy.zeros(2 + 4*2 + largest_atomic_number*2) else: raise ValueError('adjs.ndim should be 2 or 3') # end if-else # number of nodes and edges super_node_x[0] = float(N) super_node_x[1] = float(E) # histogram of types of bins if adjs.ndim == 2: adjs_temp = numpy.reshape(adjs, (1, N*N)) edge_type_histo = numpy.sum(adjs_temp, axis=1) / super_node_x[1] super_node_x[2] = numpy.max(adjs_temp, axis=1) super_node_x[3] = edge_type_histo idx_bias = 3 elif adjs.ndim == 3: adjs_temp = numpy.reshape(adjs, (4, N*N)) edge_type_histo = numpy.sum(adjs_temp, axis=1) / super_node_x[1] super_node_x[2:6] = numpy.max(adjs_temp, axis=1) super_node_x[6:10] = edge_type_histo idx_bias = 9 # end if-else # histogram of types of nodes c = Counter(atom_array) keys = c.keys() values = c.values() for k, v in zip(keys, values): if k < largest_atomic_number: super_node_x[idx_bias+k] = 1.0 super_node_x[idx_bias+largest_atomic_number+k] = float(v) / float(N) else: super_node_x[idx_bias+k] = 1.0 super_node_x[idx_bias+largest_atomic_number+k] = float(v) / float(N) super_node_x = super_node_x.astype(numpy.float32) return super_node_x