"""Common preprocess method is gethered in this file"""
import numpy
from rdkit import Chem
from rdkit.Chem import rdmolops
# --- Type check ---
[docs]def type_check_num_atoms(mol, num_max_atoms=-1):
"""Check number of atoms in `mol` does not exceed `num_max_atoms`
If number of atoms in `mol` exceeds the number `num_max_atoms`, it will
raise `MolFeatureExtractionError` exception.
Args:
mol (Mol):
num_max_atoms (int): If negative value is set, not check number of
atoms.
"""
num_atoms = mol.GetNumAtoms()
if num_max_atoms >= 0 and num_atoms > num_max_atoms:
# Skip extracting feature. ignore this case.
raise MolFeatureExtractionError(
'Number of atoms in mol {} exceeds num_max_atoms {}'
.format(num_atoms, num_max_atoms))
# --- Atom preprocessing ---
[docs]def construct_atomic_number_array(mol, out_size=-1):
"""Returns atomic numbers of atoms consisting a molecule.
Args:
mol (rdkit.Chem.Mol): Input molecule.
out_size (int): The size of returned array.
If this option is negative, it does not take any effect.
Otherwise, it must be larger than the number of atoms
in the input molecules. In that case, the tail of
the array is padded with zeros.
Returns:
numpy.ndarray: an array consisting of atomic numbers
of atoms in the molecule.
"""
atom_list = [a.GetAtomicNum() for a in mol.GetAtoms()]
n_atom = len(atom_list)
if out_size < 0:
return numpy.array(atom_list, dtype=numpy.int32)
elif out_size >= n_atom:
# 'empty' padding for atom_list
# 0 represents empty place for atom
atom_array = numpy.zeros(out_size, dtype=numpy.int32)
atom_array[:n_atom] = numpy.array(atom_list, dtype=numpy.int32)
return atom_array
else:
raise ValueError('`out_size` (={}) must be negative or '
'larger than or equal to the number '
'of atoms in the input molecules (={})'
'.'.format(out_size, n_atom))
# --- Adjacency matrix preprocessing ---
[docs]def construct_adj_matrix(mol, out_size=-1, self_connection=True):
"""Returns the adjacent matrix of the given molecule.
This function returns the adjacent matrix of the given molecule.
Contrary to the specification of
:func:`rdkit.Chem.rdmolops.GetAdjacencyMatrix`,
The diagonal entries of the returned matrix are all-one.
Args:
mol (rdkit.Chem.Mol): Input molecule.
out_size (int): The size of the returned matrix.
If this option is negative, it does not take any effect.
Otherwise, it must be larger than the number of atoms
in the input molecules. In that case, the adjacent
matrix is expanded and zeros are padded to right
columns and bottom rows.
self_connection (bool): Add self connection or not.
If True, diagonal element of adjacency matrix is filled with 1.
Returns:
adj_array (numpy.ndarray): The adjacent matrix of the input molecule.
It is 2-dimensional array with shape (atoms1, atoms2), where
atoms1 & atoms2 represent from and to of the edge respectively.
If ``out_size`` is non-negative, the returned
its size is equal to that value. Otherwise,
it is equal to the number of atoms in the the molecule.
"""
adj = rdmolops.GetAdjacencyMatrix(mol)
s0, s1 = adj.shape
if s0 != s1:
raise ValueError('The adjacent matrix of the input molecule'
'has an invalid shape: ({}, {}). '
'It must be square.'.format(s0, s1))
if self_connection:
adj = adj + numpy.eye(s0)
if out_size < 0:
adj_array = adj.astype(numpy.float32)
elif out_size >= s0:
adj_array = numpy.zeros((out_size, out_size),
dtype=numpy.float32)
adj_array[:s0, :s1] = adj
else:
raise ValueError(
'`out_size` (={}) must be negative or larger than or equal to the '
'number of atoms in the input molecules (={}).'
.format(out_size, s0))
return adj_array
def construct_discrete_edge_matrix(mol, out_size=-1):
"""Returns the edge-type dependent adjacency matrix of the given molecule.
Args:
mol (rdkit.Chem.Mol): Input molecule.
out_size (int): The size of the returned matrix.
If this option is negative, it does not take any effect.
Otherwise, it must be larger than the number of atoms
in the input molecules. In that case, the adjacent
matrix is expanded and zeros are padded to right
columns and bottom rows.
Returns:
adj_array (numpy.ndarray): The adjacent matrix of the input molecule.
It is 3-dimensional array with shape (edge_type, atoms1, atoms2),
where edge_type represents the bond type,
atoms1 & atoms2 represent from and to of the edge respectively.
If ``out_size`` is non-negative, its size is equal to that value.
Otherwise, it is equal to the number of atoms in the the molecule.
"""
if mol is None:
raise MolFeatureExtractionError('mol is None')
N = mol.GetNumAtoms()
if out_size < 0:
size = N
elif out_size >= N:
size = out_size
else:
raise ValueError(
'out_size {} is smaller than number of atoms in mol {}'
.format(out_size, N))
adjs = numpy.zeros((4, size, size), dtype=numpy.float32)
bond_type_to_channel = {
Chem.BondType.SINGLE: 0,
Chem.BondType.DOUBLE: 1,
Chem.BondType.TRIPLE: 2,
Chem.BondType.AROMATIC: 3
}
for bond in mol.GetBonds():
bond_type = bond.GetBondType()
ch = bond_type_to_channel[bond_type]
i = bond.GetBeginAtomIdx()
j = bond.GetEndAtomIdx()
adjs[ch, i, j] = 1.0
adjs[ch, j, i] = 1.0
return adjs