# Source code for chainer_chemistry.datasets.tox21

from logging import getLogger
import os
import shutil
import zipfile

from chainer.dataset import download
import numpy

from chainer_chemistry.dataset.parsers.sdf_file_parser import SDFFileParser
from chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor  # NOQA


# Per-split download settings.  All three splits follow the same pattern:
# the download id is the file stem followed by 'sdf', and the SDF file name
# (used both for the archive member and for the cached file) is the stem
# followed by '.sdf'.
_config = {
    split: {
        'url': ('https://tripod.nih.gov/tox21/challenge/download?id='
                + stem + 'sdf'),
        'filename': stem + '.sdf',
    }
    for split, stem in (
        ('train', 'tox21_10k_data_all'),
        ('val', 'tox21_10k_challenge_test'),
        ('test', 'tox21_10k_challenge_score'),
    )
}

# Name handed to `download.get_dataset_directory` to locate the cache folder.
_root = 'pfnet/chainer/tox21'

# Label names used by default when the caller does not choose any.
_label_names = [
    'NR-AR',
    'NR-AR-LBD',
    'NR-AhR',
    'NR-Aromatase',
    'NR-ER',
    'NR-ER-LBD',
    'NR-PPAR-gamma',
    'SR-ARE',
    'SR-ATAD5',
    'SR-HSE',
    'SR-MMP',
    'SR-p53',
]


def get_tox21_label_names():
    """Returns label names of Tox21 datasets.

    Returns (list): A new list holding the Tox21 label names. A copy is
        returned so that a caller mutating the result cannot alter the
        module-level defaults.

    """
    return list(_label_names)


def get_tox21(preprocessor=None, labels=None, return_smiles=False,
              train_target_index=None, val_target_index=None,
              test_target_index=None):
    """Downloads, caches and preprocesses Tox21 dataset.

    Args:
        preprocessor (BasePreprocessor): Preprocessor.
            This should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): Target label name, or list of target label
            names. If it is None or empty, the names returned by
            `get_tox21_label_names` are used.
        return_smiles (bool): If set to True, smiles array is also returned.
        train_target_index (list or None): target index list to partially
            extract train dataset. If None (default), all examples are parsed.
        val_target_index (list or None): target index list to partially
            extract val dataset. If None (default), all examples are parsed.
        test_target_index (list or None): target index list to partially
            extract test dataset. If None (default), all examples are parsed.

    Returns:
        If `return_smiles` is False, the 3-tuple ``(train, val, test)``
        consisting of train, validation and test datasets, respectively.
        If `return_smiles` is True, the 6-tuple
        ``(train, val, test, train_smiles, val_smiles, test_smiles)``.
        The content of each dataset depends on `preprocessor`.
    """
    labels = labels or get_tox21_label_names()
    if isinstance(labels, str):
        # A single label name is accepted as shorthand for a one-item list.
        labels = [labels, ]

    def postprocess_label(label_list):
        # Set -1 to the place where the label is not found,
        # this corresponds to not calculate loss with `sigmoid_cross_entropy`
        t = numpy.array([-1 if label is None else label for label in
                         label_list], dtype=numpy.int32)
        return t

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()
    parser = SDFFileParser(preprocessor,
                           postprocess_label=postprocess_label,
                           labels=labels)

    # Parse the splits in a fixed train / val / test order; each file is
    # fetched on demand by `get_tox21_filepath`.
    split_target_indices = (
        ('train', train_target_index),
        ('val', val_target_index),
        ('test', test_target_index),
    )
    results = [
        parser.parse(get_tox21_filepath(dataset_type),
                     return_smiles=return_smiles,
                     target_index=target_index)
        for dataset_type, target_index in split_target_indices
    ]

    datasets = tuple(result['dataset'] for result in results)
    if return_smiles:
        return datasets + tuple(result['smiles'] for result in results)
    return datasets


def _get_tox21_filepath(dataset_type):
    """Builds the cache file path of one tox21 dataset split.

    The path is composed from the dataset directory of `_root` and the
    file name configured for `dataset_type`. Whether the file has actually
    been downloaded is not checked here.

    Args:
        dataset_type(str): Name of the target dataset type.
            Either 'train', 'val', or 'test'.

    Returns (str): file path for the tox21 dataset

    Raises:
        ValueError: If `dataset_type` is not a known dataset type.

    """
    if dataset_type not in _config:
        message = ("Invalid dataset type '{}'. Accepted values are "
                   "'train', 'val' or 'test'.").format(dataset_type)
        raise ValueError(message)

    cache_dir = download.get_dataset_directory(_root)
    return os.path.join(cache_dir, _config[dataset_type]['filename'])


def get_tox21_filepath(dataset_type, download_if_not_exist=True):
    """Returns the cache file path of a tox21 dataset split.

    The returned path is where `dataset_type` of the tox21 dataset is,
    or will be, cached.

    When no file exists at that path and ``download_if_not_exist`` is
    ``True``, the dataset is downloaded first. A warning is logged if the
    download helper reports failure; the path is returned in every case.

    Args:
        dataset_type: Name of the target dataset type.
            Either 'train', 'val', or 'test'
        download_if_not_exist (bool): If `True` download dataset
            if it is not downloaded yet.

    Returns (str): file path for tox21 dataset

    """
    filepath = _get_tox21_filepath(dataset_type)

    # Nothing to do when the file is already cached or downloading is off.
    if os.path.exists(filepath) or not download_if_not_exist:
        return filepath

    if not _download_and_extract_tox21(dataset_type, filepath):
        getLogger(__name__).warning('Download failed.')
    return filepath


def _download_and_extract_tox21(config_name, save_filepath):
    """Downloads one Tox21 archive and stores its SDF file.

    The archive configured for `config_name` is obtained through
    `download.cached_download`, and the SDF member named in the
    configuration is written to `save_filepath`.

    Args:
        config_name (str): Key of `_config`; 'train', 'val' or 'test'.
        save_filepath (str): Destination path of the extracted SDF file.

    Returns (bool): Always `True`. Failures are not reported through the
        return value; the exception raised by the download or by the
        extraction propagates to the caller.

    """
    c = _config[config_name]
    url = c['url']
    sdffile = c['filename']

    # Download tox21 dataset
    download_file_path = download.cached_download(url)

    # Stream the member into a temporary file next to the destination
    # instead of extracting into the current working directory: the working
    # directory may be read-only or may already hold a file of that name.
    partial_filepath = save_filepath + '.part'
    try:
        with zipfile.ZipFile(download_file_path, 'r') as z:
            with z.open(sdffile) as src, open(partial_filepath, 'wb') as dst:
                shutil.copyfileobj(src, dst)
        # Only a completely written file is moved to the final path, so an
        # interrupted extraction never looks like a valid cached dataset.
        shutil.move(partial_filepath, save_filepath)
    finally:
        # Still present only if something above failed.
        if os.path.exists(partial_filepath):
            os.remove(partial_filepath)

    return True


def download_and_extract_tox21():
    """Downloads and extracts every split of the Tox21 dataset.

    The 'train', 'val' and 'test' splits are processed in that order, each
    being stored at its cache file path. The download helper is called for
    every split without checking whether a cached file already exists.

    Returns: None

    """
    for dataset_type in ('train', 'val', 'test'):
        destination = _get_tox21_filepath(dataset_type)
        _download_and_extract_tox21(dataset_type, destination)