# Source code for chainer_chemistry.dataset.parsers.smiles_parser

import pandas

from chainer_chemistry.dataset.parsers.data_frame_parser import DataFrameParser


class SmilesParser(DataFrameParser):
    """SMILES parser.

    It parses `smiles_list`, which is a list of SMILES strings.

    Args:
        preprocessor (BasePreprocessor): preprocessor instance
        postprocess_label (Callable): post processing function if necessary
        postprocess_fn (Callable): post processing function if necessary
        logger: logger object forwarded unchanged to `DataFrameParser`

    """

    def __init__(self, preprocessor, postprocess_label=None,
                 postprocess_fn=None, logger=None):
        # The input is a bare list of SMILES strings, so no label column is
        # configured (`labels=None`) and the SMILES column name is fixed to
        # 'smiles', matching the DataFrame built in `parse`.
        super(SmilesParser, self).__init__(
            preprocessor, labels=None, smiles_col='smiles',
            postprocess_label=postprocess_label,
            postprocess_fn=postprocess_fn, logger=logger)

    def parse(self, smiles_list, return_smiles=False, target_index=None,
              return_is_successful=False):
        """Parse `smiles_list` using `preprocessor`.

        The SMILES strings are wrapped in a single-column
        `pandas.DataFrame` (column name `smiles`) and the actual parsing is
        delegated to `DataFrameParser.parse`. This parser passes
        `labels=None` to the base class, so no label column is configured.

        Args:
            smiles_list (list): list of strings of smiles
            return_smiles (bool): If set to `True`, this function returns
                preprocessed dataset and smiles list.
                If set to `False`, this function returns preprocessed
                dataset and `None`.
            target_index (list or None): target index list to partially
                extract dataset. If `None` (default), all examples are
                parsed.
            return_is_successful (bool): If set to `True`, boolean list is
                returned in the key 'is_successful'. It represents
                preprocessing has succeeded or not for each SMILES.
                If set to `False`, `None` is returned in the key
                'is_successful'.

        Returns (dict): dictionary that contains Dataset, 1-d numpy array
            with dtype=object(string) which is a vector of smiles for each
            example or `None`.

        """
        # The column name must match the `smiles_col` given in `__init__`.
        df = pandas.DataFrame({'smiles': smiles_list})
        return super(SmilesParser, self).parse(
            df, return_smiles=return_smiles, target_index=target_index,
            return_is_successful=return_is_successful)

    def extract_total_num(self, smiles_list):
        """Extract the total number of data which can be parsed.

        We can use this method to determine the value fed to the
        `target_index` option of the `parse` method. For example, if we
        want to extract input features from 10% of the whole dataset, we
        need to know how many samples are in `smiles_list`.
        The returned value of this method may not be the same as the final
        dataset size.

        Args:
            smiles_list (list): list of strings of smiles

        Returns (int): total number of data that can be parsed, i.e.
            `len(smiles_list)`.

        """
        return len(smiles_list)