Install padelpy

# Install the padelpy package with pip (IPython shell escape).
! pip install padelpy
Collecting padelpy
  Downloading https://files.pythonhosted.org/packages/4d/40/57cd08bdbb1e6f2a7f25f7495fe3b4009ee453b401fe1ba457df92bf78c6/padelpy-0.1.9-py2.py3-none-any.whl (20.9MB)
     |████████████████████████████████| 20.9MB 1.4MB/s 
Installing collected packages: padelpy
Successfully installed padelpy-0.1.9

Prepare fingerprint XML

Download fingerprint XML files

# Fetch the zipped fingerprint descriptor XML files and unpack them into the
# current working directory (IPython shell escapes).
! wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
! unzip fingerprints_xml.zip
--2021-07-04 15:38:39--  https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip [following]
--2021-07-04 15:38:39--  https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10871 (11K) [application/zip]
Saving to: ‘fingerprints_xml.zip’

fingerprints_xml.zi 100%[===================>]  10.62K  --.-KB/s    in 0s      

2021-07-04 15:38:39 (105 MB/s) - ‘fingerprints_xml.zip’ saved [10871/10871]

Archive:  fingerprints_xml.zip
  inflating: AtomPairs2DFingerprintCount.xml  
  inflating: AtomPairs2DFingerprinter.xml  
  inflating: EStateFingerprinter.xml  
  inflating: ExtendedFingerprinter.xml  
  inflating: Fingerprinter.xml       
  inflating: GraphOnlyFingerprinter.xml  
  inflating: KlekotaRothFingerprintCount.xml  
  inflating: KlekotaRothFingerprinter.xml  
  inflating: MACCSFingerprinter.xml  
  inflating: PubchemFingerprinter.xml  
  inflating: SubstructureFingerprintCount.xml  
  inflating: SubstructureFingerprinter.xml  

List and sort fingerprint XML files

import glob

# Every *.xml file in the working directory, in ascending name order.
xml_files = sorted(glob.glob("*.xml"))
xml_files
['AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter.xml',
 'ExtendedFingerprinter.xml',
 'Fingerprinter.xml',
 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter.xml',
 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter.xml']
# Short fingerprint names, listed in the same order as the sorted XML file
# names so that the two lists can be paired position by position.
FP_list = [
    'AtomPairs2DCount',
    'AtomPairs2D',
    'EState',
    'CDKextended',
    'CDK',
    'CDKgraphonly',
    'KlekotaRothCount',
    'KlekotaRoth',
    'MACCS',
    'PubChem',
    'SubstructureCount',
    'Substructure',
]

Create a dictionary

# Map each short fingerprint name to its descriptor XML file.
# zip() stops silently at the shorter input and the pairing is purely
# positional, so a missing or stray *.xml file in the working directory would
# produce a wrong mapping; refuse to continue when the counts disagree.
if len(FP_list) != len(xml_files):
    raise ValueError(
        'Expected %d fingerprint XML files, found %d: %r'
        % (len(FP_list), len(xml_files), xml_files)
    )

fp = dict(zip(FP_list, xml_files))
fp
{'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'Substructure': 'SubstructureFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml'}
# Example lookup: XML file name registered for the 'AtomPairs2D' fingerprint.
fp['AtomPairs2D']
'AtomPairs2DFingerprinter.xml'

Load HCV dataset

import pandas as pd

# Read the curated HCV NS5B dataset straight from its raw GitHub URL and
# preview the first two rows.
df = pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/master/HCV_NS5B_Curated.csv')
df.head(2)
CMPD_CHEMBLID CANONICAL_SMILES STANDARD_TYPE RELATION STANDARD_VALUE STANDARD_UNITS pIC50 PROTEIN_ACCESSION PREF_NAME DOC_CHEMBLID ASSAY_SRC_DESCRIPTION DESCRIPTION PUBMED_ID JOURNAL YEAR VOLUME ISSUE FIRST_PAGE MOLWEIGHT ALOGP PSA NUM_RO5_VIOLATIONS Activity
0 CHEMBL179256 OC(=O)c1ccc2c(c1)nc(c3ccc(O)cc3F)n2C4CCCCC4 IC50 = 1.4 nM 8.853872 Q8JXU8 Hepatitis C virus NS5B RNA-dependent RNA polym... CHEMBL1142688 Scientific Literature Inhibitory concentration against RNA dependent... 15743173.0 J. Med. Chem. 2005 48.0 5.0 1314.0 354.37 4.93 75.35 0 Active
1 CHEMBL204350 CC(C)(C)CCN1[C@H](C(=C(C1=O)C2=NS(=O)(=O)c3ccc... IC50 = 1.7 nM 8.769551 Q8JXU8 Hepatitis C virus NS5B RNA-dependent RNA polym... CHEMBL1146957 Scientific Literature Inhibition of HCV NS5B RNA dependent RNA polym... 16455253.0 Bioorg. Med. Chem. Lett. 2006 16.0 8.0 2205.0 419.54 2.37 107.45 0 Active
# Preview the last two rows of the dataset.
df.tail(2)
CMPD_CHEMBLID CANONICAL_SMILES STANDARD_TYPE RELATION STANDARD_VALUE STANDARD_UNITS pIC50 PROTEIN_ACCESSION PREF_NAME DOC_CHEMBLID ASSAY_SRC_DESCRIPTION DESCRIPTION PUBMED_ID JOURNAL YEAR VOLUME ISSUE FIRST_PAGE MOLWEIGHT ALOGP PSA NUM_RO5_VIOLATIONS Activity
576 CHEMBL175454 OC(=O)c1ccc2c(c1)nc(c3ccccn3)n2c4ccccc4 IC50 = 360000.0 nM 3.443697 Q8JXU8 Hepatitis C virus NS5B RNA-dependent RNA polym... CHEMBL1149223 Scientific Literature Inhibitory activity against NS5B polymerase of... 14684311.0 Bioorg. Med. Chem. Lett. 2004 14.0 1.0 119.0 315.33 3.96 68.01 0 Inactive
577 CHEMBL369640 CC(C)n1c(nc2cc(ccc12)C(=O)O)c3ccccn3 IC50 = 408000.0 nM 3.389340 Q8JXU8 Hepatitis C virus NS5B RNA-dependent RNA polym... CHEMBL1149223 Scientific Literature Inhibitory activity against NS5B polymerase of... 14684311.0 Bioorg. Med. Chem. Lett. 2004 14.0 1.0 119.0 281.31 3.11 68.01 0 Inactive

Prepare data subset as input to PaDEL

# Keep only the SMILES string and the compound ID, in that order, and write
# them to a headerless, index-free, tab-separated file for PaDEL to read.
df2 = df[['CANONICAL_SMILES', 'CMPD_CHEMBLID']]
df2.to_csv('molecule.smi', sep='\t', header=False, index=False)
df2
CANONICAL_SMILES CMPD_CHEMBLID
0 OC(=O)c1ccc2c(c1)nc(c3ccc(O)cc3F)n2C4CCCCC4 CHEMBL179256
1 CC(C)(C)CCN1[C@H](C(=C(C1=O)C2=NS(=O)(=O)c3ccc... CHEMBL204350
2 OC(=O)c1ccc2c(c1)nc(c3ccc(O)cc3)n2C4CCCCC4 CHEMBL179257
3 OC(=O)c1ccc2c(C3CCCCC3)c([nH]c2c1)c4ccc(O)cc4 CHEMBL178784
4 CN(C)C(=O)Cn1c(c2ccc(OCc3ccccc3)cc2)c(C4CCCCC4... CHEMBL369319
... ... ...
573 CCC(CC)n1c(nc2cc(ccc12)C(=O)O)c3ccccn3 CHEMBL175762
574 Cc1sc(cc1\C(=C\C(=O)C(=O)O)\O)c2ccccc2 CHEMBL197882
575 OC(=O)c1ccc2c(c1)ncn2C3CCCCC3 CHEMBL177122
576 OC(=O)c1ccc2c(c1)nc(c3ccccn3)n2c4ccccc4 CHEMBL175454
577 CC(C)n1c(nc2cc(ccc12)C(=O)O)c3ccccn3 CHEMBL369640

578 rows × 2 columns

Calculate descriptors

PaDEL provides 12 fingerprint types. To calculate a different one, set the fingerprint variable below to any key of the fp dictionary shown above (e.g. SubstructureCount); the matching XML file (e.g. SubstructureFingerprintCount.xml) is then passed as the descriptortypes argument. Repeat this for each key to calculate all 12.

# Show the fingerprint-name -> XML-file mapping again for reference.
fp
{'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'Substructure': 'SubstructureFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml'}
# Example lookup: XML file name registered for the 'PubChem' fingerprint.
fp['PubChem']
'PubchemFingerprinter.xml'
from padelpy import padeldescriptor

# Pick one key of the `fp` dictionary; it selects the descriptor XML file and
# also names the output CSV.
fingerprint = 'Substructure'

fingerprint_descriptortypes = fp[fingerprint]
fingerprint_output_file = fingerprint + '.csv'  # e.g. Substructure.csv

# Run PaDEL on the SMILES file written earlier; results go to the CSV above.
padeldescriptor(
    mol_dir='molecule.smi',
    d_file=fingerprint_output_file,
    descriptortypes=fingerprint_descriptortypes,
    detectaromaticity=True,
    standardizenitro=True,
    standardizetautomers=True,
    removesalt=True,
    threads=2,
    log=True,
    fingerprints=True,
)

Display calculated fingerprints

# Load the CSV written by padeldescriptor; its 'Name' column holds the
# compound IDs and the remaining columns hold the fingerprint values.
descriptors = pd.read_csv(fingerprint_output_file)
descriptors
Name SubFP1 SubFP2 SubFP3 SubFP4 SubFP5 SubFP6 SubFP7 SubFP8 SubFP9 SubFP10 SubFP11 SubFP12 SubFP13 SubFP14 SubFP15 SubFP16 SubFP17 SubFP18 SubFP19 SubFP20 SubFP21 SubFP22 SubFP23 SubFP24 SubFP25 SubFP26 SubFP27 SubFP28 SubFP29 SubFP30 SubFP31 SubFP32 SubFP33 SubFP34 SubFP35 SubFP36 SubFP37 SubFP38 SubFP39 ... SubFP268 SubFP269 SubFP270 SubFP271 SubFP272 SubFP273 SubFP274 SubFP275 SubFP276 SubFP277 SubFP278 SubFP279 SubFP280 SubFP281 SubFP282 SubFP283 SubFP284 SubFP285 SubFP286 SubFP287 SubFP288 SubFP289 SubFP290 SubFP291 SubFP292 SubFP293 SubFP294 SubFP295 SubFP296 SubFP297 SubFP298 SubFP299 SubFP300 SubFP301 SubFP302 SubFP303 SubFP304 SubFP305 SubFP306 SubFP307
0 CHEMBL204350 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 1 0 0 0 1
1 CHEMBL179256 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1
2 CHEMBL179257 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1
3 CHEMBL178784 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1
4 CHEMBL369319 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
573 CHEMBL178067 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1
574 CHEMBL197882 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1
575 CHEMBL177122 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1
576 CHEMBL175454 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1
577 CHEMBL369640 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1

578 rows × 308 columns

Build a Random Forest Model

# The descriptor table is NOT in the same row order as `df`: its first row is
# CHEMBL204350 whereas df's first row is CHEMBL179256 (presumably because
# PaDEL was run with threads=2 -- TODO confirm).  Taking df['Activity']
# positionally would therefore pair fingerprints with the wrong labels, so
# each label is looked up by compound ID instead.
# drop_duplicates keeps the first record should an ID appear more than once.
activity_by_id = (
    df.drop_duplicates(subset='CMPD_CHEMBLID')
      .set_index('CMPD_CHEMBLID')['Activity']
)

X = descriptors.drop('Name', axis=1)
# y shares X's row index, so X and y stay aligned through later splitting.
y = descriptors['Name'].map(activity_by_id).rename('Activity')

# Fail loudly rather than train on unlabeled rows.
if y.isna().any():
    raise ValueError(
        'Some descriptor rows have no Activity label for their compound ID'
    )

Remove low variance features

from sklearn.feature_selection import VarianceThreshold

def remove_low_variance(input_data, threshold=0.1):
    """Return the columns of *input_data* retained by a VarianceThreshold.

    A ``VarianceThreshold`` selector built with *threshold* is fitted on
    *input_data*; the result is *input_data* restricted to the columns the
    fitted selector reports as supported, in their original order.
    """
    selector = VarianceThreshold(threshold)
    selector.fit(input_data)
    kept_positions = selector.get_support(indices=True)
    kept_columns = input_data.columns[kept_positions]
    return input_data[kept_columns]

# Replace X with the subset of fingerprint columns kept by the variance
# filter (threshold 0.1), then display it.
X = remove_low_variance(X, threshold=0.1)
X
SubFP1 SubFP2 SubFP3 SubFP18 SubFP49 SubFP84 SubFP101 SubFP109 SubFP135 SubFP137 SubFP171 SubFP172 SubFP180 SubFP181 SubFP183 SubFP184 SubFP214 SubFP275
0 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1
1 0 1 0 0 0 1 0 0 0 1 0 1 1 1 0 1 0 1
2 0 1 0 0 0 1 0 0 0 1 0 0 1 1 0 1 0 1
3 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1
4 0 1 1 1 0 1 1 0 0 1 0 0 1 0 0 1 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
573 1 1 0 0 0 1 0 0 0 1 0 0 1 1 0 1 0 1
574 1 1 0 0 1 1 0 0 1 1 0 0 0 0 1 1 0 1
575 0 1 0 0 0 1 0 0 0 1 0 0 1 1 0 1 0 1
576 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 1 0 1
577 1 0 0 0 0 1 0 0 0 1 0 0 1 1 0 1 0 1

578 rows × 18 columns

Data splitting

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state fixes the shuffle so
# the split is reproducible.  The second line shows the resulting shapes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape
((462, 18), (116, 18))

Model building

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef

# Build a 500-tree random forest with a fixed seed and train it on the
# training split; fit() returns the estimator itself, so the call can be
# chained.  The final line displays the fitted estimator.
model = RandomForestClassifier(n_estimators=500, random_state=42).fit(X_train, y_train)
model
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

Apply model to make prediction

# Predict class labels for the training rows and for the held-out test rows.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)

Calculate model performance metrics

# Matthews correlation coefficient on the training split.
mcc_train = matthews_corrcoef(y_true=y_train, y_pred=y_train_pred)
mcc_train
0.8328828266116973
# Matthews correlation coefficient on the held-out test split.
mcc_test = matthews_corrcoef(y_true=y_test, y_pred=y_test_pred)
mcc_test
0.5219385334290642

Cross-validation

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, matthews_corrcoef

# Without a `scoring` argument cross_val_score falls back to the estimator's
# own score method, which is accuracy for a classifier.  The result is
# reported as an MCC, so score each fold with an explicit MCC scorer to keep
# it comparable with the train and test MCC values.
rf = RandomForestClassifier(n_estimators=500, random_state=42)
cv_scores = cross_val_score(
    rf,
    X_train,
    y_train,
    cv=5,
    scoring=make_scorer(matthews_corrcoef),
)
cv_scores
array([0.86021505, 0.86021505, 0.88043478, 0.83695652, 0.79347826])
# Average the per-fold cross-validation scores into a single number.
# NOTE(review): the name says MCC, which only holds if cv_scores was computed
# with an MCC scorer; cross_val_score's default for classifiers is accuracy.
mcc_cv = cv_scores.mean()
mcc_cv
0.8462599345488545
# One single-row Series per table column: the model label plus the three
# scores computed above.
model_name = pd.Series(['Random forest'], name='Name')
mcc_train_series, mcc_cv_series, mcc_test_series = (
    pd.Series(score, name=column)
    for column, score in [
        ('MCC_train', mcc_train),
        ('MCC_cv', mcc_cv),
        ('MCC_test', mcc_test),
    ]
)

# Place the Series side by side to form a one-row summary table.
performance_metrics = pd.concat(
    [model_name, mcc_train_series, mcc_cv_series, mcc_test_series],
    axis=1,
)
performance_metrics
Name MCC_train MCC_cv MCC_test
0 Random forest 0.832883 0.84626 0.521939