Calculating molecular fingerprints using padelpy
A tutorial showing how to use padelpy to calculate molecular fingerprints and how to use those fingerprints to build a machine learning model.
- Install padelpy
- Prepare fingerprint XML
- Load HCV dataset
- Prepare data subset as input to PaDEL
- Calculate descriptors
- Display calculated fingerprints
- Build a Random Forest Model
Hi friends, I've just released a new YouTube video: How to build #machinelearning models for #drugdiscovery using #PaDELPy https://t.co/zGMcvaDdk1 #66daysofdata #100daysofcode #bioinformatics #cheminformatics #qsar #qspr #python pic.twitter.com/B7vDy1eFfz
— Data Professor (@thedataprof) July 5, 2021
# Notebook shell escapes: install padelpy with pip.
! pip install padelpy
# Download the fingerprints_xml.zip archive from the dataprofessor/padel GitHub repository.
! wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
# Extract the archive into the current working directory
# (presumably the fingerprint XML files that are globbed below -- confirm).
! unzip fingerprints_xml.zip
import glob
# Collect every XML file in the working directory, in alphabetical order.
xml_files = sorted(glob.glob("*.xml"))
xml_files

# Short fingerprint names. They are paired with xml_files purely by position,
# so this order must match the alphabetical order of the XML file names.
FP_list = [
    'AtomPairs2DCount',
    'AtomPairs2D',
    'EState',
    'CDKextended',
    'CDK',
    'CDKgraphonly',
    'KlekotaRothCount',
    'KlekotaRoth',
    'MACCS',
    'PubChem',
    'SubstructureCount',
    'Substructure',
]

# Map fingerprint name -> XML file path. zip() stops at the shorter input, so
# extra or missing XML files shift or truncate the mapping without any error.
fp = {fp_name: xml_path for fp_name, xml_path in zip(FP_list, xml_files)}
fp
# Look up the XML file paired with the 'AtomPairs2D' key (shown as notebook cell output).
fp['AtomPairs2D']
import pandas as pd
# Load the curated HCV NS5B CSV directly from the dataprofessor/data GitHub repository.
hcv_csv_url = 'https://raw.githubusercontent.com/dataprofessor/data/master/HCV_NS5B_Curated.csv'
df = pd.read_csv(hcv_csv_url)

# Peek at the first and last two rows.
df.head(n=2)
df.tail(n=2)
# Build the PaDEL input table: the SMILES column first, the ChEMBL ID column second.
df2 = df[['CANONICAL_SMILES', 'CMPD_CHEMBLID']].copy()

# Write it as a tab-separated file with no header row and no index column.
df2.to_csv('molecule.smi', sep='\t', header=False, index=False)
df2

# Show the fingerprint-name -> XML-file mapping and one example entry.
fp
fp['PubChem']
from padelpy import padeldescriptor
# The chosen fingerprint name selects the XML file from fp and names the output CSV.
fingerprint = 'Substructure'
fingerprint_output_file = f'{fingerprint}.csv'  # Substructure.csv
fingerprint_descriptortypes = fp[fingerprint]

# Run PaDEL on molecule.smi and write the result to fingerprint_output_file.
padeldescriptor(
    # Input / output files.
    mol_dir='molecule.smi',
    d_file=fingerprint_output_file,
    descriptortypes=fingerprint_descriptortypes,
    # Request fingerprints.
    fingerprints=True,
    # Structure pre-processing switches passed through to PaDEL.
    removesalt=True,
    detectaromaticity=True,
    standardizenitro=True,
    standardizetautomers=True,
    # Execution settings.
    threads=2,
    log=True,
)

# Read the CSV that PaDEL wrote back into a DataFrame.
descriptors = pd.read_csv(fingerprint_output_file)
descriptors
# Feature matrix: every column of the PaDEL output except 'Name'.
X = descriptors.drop(columns=['Name'])
# Target: the Activity column of the source dataset.
# NOTE(review): assumes the rows of `descriptors` are in the same order as `df` -- confirm.
y = df.loc[:, 'Activity']
from sklearn.feature_selection import VarianceThreshold
def remove_low_variance(input_data, threshold=0.1):
    """Return ``input_data`` restricted to the columns VarianceThreshold keeps.

    Args:
        input_data: DataFrame of features; it is not modified.
        threshold: Variance cut-off handed to ``VarianceThreshold``.

    Returns:
        A DataFrame holding only the columns selected by the fitted
        ``VarianceThreshold``, in their original order.
    """
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(input_data)
    # Turn the positions of the surviving features back into column labels.
    kept_positions = selector.get_support(indices=True)
    kept_columns = input_data.columns[kept_positions]
    return input_data[kept_columns]
from sklearn.model_selection import train_test_split

# Keep only the feature columns that pass the 0.1 variance threshold.
X = remove_low_variance(X, threshold=0.1)
X

# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef
# Fit a 500-tree random forest with a fixed seed on the training split.
model = RandomForestClassifier(n_estimators=500, random_state=42)
model.fit(X_train, y_train)

# Predict on both splits with the fitted model.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)

# Matthews correlation coefficient on the training data.
mcc_train = matthews_corrcoef(y_train, y_train_pred)
mcc_train
# Matthews correlation coefficient on the held-out test data.
mcc_test = matthews_corrcoef(y_test, y_test_pred)
mcc_test
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of a fresh random forest on the training split.
# Without an explicit `scoring`, cross_val_score uses the classifier's default
# score (accuracy), but the result is reported below as an MCC value. Score
# each fold with the Matthews correlation coefficient so that mcc_cv is
# comparable to mcc_train and mcc_test.
from sklearn.metrics import make_scorer, matthews_corrcoef

rf = RandomForestClassifier(n_estimators=500, random_state=42)
cv_scores = cross_val_score(
    rf,
    X_train,
    y_train,
    cv=5,
    scoring=make_scorer(matthews_corrcoef),
)
cv_scores
# Mean MCC across the five folds.
mcc_cv = cv_scores.mean()
mcc_cv
# One-row summary table: model name plus its train / CV / test scores.
model_name = pd.Series(['Random forest'], name='Name')
mcc_train_series, mcc_cv_series, mcc_test_series = (
    pd.Series(score, name=column)
    for column, score in (
        ('MCC_train', mcc_train),
        ('MCC_cv', mcc_cv),
        ('MCC_test', mcc_test),
    )
)
# Each Series becomes one column of the table.
metric_columns = [model_name, mcc_train_series, mcc_cv_series, mcc_test_series]
performance_metrics = pd.concat(metric_columns, axis=1)
performance_metrics