Below I construct a decision-tree model and a random-forest model to help detect a condition with an incidence of less than 0.173% (roughly 173 in 100,000 patients). The models are built from the anonymized records of more than 280,000 patients, with 30 anonymized variables per patient. Each patient is classified as either having the condition ('Class'=1) or not having it ('Class'=0).
Author: Leonardo Espin
Date: August 8, 2019
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 35)
#the lines below get rid of the plotting library warnings
import warnings
warnings.filterwarnings("ignore")
raw=pd.read_csv('data_challenge_dataset.csv')
print(raw.shape)
print(raw.dtypes)
raw.head()
raw.describe()
print(raw.isna().sum())
print('Percentage of missing data in each column: {:.1f}%'.format(100*raw.V1.isna().sum()/len(raw)))
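As a quick sanity check of the per-column counts printed above, the snippet below (a minimal check on the same raw DataFrame) tallies how many columns share each missing-value count; a single entry would mean every column has an identical number of missing values:
#how many columns share each distinct count of missing values
print(raw.isna().sum().value_counts())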
Therefore, all the columns have exactly the same amount of missing data. Next, looking at the missing data across rows:
naInRows=raw.isna().sum(axis=1)
print('Percentage of rows with missing data: {:.1f}%'.format(100*len(raw[raw.isna().sum(axis=1)>0])/len(raw)))
plt.figure(figsize=(16,4))
naInRows.plot()
plt.title('Missing data for each row', fontsize=14)
plt.xlabel('Row number', fontsize=14)
plt.ylabel('Number of missing', fontsize=14);
raw.Class.value_counts()
#Here I'm creating a list of features, and I separate the target variable
features=list(raw.columns)
target=features.pop()
from sklearn.model_selection import train_test_split
seed=491
train,test = train_test_split(raw,test_size=0.3,random_state=seed,stratify=raw[target])
#checking that the proportions have been maintained
print(100*train[target].value_counts()/len(train))
print(100*test[target].value_counts()/len(test))
plt.figure(figsize=(16,25))
for i in range(len(features)):
plt.subplot(8, 4, i+1)
ax=sns.distplot(train[features[i]].dropna(),kde=False,rug=False)#NaN causes problems
ax.set_yscale('log')
def segregatedPlot(df,col,grouping='Class'):
'''conveniently making segregated plots'''
#plt.figure(figsize=(9,6))
sns.distplot(df.groupby(grouping)[col].get_group(0).dropna(),kde=False,rug=False, label=grouping.capitalize()+'=0');
ax=sns.distplot(df.groupby(grouping)[col].get_group(1).dropna(),kde=False,rug=False, label=grouping.capitalize()+'=1');
ax.set_yscale('log')
plt.legend();
plt.figure(figsize=(16,30))
for i in range(len(features)):
plt.subplot(10, 3, i+1)
segregatedPlot(train,features[i])
The plots above are very illuminating with respect to which variables might have more predictive power. Some of the variables show different distributions depending on the class that a given row belongs to. I can see six different patterns in the plots above:
Because of the apparent importance of tail values in separating the two classes, I am not inclined to use classification methods that are sensitive to feature scaling (such as logistic regression). So I will first try decision trees and a random forest, and see how well they perform on the test data.
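To back the visual impression with numbers, a quick class-stratified look at the upper tails can be computed along these lines (a rough sketch; the 0.99 quantile is an arbitrary cut-off chosen for illustration):
#compare an upper quantile of each feature between the two classes
tail_q = train.groupby(target)[features].quantile(0.99).T
tail_q.columns = ['Class=0', 'Class=1']
tail_q.head(10)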
corr = train[features].corr()
plt.figure(figsize=(10,10))
#diverging colormap
cmap = sns.diverging_palette(220,10,as_cmap=True)
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)
#triu_indices_from() returns the indices for the upper-triangle of arr
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask=mask, cmap=cmap, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5});
Given the seemingly uniform distribution of the missing data, and the variable distributions shown above, I use simple mean imputation to fill in the missing values.
from sklearn import tree
bench = tree.DecisionTreeClassifier(random_state=seed)
#simple imputation:
train_f=train.fillna(train.mean())
bench.fit(train_f[features],train_f[target])
print('Maximum depth = {}'.format(bench.tree_.max_depth))
print('Default score = {:.3f}'.format(bench.score(train_f[features],train_f[target])))
print('Default feature importances (sorted by importance):')
tups=[(features[_],bench.feature_importances_[_]) for _ in range(len(features))]
tups.sort(key=lambda x:x[1], reverse=True)
for tup in tups:
print('{}, importance = {:.4f}'.format(tup[0],tup[1]))
from sklearn.model_selection import StratifiedKFold, GridSearchCV
benchCV = StratifiedKFold(n_splits=6,random_state=seed)
param_test = {
'max_depth':[2,5,20,30]}#[_ for _ in range(2,30)] produces 5
dptSearch = GridSearchCV(
estimator=tree.DecisionTreeClassifier(random_state=seed),
param_grid = param_test, scoring='roc_auc',n_jobs=4,
iid=False, cv=benchCV,return_train_score=True)
dptSearch.fit(train_f[features],train_f[target])
dptSearch.best_params_, dptSearch.best_score_
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
bench = tree.DecisionTreeClassifier(random_state=seed,max_depth=5)
bench.fit(train_f[features],train_f[target])
#simple imputation of test dataset:
test_f=test.fillna(test.mean())
print('Decision-tree ROC-AUC score = {:.3f}'.format(roc_auc_score(test_f[target],bench.predict_proba(test_f[features])[:,1])))
print('Feature importances (sorted by importance):')
tups=[(features[_],bench.feature_importances_[_]) for _ in range(len(features))]
tups.sort(key=lambda x:x[1], reverse=True)
for tup in tups:
print('{}, importance = {:.4f}'.format(tup[0],tup[1]))
#the confusion matrix for the prediction is below
print('\tTn\tFp\n\tFn\tTp')
confusion_matrix(test_f[target],bench.predict(test_f[features]))
I use SelectFromModel, below, to select features for the model based on importance weights. The threshold for selection calculated automatically by SelectFromModel is 0.034483, and the features selected automatically are V10, V11, V12, V14, V16 and V17.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel #for selecting features for the model
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100,random_state=seed,n_jobs=4),
threshold=0.03)
sel.fit(train_f[features], train_f[target])
selected=[features[i] for i in range(len(features)) if sel.get_support()[i]]
print('The selected variables for my random forest model are: ')
for _ in selected:
print(_)
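For reference, the automatically calculated threshold quoted above is just the mean of the forest's feature importances (SelectFromModel falls back to the mean when no threshold is supplied); it can be recovered from the fitted selector:
#mean importance of the forest fitted inside SelectFromModel,
#i.e. the threshold it would use by default
print('Default threshold = {:.6f}'.format(sel.estimator_.feature_importances_.mean()))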
param_test2 = {
'n_estimators':[2,50,76,100]}#[_ for _ in range(2,100)] selects 76
forestSearch = GridSearchCV(
estimator=RandomForestClassifier(random_state=seed),
param_grid = param_test2, scoring='roc_auc',n_jobs=4,
iid=False, cv=benchCV,return_train_score=True)
forestSearch.fit(train_f[selected], train_f[target])
forestSearch.best_params_, forestSearch.best_score_
#confusion matrix for the best forest model
print('\tTn\tFp\n\tFn\tTp')
confusion_matrix(test_f[target],forestSearch.predict(test_f[selected]))
from sklearn.metrics import auc #will use below, when plotting the curves
from roc_curve import get_roc_curve #I put this code in a separate file for clarity
#making an array of thresholds
thresholds=np.linspace(0,1,15)
plt.figure(figsize=(10,7))
tprate=[]
fprate=[]
aucs=[]
counter=0
for tr_fold, test_fold in benchCV.split(train_f[selected],train_f[target]):
bench.fit(train_f[selected].iloc[tr_fold],train_f[target].iloc[tr_fold])
probs=bench.predict_proba(train_f[selected].iloc[test_fold])[:,1]#prob of class 1
tpr,fpr=get_roc_curve(train_f[target].iloc[test_fold].values,probs,thresholds)
tprate.append(tpr)
fprate.append(fpr)
auc_fold=auc(fpr, tpr)
aucs.append(auc_fold)
plt.plot(fpr,tpr,lw=1,alpha=0.3,
label='Fold {}, AUC={:.3f}'.format(counter,auc_fold))
#plt.plot(fpr[index],tpr[index],'r+');
counter+=1
#chance curve
plt.plot([0, 1], [0, 1],linestyle='--',lw=1, alpha=.8,
color='r',label='Chance')
mean_tpr = np.mean(tprate, axis=0)
mean_fpr = np.mean(fprate, axis=0)
mean_auc = np.mean(aucs)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr,lw=2, color='b',
label=r'Mean ROC (AUC={:.3f} $\pm$ {:.3f})'.format(mean_auc, std_auc))
#the cross shows the value corresponding to a standard threshold of 0.5
index=np.where(thresholds ==0.5)[0][0]#assuming that 0.5 is present in thresholds
plt.plot(mean_fpr[index],mean_tpr[index],'r+')
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.title('ROC decision-tree model', fontsize=18)
plt.legend(loc="lower right", prop={'size': 12});
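For completeness, get_roc_curve simply sweeps the supplied threshold array and returns the true-positive and false-positive rates at each threshold. The contents of roc_curve.py are not shown in this notebook; a minimal sketch of what it could look like (my reconstruction, the actual file may differ) is:
def get_roc_curve(y_true, probs, thresholds):
    '''Return (tpr, fpr) arrays with one entry per threshold.'''
    y_true = np.asarray(y_true)
    positives = (y_true == 1).sum()
    negatives = (y_true == 0).sum()
    tpr, fpr = [], []
    for t in thresholds:
        pred = probs >= t  #classify as positive above the threshold
        tpr.append(((pred) & (y_true == 1)).sum()/positives)
        fpr.append(((pred) & (y_true == 0)).sum()/negatives)
    return np.array(tpr), np.array(fpr)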
forest = RandomForestClassifier(random_state=seed,n_estimators=76)
plt.figure(figsize=(10,7))
tpr_forest=[]
fpr_forest=[]
aucs_forest=[]
counter=0
for tr_fold, test_fold in benchCV.split(train_f[selected],train_f[target]):
forest.fit(train_f[selected].iloc[tr_fold],train_f[target].iloc[tr_fold])
probs=forest.predict_proba(train_f[selected].iloc[test_fold])[:,1]#prob of class 1
tpr,fpr=get_roc_curve(train_f[target].iloc[test_fold].values,probs,thresholds)
tpr_forest.append(tpr)
fpr_forest.append(fpr)
auc_fold=auc(fpr, tpr)
aucs_forest.append(auc_fold)
plt.plot(fpr,tpr,lw=1,alpha=0.3,
label='Fold {}, AUC={:.3f}'.format(counter,auc_fold))
#plt.plot(fpr[index],tpr[index],'r+');
counter+=1
#chance curve
plt.plot([0, 1], [0, 1],linestyle='--',lw=1, alpha=.8,
color='r',label='Chance')
mean_tpr_f = np.mean(tpr_forest, axis=0)
mean_fpr_f = np.mean(fpr_forest, axis=0)
mean_auc_f = np.mean(aucs_forest)
std_auc_f = np.std(aucs_forest)
plt.plot(mean_fpr_f, mean_tpr_f,lw=2, color='b',
label=r'Mean ROC (AUC={:.3f} $\pm$ {:.3f})'.format(mean_auc_f, std_auc_f))
#the cross shows the value corresponding to a standard threshold of 0.5
index=np.where(thresholds ==0.5)[0][0]#assuming that 0.5 is present in thresholds
plt.plot(mean_fpr_f[index],mean_tpr_f[index],'r+')
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.title('ROC random-forest model', fontsize=18)
plt.legend(loc="lower right", prop={'size': 12});
#comparing the two models
plt.figure(figsize=(10,7))
#decision tree ROC
plt.plot(mean_fpr, mean_tpr,lw=2,
label=r'Dtree Mean ROC (AUC={:.2f} $\pm$ {:.2f})'.format(mean_auc, std_auc))
plt.plot(mean_fpr_f, mean_tpr_f,lw=2,
label=r'Rforest Mean ROC (AUC={:.2f} $\pm$ {:.2f})'.format(mean_auc_f, std_auc_f))
#chance curve
plt.plot([0, 1], [0, 1],linestyle='--',lw=1, alpha=.8,
color='r',label='Chance')
plt.plot(mean_fpr[index],mean_tpr[index],'r+')
plt.plot(mean_fpr_f[index],mean_tpr_f[index],'g+')
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.title('ROC for the two models', fontsize=18)
plt.legend(loc="lower right", prop={'size': 12});
rates=pd.concat([pd.Series(thresholds,name='Thresholds'),
pd.Series(mean_tpr,name='tree_tpr'),
pd.Series(mean_fpr,name='tree_fpr'),
pd.Series(mean_tpr_f,name='forest_tpr'),
pd.Series(mean_fpr_f,name='forest_fpr')
],axis=1)
rates