For the most part, I settled on Linear Discriminant Analysis because of the results shown below.
Plots and results
#Imports used
%matplotlib inline
import pandas
import pandas.io.parsers
from sklearn.lda import LDA
from sklearn.qda import QDA
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pandas.tools.plotting import bootstrap_plot
from matplotlib.mlab import PCA
import sklearn.feature_selection
import sklearn.preprocessing #for the StandardScaler referenced below
from itertools import combinations
from scipy import stats
import pylab as P
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
import warnings
import random
from sklearn.metrics import classification_report
import statsmodels.formula.api as sm
import brewer2mpl
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from collections import defaultdict
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from scipy import interp
from sklearn.metrics import accuracy_score
from IPython.display import HTML
DEBUG=False
def strat_map(y):
    """
    Returns permuted indices that maintain class membership.
    Used to shuffle folds during cross validation.
    """
    smap = defaultdict(list)
    for i, v in enumerate(y):
        smap[v].append(i)
    for values in smap.values():
        random.shuffle(values)
    y_map = np.zeros_like(y)
    for i, v in enumerate(y):
        y_map[i] = smap[v].pop()
    return y_map
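As a quick illustration (toy labels, not part of the original analysis), strat_map sends each position to another position that holds the same label, so shuffled folds keep their class balance:
toy_y = np.array([0, 0, 0, 1, 1, 1])
toy_map = strat_map(toy_y)
# toy_map is a permutation of 0..5: positions 0-2 draw only from indices 0-2
# (class 0) and positions 3-5 only from indices 3-5 (class 1)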
elisa_raw = pandas.io.parsers.read_csv('data/ELISA.csv')
#last 2 columns have garbage in them
if DEBUG:
    print "Original import"
    print elisa_raw.head()
elisa_raw = elisa_raw[elisa_raw.columns[:-2]]
if DEBUG:
    print "Remove garbage columns"
    print elisa_raw.head()
elisa_raw.columns = ['name','HMOX1', 'TGFBI', 'VCAM1', 'CD44', 'AGE', 'GNDR', 'DISEASE']
if DEBUG:
    print "Give columns better names"
    print elisa_raw.head()
elisa_raw.set_index('name', inplace=True)
elisa = elisa_raw
#add binary healthy/not healthy vector
labels = map(lambda x: 0 if x[0] == 'H' else 1, elisa.index)
elisa['GBM'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
    print "Add binary column denoting GBM/not GBM"
    print elisa['GBM'].head()
#Add data source as integer vector
def source(name):
    """
    Maps a sample name to its data source:
    0 = healthy sample with a numeric id, 1 = healthy sample with a
    non-numeric id, 2 = name without a '-' (the GBM samples).
    """
    try:
        label, idt = name.split('-')
    except ValueError:
        return 2
    if label == 'Healthy':
        try:
            int(idt)
            return 0
        except ValueError:
            return 1
    raise Exception("Should not get here")
labels = map(source, elisa.index)
elisa['grp'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
    print "Add integer column denoting data source"
    print elisa.head()
proteins = ['HMOX1', 'TGFBI', 'VCAM1', 'CD44']
set1 = brewer2mpl.get_map('Set3', 'qualitative', 12).mpl_colors
fig = plt.figure(figsize=(11,8))
fig.subplots_adjust(wspace = 0)
fig.subplots_adjust(hspace = 0)
fig.suptitle( 'AUCROC of k-fold cross validation\n using Linear Discriminant Analysis', fontsize=18, alpha=.9 )
fig.subplots_adjust(top=.9)
for nfolds in range(2, 11):
    ax = fig.add_subplot(3, 3, nfolds-1)
    clf = LDA()
    X = elisa[proteins].values
    Y = np.array([0 if a else 1 for a in elisa['DISEASE'] == 'Healthy'])
    y = np.transpose(Y)
    sm = strat_map(y)
    ss = sklearn.preprocessing.StandardScaler()
    skf = StratifiedKFold(Y, nfolds)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    #fig = plt.figure(figsize=(4,4), dpi=72)
    for i, (train, test) in enumerate(skf):
        test, train = sm[test], sm[train] #shuffling
        probas_ = clf.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr, lw=1, label='f%d(%0.2f)' % (i+1, roc_auc), color=set1[i if i != 1 else 10])
    ax.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    mean_tpr /= len(skf)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    ax.plot(mean_fpr, mean_tpr, 'k--',
            label='Mean(%0.2f)' % mean_auc, lw=2)
    ax.set_xlim([-0.05, 1.05])
    ax.set_ylim([-0.05, 1.05])
    if nfolds in [8, 9, 10]:
        ax.set_xlabel('FPR')
    else:
        for tl_i, label in enumerate(ax.xaxis.get_ticklabels()):
            label.set_visible(False)
    if nfolds in [2, 5, 8]:
        ax.set_ylabel('TPR')
    else:
        for tl_i, label in enumerate(ax.yaxis.get_ticklabels()):
            label.set_visible(False)
    if nfolds > 6:
        ncols = 2
    else:
        ncols = 1
    ax.legend(loc="lower right", fontsize=8, ncol=ncols, title='%i-fold CV' % nfolds, framealpha=.5)
fig.savefig("figures/AUCROC-LDA.pdf")
HTML('<div id="ROC">PDF in figures/AUCROC-LDA.pdf</div>')
This is insanely good classification. The whole reason I am running so many different CVs is that the results are so ridiculously good that they need to be checked exhaustively.
roc_auc_store = []
acc_store = []
if DEBUG:
    N = 50 #if debugging, let's not take an hour
else:
    N = 10000
nfolds = 10
for _ in range(N):
    clf = LDA()
    X = elisa[proteins].values
    Y = np.array([0 if a else 1 for a in elisa['DISEASE'] == 'Healthy'])
    y = np.transpose(Y)
    sm = strat_map(y)
    ss = sklearn.preprocessing.StandardScaler()
    skf = StratifiedKFold(Y, nfolds)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    #for storing predictions for accuracy
    y_pred_total = np.zeros_like(y)
    probas = np.zeros(len(y))
    for i, (train, test) in enumerate(skf):
        test, train = sm[test], sm[train] #shuffling, it is ridiculous that sklearn does not do this
        clf.fit(X[train], y[train])
        probas_ = clf.predict_proba(X[test])
        y_pred = clf.predict(X[test])
        y_pred_total[test] = y_pred
        #acc_store.append( accuracy_score(y[test], y_pred) )
        # Compute ROC curve and area under the curve
        probas[test] = probas_[:, 1]
        #fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        #roc_auc_store.append(auc(fpr, tpr))
    fpr, tpr, thresholds = roc_curve(y, probas)
    roc_auc_store.append(auc(fpr, tpr))
    acc_store.append(accuracy_score(y, y_pred_total))
roc_auc_store = np.array(roc_auc_store)
acc_store = np.array(acc_store)
st = "After %i iterations of %i-fold cross validation using LDA" % (N, nfolds)
st += " the mean ROCAUC is %.4f +/- %.3f" % (roc_auc_store.mean(), roc_auc_store.std())
st += " and the mean accuracy is %.4f +/- %.3f" % (acc_store.mean(), acc_store.std())
HTML('<div id="acc">%s</div>' % (st) )
For perspective, 40/42 is .952, so on average we are misclassifying about 2 of the 42 samples.
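A quick sanity check of that arithmetic (the 42 comes from the sample count mentioned above; swap in acc_store.mean() for the actual run):
n_samples = 42 #total samples in the ELISA data set
mean_acc = 40.0 / 42 #about 0.952; substitute acc_store.mean() from the run above
print "expect about %.1f misclassified samples per CV" % (n_samples * (1 - mean_acc))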
elisa_src0 = elisa[elisa['grp'] != 1]
elisa_src1 = elisa[elisa['grp'] != 0]
fig = plt.figure(figsize=(11,8), dpi=300)
training_labels = ["Complete Dataset", "Source 1 only", "Source 2 only"]
fig.subplots_adjust(hspace = 0)
fig.suptitle("Linear Discriminant Analysis Predictions", fontsize=18)
for et_i, elisa_train in enumerate([elisa, elisa_src0, elisa_src1]):
    ax = fig.add_subplot(3, 1, et_i + 1)
    sp_title = "Trained on %s" % training_labels[et_i]
    #ax.set_title()
    src_vec = [i for i in elisa['grp']]
    X = elisa_train[proteins].values
    Y = np.array([0 if a else 1 for a in elisa_train['DISEASE'] == 'Healthy'])
    y = np.transpose(Y)
    lda = LDA()
    lda.fit(X, y, store_covariance=True)
    X_complete = elisa[proteins].values
    Y_complete = np.array([0 if a else 1 for a in elisa['DISEASE'] == 'Healthy'])
    y_complete = np.transpose(Y_complete)
    data_projected = lda.decision_function(X_complete)
    y_pred = lda.predict(X_complete)
    prob = lda.predict_proba(X_complete)
    ax_store = []
    label_store = []
    healthy0_P = []
    healthy1_P = []
    gbm_P = []
    mis_P = []
    healthy0_proj = []
    healthy1_proj = []
    gbm_proj = []
    mis_proj = []
    labels = ['Healthy Source 1', 'Healthy Source 2', 'GBM', 'Misclassification']
    set1 = brewer2mpl.get_map('Paired', 'qualitative', 12).mpl_colors
    colors = [set1[0], set1[1], set1[2], 'red']
    shapes = ['v', '^', 'o', 'x']
    for cls, pred, pr, v, src in zip(y_complete, y_pred, prob, data_projected, src_vec):
        #print cls, v
        v = v + random.random() * .1 #small jitter to reduce overplotting
        if src == 0:
            healthy0_P.append(0 + pr[0])
            healthy0_proj.append(v)
        elif src == 1:
            healthy1_P.append(0 + pr[0])
            healthy1_proj.append(v)
        else:
            gbm_P.append(0 + pr[0])
            gbm_proj.append(v)
        if pred != cls:
            mis_P.append(0 + pr[0])
            mis_proj.append(v)
    for a, b, c, d, e in zip([healthy0_proj, healthy1_proj, gbm_proj, mis_proj],
                             [healthy0_P, healthy1_P, gbm_P, mis_P],
                             labels,
                             colors,
                             shapes):
        ax_store.append(ax.scatter(a, b, label=c, color=d, marker=e))
        label_store.append(c)
    axsig = ax.axhline(y=.5, color='k', linestyle='--', alpha=.1)
    ax_T = ax.axvline(x=0, color='k', linestyle='--', alpha=.1)
    ax_store.append(axsig)
    label_store.append('Decision Boundaries')
    if et_i < 2:
        for tl_i, label in enumerate(ax.xaxis.get_ticklabels()):
            label.set_visible(False)
    else:
        ax.set_xlabel("LDA projection to 1D")
    ax.set_ylabel("P(C=Healthy | x)")
    ax.set_ylim([-.05, 1.05])
    ax.set_xlim([-25, 25])
    ax.legend(tuple(ax_store), tuple(label_store), loc='upper right', fontsize=8, title=sp_title)
fig.savefig("figures/LDA-predictions-batch.pdf")
HTML('<div id="lpb">PDF in figures/LDA-predictions-batch.pdf</div>')
The above shows the actual predictions on the complete data set, both for a model trained on everything and for models trained with one batch left out.