Classification

For the most part, I settled on Linear Discriminant Analysis because

Plots and results

In [1]:
#Imports used
%matplotlib inline
import pandas
import pandas.io.parsers
from sklearn.lda import LDA
from sklearn.qda import QDA
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pandas.tools.plotting import bootstrap_plot
from matplotlib.mlab import PCA
import matplotlib.pyplot as plt
import sklearn.feature_selection
from itertools import combinations
from scipy import stats
import pylab as P
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
import warnings
import random
from sklearn.metrics import classification_report
import statsmodels.formula.api as sm
import brewer2mpl
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from collections import defaultdict
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from scipy import interp
from sklearn.metrics import accuracy_score
import brewer2mpl
from IPython.display import HTML
DEBUG=False
In [2]:
def strat_map(y):
    """
    Returns permuted indices that maintain class
    Used in cross validation
    """
    smap = defaultdict(list)
    for i,v in enumerate(y):
        smap[v].append(i)
    for values in smap.values():
        random.shuffle(values)
    y_map = np.zeros_like(y)
    for i,v in enumerate(y):
        y_map[i] = smap[v].pop()
    return y_map
In [3]:
elisa_raw = pandas.io.parsers.read_csv('data/ELISA.csv')
#last 2 columns have garbage in them
if DEBUG:
    print "Original import"
    print elisa_raw.head()
elisa_raw =  elisa_raw[elisa_raw.columns[:-2]]
if DEBUG:
    print "Remove garbage columns"
    print elisa_raw.head()
elisa_raw.columns = ['name','HMOX1', 'TGFBI', 'VCAM1', 'CD44', 'AGE', 'GNDR', 'DISEASE']
if DEBUG:
    print "Give columns better names"
    print elisa_raw.head()
elisa_raw.set_index('name', inplace=True)
elisa = elisa_raw
#add binary healthy/not healthy vector
labels = map(lambda x: 0 if x[0] == 'H' else 1, elisa.index)
elisa['GBM'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
    print "Add binary column denoting GBM/not GBM"
    print elisa['GBM'].head()
#Add data source as integer vector
def source( name ):
    try:
        label, idt = name.split('-')
    except ValueError as e:
        return 2
    if label == 'Healthy':
        try:
            int(idt)
            return 0
        except ValueError:
            return 1
    raise Exception("Should not get here")
labels = map(source, elisa.index) 
elisa['grp'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
    print "Add integer column denoting data source"
    print elisa.head()
proteins = ['HMOX1', 'TGFBI', 'VCAM1', 'CD44']
In [4]:
set1 = brewer2mpl.get_map('Set3', 'qualitative', 12).mpl_colors
fig = plt.figure(figsize=(11,8))
fig.subplots_adjust(wspace = 0)
fig.subplots_adjust(hspace = 0)
fig.suptitle( 'AUCROC of k-fold cross validation\n using Linear Discriminant Analysis', fontsize=18, alpha=.9 )
fig.subplots_adjust(top=.9)
for nfolds in range(2,11):
    ax = fig.add_subplot(3,3, nfolds-1)
    clf = LDA()
    X = elisa[proteins].values
    Y = np.array([0 if a else 1 for a in  elisa['DISEASE'] == 'Healthy'])
    y = np.transpose(Y)
    sm = strat_map(y)
    
    ss = sklearn.preprocessing.StandardScaler()
    
    skf = StratifiedKFold(Y, nfolds)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    #fig = plt.figure(figsize=(4,4), dpi=72)
    
    for i, (train, test) in enumerate(skf):
        test,train = sm[test], sm[train] #shuffling
        probas_ = clf.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        
        ax.plot(fpr, tpr, lw=1, label='f%d(%0.2f)' % (i+1, roc_auc), color=set1[i if i!=1 else 10])
    
    ax.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    
    mean_tpr /= len(skf)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    ax.plot(mean_fpr, mean_tpr, 'k--',
            label='Mean(%0.2f)' % mean_auc, lw=2)
    
    ax.set_xlim([-0.05, 1.05])
    ax.set_ylim([-0.05, 1.05])
    if nfolds in [8,9,10]:
        ax.set_xlabel('FPR')
    else:
        for tl_i, label in enumerate(ax.xaxis.get_ticklabels()):
            label.set_visible(False)
    if nfolds in [2,5,8]:
        ax.set_ylabel('TPR')
    else:
        for tl_i, label in enumerate(ax.yaxis.get_ticklabels()):
            label.set_visible(False)
    if nfolds>6:
        ncols = 2
    else:
        ncols = 1
    ax.legend(loc="lower right", fontsize=8, ncol=ncols, title='%i-fold CV' % nfolds, framealpha=.5)
fig.savefig("figures/AUCROC-LDA.pdf")
HTML('<div id="ROC">PDF in figures/AUCROC-LDA.pdf</div>')
Out[4]:
PDF in figures/AUCROC-LDA.pdf

This is insanely good classification. The whole reason I am running so many difference CVs is because the results are so ridiculously good that the results need to be exhaustive.

In [5]:
roc_auc_store = []
acc_store = []
if DEBUG:
    N = 50 #if debugging lets not take an hour
else:
    N = 10000
nfolds = 10
for _ in range(N):
    clf = LDA()
    X = elisa[proteins].values
    Y = np.array([0 if a else 1 for a in  elisa['DISEASE'] == 'Healthy'])
    y = np.transpose(Y)
    sm = strat_map(y)
    
    skf = StratifiedKFold(Y, nfolds)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    #for storing predictions for accuracy
    y_pred_total = np.zeros_like(y)
    probas = np.zeros(len(y))
    for i, (train, test) in enumerate(skf):
        test,train = sm[test], sm[train] #shuffling, it is ridiculous that sklearn does not do this
        clf.fit(X[train], y[train])
        probas_ = clf.predict_proba(X[test])
        y_pred = clf.predict(X[test])
        y_pred_total[test] = clf.predict(X[test])
        #acc_store.append( accuracy_score(y[test] ,y_pred) )
        # Compute ROC curve and area the curve
        probas[test] = probas_[:, 1]
        #fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])

        #roc_auc_store.append(auc(fpr, tpr))
    fpr, tpr, thresholds = roc_curve(y, probas)
    roc_auc_store.append(auc(fpr, tpr))
    acc_store.append(accuracy_score(y, y_pred_total))
roc_auc_store = np.array(roc_auc_store)
acc_store = np.array(acc_store)

st = "After %i iterations of %i-fold cross validation using LDA" % (N, nfolds)
st += " the mean ROCAUC is %.4f +/- %.3f" % (roc_auc_store.mean(), roc_auc_store.std())
st += " and the mean accuracy is %.4f +/- %.3f" % (acc_store.mean(), acc_store.std())

HTML('<div id="acc">%s</div>' % (st) )
Out[5]:
After 10000 iterations of 10-fold cross validation using LDA the mean ROCAUC is 0.9955 +/- 0.003 and the mean accuracy is 0.9566 +/- 0.023

For perspective, 40/42 is .952, so we average missing 2.

In [6]:
elisa_src0 = elisa[elisa['grp'] != 1]
elisa_src1 = elisa[elisa['grp'] != 0]


fig = plt.figure(figsize=(11,8), dpi=300)
training_labels = ["Complete Dataset", "Source 1 only", "Source 2 only"]
fig.subplots_adjust(hspace = 0)
fig.suptitle("Linear Discriminant Analysis Predictions", fontsize=18)
for et_i, elisa_train in enumerate([elisa, elisa_src0, elisa_src1]):
    ax = fig.add_subplot(3,1, et_i + 1)
    sp_title = "Trained on %s" % training_labels[et_i]
    #ax.set_title()

    
    src_vec = [i for i in elisa['grp']]

    X = elisa_train[proteins].values
    Y = np.array([0 if a else 1 for a in  elisa_train['DISEASE'] == 'Healthy'])
    y = np.transpose(Y)
    lda = LDA()
    lda.fit(X, y, store_covariance=True)
    
    X_complete = elisa[proteins].values
    Y_complete = np.array([0 if a else 1 for a in  elisa['DISEASE'] == 'Healthy'])
    y_complete = np.transpose(Y_complete)
    data_projected = lda.decision_function(X_complete)
    
    
    y_pred =  lda.predict(X_complete)
    prob = lda.predict_proba(X_complete)
    ax_store = []
    label_store = []
    
    healthy0_P = []
    healthy1_P = []
    gbm_P = []
    mis_P = []
    
    healthy0_proj = []
    healthy1_proj = []
    gbm_proj = []
    mis_proj = []
    
    labels = ['Healthy Source 1','Healthy Source 2', 'GBM', 'Misclassification']
    colors = ['blue','purple', 'green','red']
    import brewer2mpl
    set1 = brewer2mpl.get_map('Paired', 'qualitative', 12).mpl_colors
    colors = [set1[0], set1[1], set1[2], 'red']
    shapes = ['v','^','o', 'x']
    
    
    for cls, pred, pr, v, src in zip(y_complete, y_pred, prob, data_projected, src_vec):
        #print cls, v
        v = v + random.random() *.1
        if src == 0:
            healthy0_P.append(0+pr[0])
            healthy0_proj.append(v)
        elif src == 1:
            healthy1_P.append(0+pr[0])
            healthy1_proj.append(v)
        else:
            gbm_P.append(0+pr[0])
            gbm_proj.append(v)
        if pred != cls:
            mis_P.append(0+pr[0])
            mis_proj.append(v)
    for a,b,c,d,e in zip([healthy0_proj,healthy1_proj, gbm_proj, mis_proj],
                       [healthy0_P, healthy1_P,gbm_P, mis_P],
                       labels, 
                       colors,
                       shapes):
        ax_store.append(ax.scatter(a,b,label=c, color=d, marker=e))
        label_store.append(c)
    axsig = ax.axhline(y=.5,color='k', linestyle='--', alpha=.1)
    ax_T = ax.axvline(x=0, color='k', linestyle='--', alpha=.1)
    
    ax_store.append(axsig)
    label_store.append('Decision Boundaries')
    if et_i < 2:
        for tl_i, label in enumerate(ax.xaxis.get_ticklabels()):
            label.set_visible(False)
    else:
        ax.set_xlabel("LDA projection to 1D")
    ax.set_ylabel("P(C=Healthy | x)")
    ax.set_ylim([-.05,1.05])
    ax.set_xlim([-25, 25])
    
    ax.legend(tuple(ax_store ),tuple(label_store), 'upper right', fontsize='8', title=sp_title)

fig.savefig("figures/LDA-predictions-batch.pdf")
HTML('<div id="lpb">PDF in figures/LDA-predictions-batch.pdf</div>')
Out[6]:
PDF in figures/LDA-predictions-batch.pdf

The above shows the actual predictions for the complete data set and for each when you leave one batch out.