Independent Confirmation of GBM Biomarker Accuracy

This script is designed to be run without modification after the arrival of an independent test set.

It

In [84]:
#Imports used
%matplotlib inline
import pandas
import pandas.io.parsers
from sklearn.lda import LDA
from sklearn.qda import QDA
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pandas.tools.plotting import bootstrap_plot
from matplotlib.mlab import PCA
import matplotlib.pyplot as plt
import sklearn.feature_selection
from itertools import combinations
from scipy import stats
import pylab as P
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
import warnings
import random
from sklearn.metrics import classification_report
import statsmodels.formula.api as sm
import brewer2mpl
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from collections import defaultdict
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from scipy import interp
from sklearn.metrics import accuracy_score
import brewer2mpl
from IPython.display import HTML, display
DEBUG=False

Import original data

In [85]:
elisa_raw = pandas.io.parsers.read_csv('data/ELISA.csv')
#last 2 columns have garbage in them
if DEBUG:
    print "Original import"
    print elisa_raw.head()
elisa_raw =  elisa_raw[elisa_raw.columns[:-2]]
if DEBUG:
    print "Remove garbage columns"
    print elisa_raw.head()
elisa_raw.columns = ['name','HMOX1', 'TGFBI', 'VCAM1', 'CD44', 'AGE', 'GNDR', 'DISEASE']
if DEBUG:
    print "Give columns better names"
    print elisa_raw.head()
elisa_raw.set_index('name', inplace=True)
elisa = elisa_raw
#add binary healthy/not healthy vector
labels = map(lambda x: 0 if x[0] == 'H' else 1, elisa.index)
elisa['GBM'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
    print "Add binary column denoting GBM/not GBM"
    print elisa['GBM'].head()
#Add data source as integer vector
def source( name ):
    try:
        label, idt = name.split('-')
    except ValueError as e:
        return 2
    if label == 'Healthy':
        try:
            int(idt)
            return 0
        except ValueError:
            return 1
    raise Exception("Should not get here")
labels = map(source, elisa.index) 
elisa['grp'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
    print "Add integer column denoting data source"
    print elisa.head()
proteins = ['HMOX1', 'TGFBI', 'VCAM1', 'CD44']

Train LDA classifier

In [86]:
X = elisa[proteins].values
y = [x for x in elisa['GBM']]
if DEBUG:
    print X
    print y
clf = LDA()
clf.fit(X, y,  store_covariance=True)
Out[86]:
LDA(n_components=None, priors=None)

Display learned parameters of classifier

In [87]:
display(HTML('<div id="lpc">Result Summary</div>'))
print "Proteins and their order"
print proteins
print "Coefficients of the features in the linear decision function. rank is min(rank_features, n_classes) where rank_features is the dimensionality of the spaces spanned by the features (i.e. n_features excluding redundant features)."
print clf.coef_
print "Covariance matrix (shared by all classes)."
print clf.covariance_
print "Class means."
print clf.means_
print "Class priors (sum to 1)."
print clf.priors_
print "Scaling of the features in the space spanned by the class centroids."
print clf.scalings_
print "Overall mean."
print clf.xbar_
Result Summary
Proteins and their order
['HMOX1', 'TGFBI', 'VCAM1', 'CD44']
Coefficients of the features in the linear decision function. rank is min(rank_features, n_classes) where rank_features is the dimensionality of the spaces spanned by the features (i.e. n_features excluding redundant features).
[[-1.83935771]
 [ 1.83935771]]
Covariance matrix (shared by all classes).
[[  2.51831197e+01   6.36927439e+02   4.46886725e+02   2.47132839e+01]
 [  6.36927439e+02   3.66546299e+05   2.33794129e+04   4.55612216e+03]
 [  4.46886725e+02   2.33794129e+04   4.46094547e+04   4.15873424e+03]
 [  2.47132839e+01   4.55612216e+03   4.15873424e+03   1.25500397e+03]]
Class means.
[[   10.70671021  2482.51420862   583.22041429   149.31081015]
 [   17.52224547   931.74141925   436.40557243    75.08792602]]
Class priors (sum to 1).
[ 0.5  0.5]
Scaling of the features in the space spanned by the class centroids.
[[ 0.12074336]
 [-0.00117826]
 [-0.00023884]
 [-0.01338536]]
Overall mean.
[   14.11447784  1707.12781393   509.81299336   112.19936808]

Import Independent Test Set

The test set should be of the form

In [88]:
ind_test_set = pandas.io.parsers.read_csv('data/ind-test.csv')

Predict classes

In [89]:
X_ind = ind_test_set[proteins]
y_ind_true = [1 if x[0] == 'G' else 0 for x in ind_test_set['SID']]
y_ind_pred =  clf.predict(X_ind)
y_ind_prob = clf.predict_proba(X_ind)
y_ind_proj = clf.decision_function(X_ind)

Result Summary

In [90]:
HTML('<div id="rsum">Result Summary</div>')
print classification_report(y_ind_true, y_ind_pred)
print "Accuracy", accuracy_score(y_ind_true, y_ind_pred)
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        21
          1       1.00      1.00      1.00        21

avg / total       1.00      1.00      1.00        42

Accuracy 1.0

In [91]:
import brewer2mpl
set1 = brewer2mpl.get_map('Paired', 'qualitative', 12).mpl_colors
colors = [set1[0], set1[2], 'red']
shapes = ['v','o', 'x']
labels = ["Healthy", "GBM", "Misclassification"]
fig = plt.figure(figsize=(11,8), dpi=300)
ax = fig.add_subplot(1,1,1)
healthy_prob = []
healthy_proj = []
gbm_prob = []
gbm_proj = []

mis_prob = []
mis_proj = []

for cls, pred, pr, v in zip( y_ind_true, y_ind_pred, y_ind_prob, y_ind_proj ):
    v = v+(random.random()*.1)
    if cls == 0:
        healthy_prob.append( pr[0] )
        healthy_proj.append( v )
    else:
        gbm_prob.append( pr[0] )
        gbm_proj.append( v )
    if pred != cls:
        mis_prob.append( pr[0] )
        mis_proj.append( v )
for a,b,c,d,e in zip([healthy_proj, gbm_proj, mis_proj],
               [healthy_prob,gbm_prob, mis_prob],
               labels, 
               colors,
               shapes):
    ax.scatter(a,b,label=c, color=d, marker=e)
axsig = ax.axhline(y=.5,color='k', linestyle='--', alpha=.1, label="Dec. Boundary")
ax_T = ax.axvline(x=0, color='k', linestyle='--', alpha=.1)

ax.set_xlabel("LDA projection to 1D")
ax.set_ylabel("P(C=Healthy | x)")
ax.set_ylim([-.05,1.05])
ax.set_xlim([-25, 25])
ax.legend(loc="upper right", fontsize=8, title='Independent Test Set Results' ,framealpha=.5)

fig.savefig("figures/LDA-sigmoid-ind-test-results.pdf")  
HTML('<div id="lpb">PDF in figures/LDA-sigmoid-ind-test-results.pdf</div>')
Out[91]:
PDF in figures/LDA-sigmoid-ind-test-results.pdf
In [92]:
fpr, tpr, thresholds = roc_curve(y_ind_true, y_ind_prob[:, 1])
roc_auc = auc(fpr, tpr)
fig = plt.figure(figsize=(11,8), dpi=300)
ax = fig.add_subplot(1,1,1)
ax.plot( fpr, tpr, lw=1, label="ROC ind. test(%0.2f)" % ( roc_auc ), color=colors[0]  )
    
ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')

ax.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
ax.legend(loc="lower right", fontsize=8, title='Independent Test Set Results' ,framealpha=.5)
fig.savefig("figures/AUCROC-LDA-ind-test.pdf")
HTML('<div id="ROC">PDF in figures/AUCROC-LDA-ind-test.pdf</div>')
Out[92]:
PDF in figures/AUCROC-LDA-ind-test.pdf
In []: