This script is designed to be run without modification after the arrival of an independent test set.
It
#Imports used
%matplotlib inline
import pandas
import pandas.io.parsers
from sklearn.lda import LDA
from sklearn.qda import QDA
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pandas.tools.plotting import bootstrap_plot
from matplotlib.mlab import PCA
import matplotlib.pyplot as plt
import sklearn.feature_selection
from itertools import combinations
from scipy import stats
import pylab as P
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
import warnings
import random
from sklearn.metrics import classification_report
import statsmodels.formula.api as sm
import brewer2mpl
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from collections import defaultdict
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from scipy import interp
from sklearn.metrics import accuracy_score
import brewer2mpl
from IPython.display import HTML, display
DEBUG=False
elisa_raw = pandas.io.parsers.read_csv('data/ELISA.csv')
#last 2 columns have garbage in them
if DEBUG:
print "Original import"
print elisa_raw.head()
elisa_raw = elisa_raw[elisa_raw.columns[:-2]]
if DEBUG:
print "Remove garbage columns"
print elisa_raw.head()
elisa_raw.columns = ['name','HMOX1', 'TGFBI', 'VCAM1', 'CD44', 'AGE', 'GNDR', 'DISEASE']
if DEBUG:
print "Give columns better names"
print elisa_raw.head()
elisa_raw.set_index('name', inplace=True)
elisa = elisa_raw
#add binary healthy/not healthy vector
labels = map(lambda x: 0 if x[0] == 'H' else 1, elisa.index)
elisa['GBM'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
print "Add binary column denoting GBM/not GBM"
print elisa['GBM'].head()
#Add data source as integer vector
def source( name ):
try:
label, idt = name.split('-')
except ValueError as e:
return 2
if label == 'Healthy':
try:
int(idt)
return 0
except ValueError:
return 1
raise Exception("Should not get here")
labels = map(source, elisa.index)
elisa['grp'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
print "Add integer column denoting data source"
print elisa.head()
proteins = ['HMOX1', 'TGFBI', 'VCAM1', 'CD44']
X = elisa[proteins].values
y = [x for x in elisa['GBM']]
if DEBUG:
print X
print y
clf = LDA()
clf.fit(X, y, store_covariance=True)
display(HTML('<div id="lpc">Result Summary</div>'))
print "Proteins and their order"
print proteins
print "Coefficients of the features in the linear decision function. rank is min(rank_features, n_classes) where rank_features is the dimensionality of the spaces spanned by the features (i.e. n_features excluding redundant features)."
print clf.coef_
print "Covariance matrix (shared by all classes)."
print clf.covariance_
print "Class means."
print clf.means_
print "Class priors (sum to 1)."
print clf.priors_
print "Scaling of the features in the space spanned by the class centroids."
print clf.scalings_
print "Overall mean."
print clf.xbar_
The test set should be of the form
ind_test_set = pandas.io.parsers.read_csv('data/ind-test.csv')
X_ind = ind_test_set[proteins]
y_ind_true = [1 if x[0] == 'G' else 0 for x in ind_test_set['SID']]
y_ind_pred = clf.predict(X_ind)
y_ind_prob = clf.predict_proba(X_ind)
y_ind_proj = clf.decision_function(X_ind)
HTML('<div id="rsum">Result Summary</div>')
print classification_report(y_ind_true, y_ind_pred)
print "Accuracy", accuracy_score(y_ind_true, y_ind_pred)
import brewer2mpl
set1 = brewer2mpl.get_map('Paired', 'qualitative', 12).mpl_colors
colors = [set1[0], set1[2], 'red']
shapes = ['v','o', 'x']
labels = ["Healthy", "GBM", "Misclassification"]
fig = plt.figure(figsize=(11,8), dpi=300)
ax = fig.add_subplot(1,1,1)
healthy_prob = []
healthy_proj = []
gbm_prob = []
gbm_proj = []
mis_prob = []
mis_proj = []
for cls, pred, pr, v in zip( y_ind_true, y_ind_pred, y_ind_prob, y_ind_proj ):
v = v+(random.random()*.1)
if cls == 0:
healthy_prob.append( pr[0] )
healthy_proj.append( v )
else:
gbm_prob.append( pr[0] )
gbm_proj.append( v )
if pred != cls:
mis_prob.append( pr[0] )
mis_proj.append( v )
for a,b,c,d,e in zip([healthy_proj, gbm_proj, mis_proj],
[healthy_prob,gbm_prob, mis_prob],
labels,
colors,
shapes):
ax.scatter(a,b,label=c, color=d, marker=e)
axsig = ax.axhline(y=.5,color='k', linestyle='--', alpha=.1, label="Dec. Boundary")
ax_T = ax.axvline(x=0, color='k', linestyle='--', alpha=.1)
ax.set_xlabel("LDA projection to 1D")
ax.set_ylabel("P(C=Healthy | x)")
ax.set_ylim([-.05,1.05])
ax.set_xlim([-25, 25])
ax.legend(loc="upper right", fontsize=8, title='Independent Test Set Results' ,framealpha=.5)
fig.savefig("figures/LDA-sigmoid-ind-test-results.pdf")
HTML('<div id="lpb">PDF in figures/LDA-sigmoid-ind-test-results.pdf</div>')
fpr, tpr, thresholds = roc_curve(y_ind_true, y_ind_prob[:, 1])
roc_auc = auc(fpr, tpr)
fig = plt.figure(figsize=(11,8), dpi=300)
ax = fig.add_subplot(1,1,1)
ax.plot( fpr, tpr, lw=1, label="ROC ind. test(%0.2f)" % ( roc_auc ), color=colors[0] )
ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
ax.legend(loc="lower right", fontsize=8, title='Independent Test Set Results' ,framealpha=.5)
fig.savefig("figures/AUCROC-LDA-ind-test.pdf")
HTML('<div id="ROC">PDF in figures/AUCROC-LDA-ind-test.pdf</div>')