In order to do TSP with these proteins it is necessary to do standardize each of the proteins.
#Imports used
%matplotlib inline
import pandas
import pandas.io.parsers
from sklearn.lda import LDA
from sklearn.qda import QDA
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pandas.tools.plotting import bootstrap_plot
from matplotlib.mlab import PCA
import matplotlib.pyplot as plt
import sklearn.feature_selection
from itertools import combinations
from scipy import stats
import pylab as P
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
import warnings
import random
from sklearn.metrics import classification_report
import statsmodels.formula.api as sm
import brewer2mpl
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from collections import defaultdict
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from scipy import interp
from sklearn.metrics import accuracy_score
from IPython.display import HTML, display
DEBUG=False
def strat_map(y):
"""
Returns permuted indices that maintain class
Used in cross validation
"""
smap = defaultdict(list)
for i,v in enumerate(y):
smap[v].append(i)
for values in smap.values():
random.shuffle(values)
y_map = np.zeros_like(y)
for i,v in enumerate(y):
y_map[i] = smap[v].pop()
return y_map
elisa_raw = pandas.io.parsers.read_csv('data/ELISA.csv')
#last 2 columns have garbage in them
if DEBUG:
print "Original import"
print elisa_raw.head()
elisa_raw = elisa_raw[elisa_raw.columns[:-2]]
if DEBUG:
print "Remove garbage columns"
print elisa_raw.head()
elisa_raw.columns = ['name','HMOX1', 'TGFBI', 'VCAM1', 'CD44', 'AGE', 'GNDR', 'DISEASE']
if DEBUG:
print "Give columns better names"
print elisa_raw.head()
elisa_raw.set_index('name', inplace=True)
elisa = elisa_raw
#add binary healthy/not healthy vector
labels = map(lambda x: 0 if x[0] == 'H' else 1, elisa.index)
elisa['GBM'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
print "Add binary column denoting GBM/not GBM"
print elisa['GBM'].head()
#Add data source as integer vector
def source( name ):
try:
label, idt = name.split('-')
except ValueError as e:
return 2
if label == 'Healthy':
try:
int(idt)
return 0
except ValueError:
return 1
raise Exception("Should not get here")
labels = map(source, elisa.index)
elisa['grp'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
print "Add integer column denoting data source"
print elisa.head()
proteins = ['HMOX1', 'TGFBI', 'VCAM1', 'CD44']
Y = np.array([0 if a else 1 for a in elisa['GBM'] == 0])
y = np.transpose(Y)
X = elisa[proteins].values
ss = sklearn.preprocessing.StandardScaler()
X_new = ss.fit_transform(X)
h_X = X_new[y == 0, :]
g_X = X_new[y == 1, :]
display(HTML('<div id="ct">'))
for a,b in combinations( range(4), 2 ):
gbm_a = (g_X[:,a] < g_X[:,b]).sum()
healthy_a = (h_X[:, a] < h_X[:, b]).sum()
gbm_b = (g_X[:, a] > g_X[:, b]).sum()
healthy_b = (h_X[:, a] > h_X[:, b]).sum()
tbl = [[gbm_a, gbm_b], [healthy_a, healthy_b]]
odds_ratio, p_value = stats.fisher_exact(tbl)
df = pandas.DataFrame(tbl, index=['GBM','Healthy'], columns=['%s < %s' % (proteins[a],proteins[b]) ,
'%s > %s' % (proteins[a],proteins[b]) ])
display(HTML('<div>' + df.to_html() + '</div>'))
display(HTML("<div>fisher exact test %.2E</div>" % p_value))
display(HTML('</div>'))
Y = np.array([0 if a else 1 for a in elisa['GBM'] == 0])
y = np.transpose(Y)
X = elisa[proteins].values
iterations = 100
a = 0
b = 1
DEBUG = False
if DEBUG:
print "Comparing %s and %s" % (proteins[a],proteins[b])
def innerTSP( X, y, p1, p2 ):
"""
p1 - index of first protein to compare
p2 - index of the second protein to compare
"""
ss = sklearn.preprocessing.StandardScaler()
X_new = ss.fit_transform(X)
h_X = X_new[y == 0, :]
g_X = X_new[y == 1, :]
a,b = p1,p2
gbm_a = (g_X[:,a] < g_X[:,b]).sum()
healthy_a = (h_X[:, a] < h_X[:, b]).sum()
gbm_b = (g_X[:, a] > g_X[:, b]).sum()
healthy_b = (h_X[:, a] > h_X[:, b]).sum()
tbl = [[gbm_a, gbm_b], [healthy_a, healthy_b]]
odds_ratio, p_value = stats.fisher_exact(tbl)
return p_value
pvals = []
def calcPrange(X, y, N, remove,a,b):
pvals = []
for _ in range(N):
sel = [random.randint(0, len(y)-1) for i in range(len(y)-remove)]
#sel = [i for i in range(len(y)) if i != rand]
#print sel
X_sub = X[sel, :]
y_sub = y[sel]
#print len(sel)
#print len(y_sub)
pvals.append(innerTSP(X_sub, y_sub, a,b))
res = np.array(pvals)
res.sort()
if DEBUG:
print(("Fisher Exact Test results for %s and %s at %i iterations, "
"randomly removing %i sample(s) with replacement.") % (
proteins[a], proteins[b], N, len(y) - len(sel)))
print("The average p-value %.2E +/- %.2E (95%% CI-analytical)" % (
res.mean(), 2* res.std()))
print("Empirical 95%% CI (%.2E, %.2E)" % (
res[int(len(res) * .025)], res[int(len(res) * .975)]))
return res
fig = plt.figure(figsize=(8,10), dpi=300)
n_tsp = len([a for a in combinations( range(4), 2 )])
ctr = 0
iterations = 1000
fig.subplots_adjust(hspace = .5, wspace=.4)
for a,b in combinations( range(4), 2 ):
ctr+=1
removed = []
N = iterations
pv = []
pmin = []
pmax = []
p975= []
p75 = []
p25 = []
p025= []
pvl = []
pminl = []
pmaxl = []
p975l= []
p75l = []
p25l = []
p025l= []
import brewer2mpl
set1 = brewer2mpl.get_map('YlOrRd', 'sequential', 6).mpl_colors
colors = set1[1:]
warnings.simplefilter("ignore")
pval_list = []
with warnings.catch_warnings():
#get annoying deprecation warning from scipy about pandas, ignore it
warnings.filterwarnings('ignore', category=DeprecationWarning)
for i in range(0, 15,2):
pval_list.append((i, calcPrange(X,y,iterations,i, a,b)))
ax = fig.add_subplot( n_tsp/2,2,ctr )
for p in pval_list:
removed.append(p[0])
pv.append(np.percentile(p[1], 50))
p025.append(np.percentile(p[1], 2.5))
p975.append(np.percentile(p[1], 97.5))
p25.append(np.percentile(p[1], 25))
p75.append(np.percentile(p[1], 75))
pmin.append(p[1].min())
pmax.append(p[1].max())
ax.set_title(('%s v %s') % (proteins[a], proteins[b]))
sig_level = .05
#plt.errorbar(removed,pv,yerr=[CIu, CIl],fmt=None, marker=None, mew=0)
base_size=100
ax50= ax.scatter(removed,pv, label="median", color=colors[2], marker='D', edgecolor='k')
ax025 = ax.scatter(removed,p025, label="2.5th perc.", color=colors[0],marker='^')
ax975 = ax.scatter(removed,p975, label="97.5th perc.", color=colors[4], marker='v')
ax25 = ax.scatter(removed,p25, label="25th perc.", color=colors[1], marker='s')
ax75 = ax.scatter(removed,p75, label="75th perc.", color=colors[3], marker='s')
axmin = ax.scatter(removed,pmin, label="min",color = 'black', marker='_')
axmax = ax.scatter(removed,pmax, label="max",color = 'black', marker='_')
axsig = ax.axhline(y=sig_level,color='red')
for_legend =(axmin, ax025, ax25, ax50, ax75, ax975, axmax, axsig)
ax.set_xlabel('k samples removed')
ax.set_ylabel('p-value (Fisher)')
ax.semilogy()
fig.legend(for_legend, ("min","2.5th perc.","25th perc.", "median", "75th perc.","97.5th perc.", "max", ".05 sig"),
fontsize='8', framealpha=.5, ncol=8, loc="lower right")
fig.suptitle("Robustness of pairwise protein comparisons\n Data sampled w/ replacement %i times" % (iterations), fontsize=18)
fig.savefig("figures/TSP-robustness2.pdf")
display(HTML('<div id="rob">PDF in figures/TSP-robustness2.pdf</div>'))