This page details the raw distributions of the data.
#Imports used
%matplotlib inline
import pandas
import pandas.io.parsers
from sklearn.lda import LDA
from sklearn.qda import QDA
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pandas.tools.plotting import bootstrap_plot
from matplotlib.mlab import PCA
import matplotlib.pyplot as plt
import sklearn.feature_selection
from itertools import combinations
from scipy import stats
import pylab as P
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
import warnings
import random
from sklearn.metrics import classification_report
import statsmodels.formula.api as sm
import brewer2mpl
from IPython.display import HTML
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from IPython.display import display_pretty, display_html, display_jpeg, display_png, display_json, display_latex, display_svg
DEBUG=False
elisa_raw = pandas.io.parsers.read_csv('data/ELISA.csv')
#last 2 columns have garbage in them
if DEBUG:
print "Original import"
print elisa_raw.head()
elisa_raw = elisa_raw[elisa_raw.columns[:-2]]
if DEBUG:
print "Remove garbage columns"
print elisa_raw.head()
elisa_raw.columns = ['name','HMOX1', 'TGFBI', 'VCAM1', 'CD44', 'AGE', 'GNDR', 'DISEASE']
if DEBUG:
print "Give columns better names"
print elisa_raw.head()
elisa_raw.set_index('name', inplace=True)
elisa = elisa_raw
#add binary healthy/not healthy vector
labels = map(lambda x: 0 if x[0] == 'H' else 1, elisa.index)
elisa['GBM'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
print "Add binary column denoting GBM/not GBM"
print elisa['GBM'].head()
#Add data source as integer vector
def source( name ):
try:
label, idt = name.split('-')
except ValueError as e:
return 2
if label == 'Healthy':
try:
int(idt)
return 0
except ValueError:
return 1
raise Exception("Should not get here")
labels = map(source, elisa.index)
elisa['grp'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
print "Add integer column denoting data source"
print elisa.head()
proteins = ['HMOX1', 'TGFBI', 'VCAM1', 'CD44']
X = elisa[proteins].values
Y = np.array([0 if a else 1 for a in elisa['DISEASE'] == 'Healthy'])
y = np.transpose(Y)
F_statistics, pvalues = sklearn.feature_selection.f_classif(X,Y)
fstat = pandas.DataFrame( np.array([F_statistics, pvalues]), columns=proteins,
index = ['F statistic', 'P-values'])
fig = plt.figure(figsize=(10,8), dpi=300)
fig.suptitle( 'Individual Protein Summaries', fontsize=18, alpha=.9 )
fig.subplots_adjust(top=.9)
# build a rectangle in axes coords
left, width = .25, .5
bottom, height = .25, .5
right = left + width
top = bottom + height
for p_i, prot in enumerate(proteins):
ax = fig.add_subplot(2,len(proteins),p_i +1)
bp = ax.boxplot( [elisa[prot][ elisa['DISEASE'] == 'Healthy' ], elisa[prot][ elisa['DISEASE'] == 'GBM' ]] )
ax.set_title( prot, fontsize=12, alpha=.7 )
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
if p_i == 0:
ax.set_ylabel('ng/ml', fontsize=10, alpha = .5)
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
ax.xaxis.set_ticklabels(['Healthy', 'GBM'], fontsize=10, alpha=.5)
ax.spines['left']._linewidth = 0.5
ax.spines['left']._alpha = 0.8
for label in ax.yaxis.get_ticklabels():
# label is a Text instance
label.set_alpha(.5)
label.set_rotation(45)
label.set_fontsize(8)
set1 = brewer2mpl.get_map('Set1', 'qualitative', 7).mpl_colors
plt.setp(bp['boxes'], color=set1[1], linewidth=0.3, alpha=.5)
plt.setp(bp['medians'], color=set1[0])
plt.setp(bp['whiskers'], color=set1[1], linestyle='solid', linewidth=0.5)
plt.setp(bp['fliers'], color=set1[1], marker='d', alpha=.5)
plt.setp(bp['caps'], color='w')
h_elisa = elisa[elisa['DISEASE'] == "Healthy"]
g_elisa = elisa[elisa['DISEASE'] == "GBM"]
#fig = plt.figure(figsize=(10,4), dpi=72, tight_layout=True)
#fig.subplots_adjust(hspace = 0)
#fig.subplots_adjust(right = 1)
fig.subplots_adjust(wspace = 0)
ctr = 0
for p_i, prot in enumerate(proteins):
ax = fig.add_subplot(2, len(proteins), len(proteins) + p_i + 1)
n, bins, patches = ax.hist( [h_elisa[prot], g_elisa[prot]], histtype='bar', bins=7, label=['Healthy', 'GBM'], alpha=.5)
#print dir( patches )
#ax.set_title(prot, fontsize=12, alpha=.7)
if p_i == 0:
ax.set_ylabel('count', fontsize=10, alpha = .5)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
#ax.spines['bottom'].set_visible(False)
ax.xaxis.set_ticks_position('none')
ax.spines['left']._linewidth = 0.5
ax.spines['left']._alpha = 0.8
ax.spines['bottom']._linewidth = 0.5
ax.spines['bottom']._alpha = 0.8
f = fstat[prot]['F statistic']
p = fstat[prot]['P-values']
if p_i > 1:
l_shift = .4
rot = 0
else:
l_shift = .05
rot = 0
our_title= "F-stat - %.1f" % ( f, )
ax.text(l_shift*(left+right), 0.92*(bottom+top), our_title,
horizontalalignment='left',
verticalalignment='center',
fontsize=10,
alpha=.5,
transform=ax.transAxes, rotation=rot)
our_title= "p-val - %.1E" % ( p,)
if p_i > 1:
l_shift = .4
rot = 0
else:
l_shift = .05
rot = 0
ax.text(l_shift*(left+right), 0.84*(bottom+top), our_title,
horizontalalignment='left',
verticalalignment='center',
fontsize=10,
alpha=.5,
transform=ax.transAxes,
rotation=rot)
if p_i != 0:
ax.yaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
ax.set_ylim((0,10))
last_i = len(ax.xaxis.get_ticklabels()) -1
for i, label in enumerate(ax.xaxis.get_ticklabels()):
# label is a Text instance
if 0 < i < last_i:
label.set_alpha(.5)
else:
label.set_alpha(0.0)
label.set_rotation(45)
label.set_fontsize(8)
for label in ax.yaxis.get_ticklabels():
# label is a Text instance
if p_i == 0:
label.set_alpha(.5)
label.set_fontsize(8)
else:
label.set_visible(False)
ctr+=1
handles, labels = ax.get_legend_handles_labels()
leg = fig.legend(handles, labels, bbox_to_anchor=(.57, .45), framealpha=.9)
for label in leg.get_texts():
label.set_fontsize(12)
label.set_alpha(.8)
fig.savefig("figures/individual-protein-summaries.pdf")
HTML('<div id="ips">PDF in figures/individual-protein-summaries.pdf</div>')
This graph shows the distributions of the proteins grouped by phenotype. There are 'significant' differences for all proteins(ANOVA), although the strength of those differences is highly variable and after a bonferroni correction, VCAM1 would no longer be significant at an alpha of .05(~.0125).
clean_columns = proteins # + ['AGE']
N = len(clean_columns)
fig = plt.figure(figsize=(22,22), dpi=300)
fig.subplots_adjust(top=.9)
fig.subplots_adjust(wspace=0, hspace=0)
show_ax = [(0,i) for i in range(0, N)]
show_ax += [(i,N-1) for i in range(0, N)]
h1_elisa = elisa[ elisa['grp'] == 0 ]
h1_elisa = h1_elisa[clean_columns]
h2_elisa = elisa[ elisa['grp'] == 1 ]
h2_elisa = h2_elisa[clean_columns]
g_elisa = elisa[ elisa['grp'] == 2 ]
g_elisa = g_elisa[clean_columns]
set1 = brewer2mpl.get_map('Paired', 'qualitative', 12).mpl_colors
colors = [set1[0], set1[1], set1[2], 'red']
shapes = ['v','^','o', 'x']
for i,a in enumerate(clean_columns):
for j,b in enumerate(clean_columns):
ax = fig.add_subplot(N,N,i*N+j+1)
"""
if i != j:
form = "%s ~ %s" % ( a,b )
result = sm.ols(formula=form, data=elisa ).fit()
prstd, iv_l, iv_u = wls_prediction_std(result)
ax.plot(elisa[b], result.fittedvalues, 'k-', alpha=.2)
ax.plot(elisa[b], iv_u, 'k--', alpha=.1)
ax.plot(elisa[b], iv_l, 'k--', alpha=.1)"""
p_healthy1 = ax.scatter(h1_elisa[b], h1_elisa[a], color=colors[0], marker=shapes[0], alpha=.9)
p_healthy2 = ax.scatter(h2_elisa[b], h2_elisa[a], color=colors[1], marker=shapes[1],alpha=.9)
p_gbm = ax.scatter(g_elisa[b],g_elisa[a], color=colors[2], marker=shapes[2], alpha=.9)
if i==0:
ax.set_title(clean_columns[j],size='12')
ax.set_ylabel(a,size='12')
ticks = [tick for tick in ax.get_xaxis().get_major_ticks()]
ticks[0].label.set_visible(False)
ticks[-1].label.set_visible(False)
ticks = [tick for tick in ax.get_yaxis().get_major_ticks()]
ticks[0].label.set_visible(False)
ticks[-1].label.set_visible(False)
for tick in ax.get_xaxis().get_major_ticks():
tick.label.set_fontsize(10)
tick.label.set_rotation(45)
for tick in ax.get_yaxis().get_major_ticks():
tick.label.set_fontsize(10)
tick.label.set_rotation(45)
if (j,i) not in show_ax:
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
else:
if (j,i) not in show_ax[:len(show_ax)/2]:
ax.get_yaxis().set_visible(False)
if (j,i) not in show_ax[len(show_ax)/2:]:
ax.get_xaxis().set_visible(False)
fig.legend((p_healthy1, p_healthy2, p_gbm), ('Healthy Src 1','Healthy Src 2','GBM'), bbox_to_anchor=(.90, .90), fontsize='12')
fig.suptitle('Bivariate Comparisons', fontsize=20)
fig.savefig("figures/bivariate-comparisons.pdf")
HTML('<div id="rbc">PDF in figures/bivariate-comparisons.pdf</div>')
This plot shows the additional power of combining the signals of the proteins. In the 2-D case, several planes of separation are apparent, as well as some interesting correlative relationships, i.e, strong pos correlation between CD44 and TGFBI.
factor_groups = elisa.groupby(['grp'])
fig = plt.figure( figsize=(12,4), dpi=300 )
##TODO
for p_i, prot in enumerate(proteins):
ax = fig.add_subplot(1,4, p_i + 1)
data = []
labels = []
for src, group in factor_groups:
ax.set_title("%s" % prot, alpha=.5, fontsize=10 )
if src == 0:
label = "Hlthy\n Src 1"
if src == 1:
label = "Hlthy\n Src 2"
if src == 2:
label = "GBM"
labels.append(label)
data.append(elisa[prot][group[prot].index])
bp = ax.boxplot(data)
plt.xticks(range(1,len(labels) +1 ), labels, rotation=-35, alpha=.5, fontsize=10)
#ax.set_ylabel('Protein level %s' % prot)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
if p_i == 0:
ax.set_ylabel('ng/ml', fontsize=10, alpha = .5)
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
for label in ax.yaxis.get_ticklabels():
# label is a Text instance
label.set_alpha(.5)
label.set_rotation(45)
label.set_fontsize(10)
set1 = brewer2mpl.get_map('Set1', 'qualitative', 7).mpl_colors
plt.setp(bp['boxes'], color=set1[1], linewidth=0.3, alpha=.5)
plt.setp(bp['medians'], color=set1[0])
plt.setp(bp['whiskers'], color=set1[1], linestyle='solid', linewidth=0.5)
plt.setp(bp['fliers'], color=set1[1], marker='d', alpha=.5)
plt.setp(bp['caps'], color='w')
fig.subplots_adjust(top=.8, wspace=.7, hspace=.3 )
fig.suptitle("Batch-based Protein Summaries", fontsize=14)
ignore = fig.savefig('figures/batch-box-plot.pdf', bbox_inches='tight')
HTML('<div id="bbp">PDF in figures/batch-box-plot.pdf</div>')
This plot shows some of the batch based effects that we are seeing. In general, these are not particularly strong batch effects, but they certainly exist, particularly in HMOX.
fig = plt.figure( figsize=(10.5,8), dpi=300 )
set1 = brewer2mpl.get_map('Paired', 'qualitative', 12).mpl_colors
colors = [set1[0], set1[1], set1[2], 'red']
for p_i, prot in enumerate(proteins):
ax = fig.add_subplot(2,2, p_i + 1)
for src, group in factor_groups:
ax.set_title( prot )
if src == 0:
label = "Healthy\nSrc %i" % (src +1)
if src == 1:
label = "Healthy\nSrc %i" % (src + 1)
if src == 2:
label = "GBM"
ax.scatter(group['AGE'], group[prot], color=colors[src], label=label)
form = "%s ~ AGE" % ( prot )
result = sm.ols(formula=form, data=elisa ).fit()
prstd, iv_l, iv_u = wls_prediction_std(result)
ax.plot(elisa['AGE'], result.fittedvalues, 'k-', alpha=.5, label=form)
ax.plot(elisa['AGE'], iv_u, 'k--', alpha=.1)
ax.plot(elisa['AGE'], iv_l, 'k--', alpha=.1)
prstd, iv_l, iv_u = wls_prediction_std(result)
handles, labels = ax.get_legend_handles_labels()
#ax.legend(handles, labels, loc='center left', bbox_to_anchor=(1, 0.5))
ax.set_xlabel('Age')
ax.set_ylabel('Protein level')
fig.subplots_adjust( wspace=.2, hspace=.3 )
fig.suptitle('Age based regression')
labels = ['Prot ~ AGE'] + labels[1:]
fig.legend( handles, labels, loc='lower center', framealpha=.5, ncol=4)
fig.savefig('figures/proteins-by-age.pdf', bbox_inches='tight')
HTML('<div id="pba">PDF in figures/proteins-by-age.pdf</div>')
The above plot shows the protein expression by age. This is much better now that our samples are more closely matched. I also think this is a nice way to clearly see the actual data points individually.
clean_columns = proteins # + ['AGE']
ss = sklearn.preprocessing.StandardScaler()
elisa[ proteins ] = ss.fit_transform(elisa[ proteins ].values)
N = len(clean_columns)
fig = plt.figure(figsize=(22,22), dpi=300)
fig.subplots_adjust(top=.9)
fig.subplots_adjust(wspace=0, hspace=0)
show_ax = [(0,i) for i in range(0, N)]
show_ax += [(i,N-1) for i in range(0, N)]
h1_elisa = elisa[ elisa['grp'] == 0 ]
h1_elisa = h1_elisa[clean_columns]
h2_elisa = elisa[ elisa['grp'] == 1 ]
h2_elisa = h2_elisa[clean_columns]
g_elisa = elisa[ elisa['grp'] == 2 ]
g_elisa = g_elisa[clean_columns]
set1 = brewer2mpl.get_map('Paired', 'qualitative', 12).mpl_colors
colors = [set1[0], set1[1], set1[2], 'red']
shapes = ['v','^','o', 'x']
tsp_div = np.linspace(-2,2, num=20)
for i,a in enumerate(clean_columns):
for j,b in enumerate(clean_columns):
ax = fig.add_subplot(N,N,i*N+j+1)
"""
if i != j:
form = "%s ~ %s" % ( a,b )
result = sm.ols(formula=form, data=elisa ).fit()
prstd, iv_l, iv_u = wls_prediction_std(result)
ax.plot(elisa[b], result.fittedvalues, 'k-', alpha=.2)
ax.plot(elisa[b], iv_u, 'k--', alpha=.1)
ax.plot(elisa[b], iv_l,)"""
ax.plot( tsp_div, tsp_div, 'k--', alpha=.1)
p_healthy1 = ax.scatter(h1_elisa[b], h1_elisa[a], color=colors[0], marker=shapes[0], alpha=.9)
p_healthy2 = ax.scatter(h2_elisa[b], h2_elisa[a], color=colors[1], marker=shapes[1],alpha=.9)
p_gbm = ax.scatter(g_elisa[b],g_elisa[a], color=colors[2], marker=shapes[2], alpha=.9)
if i==0:
ax.set_title(clean_columns[j],size='12')
ax.set_ylabel(a,size='12')
ticks = [tick for tick in ax.get_xaxis().get_major_ticks()]
ticks[0].label.set_visible(False)
ticks[-1].label.set_visible(False)
ticks = [tick for tick in ax.get_yaxis().get_major_ticks()]
ticks[0].label.set_visible(False)
ticks[-1].label.set_visible(False)
for tick in ax.get_xaxis().get_major_ticks():
tick.label.set_fontsize(10)
tick.label.set_rotation(45)
for tick in ax.get_yaxis().get_major_ticks():
tick.label.set_fontsize(10)
tick.label.set_rotation(45)
if (j,i) not in show_ax:
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
else:
if (j,i) not in show_ax[:len(show_ax)/2]:
ax.get_yaxis().set_visible(False)
if (j,i) not in show_ax[len(show_ax)/2:]:
ax.get_xaxis().set_visible(False)
fig.legend((p_healthy1, p_healthy2, p_gbm), ('Healthy Src 1','Healthy Src 2','GBM'), bbox_to_anchor=(.90, .90), fontsize='12')
fig.suptitle('Normalized Bivariate Comparisons', fontsize=20)
fig.savefig("figures/normalized-bivariate-comparisons.pdf")
HTML('<div id="nbc">PDF in figures/normalized-bivariate-comparisons.pdf</div>')
The above shows the bivariate comparisons after scaling each proteins expression to std normal. It should be noted that this is happening implicitly or explicity in most of the analysis(i.e. fstat, LDA implicitly, TSP, log reg explicitly).
The diagonal is also the TSP boundary, FWIW.