#Imports used
%matplotlib inline
import pandas
import pandas.io.parsers
from sklearn.lda import LDA
from sklearn.qda import QDA
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pandas.tools.plotting import bootstrap_plot
from matplotlib.mlab import PCA
import matplotlib.pyplot as plt
import sklearn.feature_selection
from itertools import combinations
from scipy import stats
import pylab as P
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
import warnings
import random
from sklearn.metrics import classification_report
import statsmodels.formula.api as sm
import brewer2mpl
from IPython.display import HTML
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from IPython.display import display_pretty, display_html, display_jpeg, display_png, display_json, display_latex, display_svg
import math
import statsmodels.stats.power as pwr
DEBUG=False
elisa_raw = pandas.io.parsers.read_csv('data/ELISA.csv')
#last 2 columns have garbage in them
if DEBUG:
print "Original import"
print elisa_raw.head()
elisa_raw = elisa_raw[elisa_raw.columns[:-2]]
if DEBUG:
print "Remove garbage columns"
print elisa_raw.head()
elisa_raw.columns = ['name','HMOX1', 'TGFBI', 'VCAM1', 'CD44', 'AGE', 'GNDR', 'DISEASE']
if DEBUG:
print "Give columns better names"
print elisa_raw.head()
elisa_raw.set_index('name', inplace=True)
elisa = elisa_raw
#add binary healthy/not healthy vector
labels = map(lambda x: 0 if x[0] == 'H' else 1, elisa.index)
elisa['GBM'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
print "Add binary column denoting GBM/not GBM"
print elisa['GBM'].head()
#Add data source as integer vector
def source( name ):
try:
label, idt = name.split('-')
except ValueError as e:
return 2
if label == 'Healthy':
try:
int(idt)
return 0
except ValueError:
return 1
raise Exception("Should not get here")
labels = map(source, elisa.index)
elisa['grp'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
print "Add integer column denoting data source"
print elisa.head()
proteins = ['HMOX1', 'TGFBI', 'VCAM1', 'CD44']
GBM = elisa[ ['HMOX1', 'CD44','VCAM1', 'TGFBI'] ][ elisa['GBM'] == 1]
if DEBUG:
GBM
Healthy = elisa[ ['HMOX1', 'CD44','VCAM1', 'TGFBI'] ][ elisa['GBM'] == 0]
if DEBUG:
Healthy
X = elisa[proteins].values
Y = np.array([0 if a else 1 for a in elisa['DISEASE'] == 'Healthy'])
y = np.transpose(Y)
F_statistics, pvalues = sklearn.feature_selection.f_classif(X,Y)
fstat = pandas.DataFrame( np.array([F_statistics, pvalues]), columns=proteins,
index = ['F statistic', 'P-values'])
Healthy_mean = Healthy.mean()
GBM_mean = GBM.mean()
Healthy_std = Healthy.std()
GBM_std = GBM.std()
h_n = Healthy["HMOX1"].count()
g_n = GBM["HMOX1"].count()
Healthy_mean
GBM_mean.index
def pooled_s(std1,std2, n1, n2):
return math.sqrt( ( (n1 - 1)* math.pow(std1,2) + (n2 - 1)* math.pow(std2,2))/(n1 + n2 - 2) )
Cohens_d = GBM_mean - Healthy_mean
for prot in Cohens_d.index:
Cohens_d[prot] = Cohens_d[prot] / pooled_s( GBM_std[prot], Healthy_std[prot], g_n, h_n )
if DEBUG:
print Cohens_d
colormap = plt.cm.Dark2
plt_alpha = 1 #0.7
lw = 2
fig = plt.figure(figsize=(8.5,11))
fig.subplots_adjust( wspace=.17, hspace=.3 )
for pri, prot in enumerate(Cohens_d.index):
ax = fig.add_subplot(2,2,pri + 1)
alphas = np.logspace(-1, -10, num=10, endpoint=True, base=10.0)
f = fstat[prot]['F statistic']
p = fstat[prot]['P-values']
alphas[0] = .05
colors = [colormap(i) for i in np.linspace(0, 0.9, len(alphas))]
for ii, alpha in enumerate(alphas):
nobs_list = []
power = []
labels = []
for nobs in range(3,40):
nobs_list.append( nobs )
power.append(pwr.FTestAnovaPower().power(effect_size = abs(Cohens_d[prot]), nobs=nobs, alpha=alpha))
ax.plot(nobs_list, power, label="%4.0E" % (alpha), alpha=plt_alpha, color=colors[ii],)
alpha = fstat[prot]['P-values']
nobs_list = []
power = []
labels = []
for nobs in range(3,40):
nobs_list.append( nobs )
power.append(pwr.FTestAnovaPower().power(effect_size = abs(Cohens_d[prot]), nobs=nobs, alpha=alpha))
ax.plot(nobs_list, power, label="%.2E\n (current)" % (alpha), alpha=plt_alpha, color='red', ls='--')
handles, labels = ax.get_legend_handles_labels()
#ax.legend(handles, labels, loc='lower right', title='P-value', framealpha=.9, ncol=3)
ax.set_title('%s (effect size %4.2F)' % ( prot, Cohens_d[prot] ) )
ax.set_ylabel('Power') #(Prob. of correctly\n rejecting a false null)')
ax.set_xlabel('Number of paired observations')
fig.legend(handles, labels[:-1] + ['current'], loc='lower right', title='P-value', framealpha=.9, ncol=6)
fig.savefig('figures/power-analysis.pdf' )
plt.show()