In [1]:
#Imports used
%matplotlib inline
import pandas
import pandas.io.parsers
from sklearn.lda import LDA
from sklearn.qda import QDA
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pandas.tools.plotting import bootstrap_plot
from matplotlib.mlab import PCA
import matplotlib.pyplot as plt
import sklearn.feature_selection
from itertools import combinations
from scipy import stats
import pylab as P
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
import warnings
import random
from sklearn.metrics import classification_report
import statsmodels.formula.api as sm
import brewer2mpl
from IPython.display import HTML
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from IPython.display import display_pretty, display_html, display_jpeg, display_png, display_json, display_latex, display_svg
import math
import statsmodels.stats.power as pwr

DEBUG=False
In [2]:
elisa_raw = pandas.io.parsers.read_csv('data/ELISA.csv')
#last 2 columns have garbage in them
if DEBUG:
    print "Original import"
    print elisa_raw.head()
elisa_raw =  elisa_raw[elisa_raw.columns[:-2]]
if DEBUG:
    print "Remove garbage columns"
    print elisa_raw.head()
elisa_raw.columns = ['name','HMOX1', 'TGFBI', 'VCAM1', 'CD44', 'AGE', 'GNDR', 'DISEASE']
if DEBUG:
    print "Give columns better names"
    print elisa_raw.head()
elisa_raw.set_index('name', inplace=True)
elisa = elisa_raw
#add binary healthy/not healthy vector
labels = map(lambda x: 0 if x[0] == 'H' else 1, elisa.index)
elisa['GBM'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
    print "Add binary column denoting GBM/not GBM"
    print elisa['GBM'].head()
#Add data source as integer vector
def source( name ):
    try:
        label, idt = name.split('-')
    except ValueError as e:
        return 2
    if label == 'Healthy':
        try:
            int(idt)
            return 0
        except ValueError:
            return 1
    raise Exception("Should not get here")
labels = map(source, elisa.index) 
elisa['grp'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
    print "Add integer column denoting data source"
    print elisa.head()
proteins = ['HMOX1', 'TGFBI', 'VCAM1', 'CD44']
In [3]:
GBM = elisa[ ['HMOX1', 'CD44','VCAM1', 'TGFBI'] ][ elisa['GBM'] == 1]
if DEBUG:
    GBM
In [4]:
Healthy = elisa[ ['HMOX1', 'CD44','VCAM1', 'TGFBI'] ][ elisa['GBM'] == 0]
if DEBUG:
    Healthy
    
In [5]:
X = elisa[proteins].values
Y = np.array([0 if a else 1 for a in  elisa['DISEASE'] == 'Healthy'])
y = np.transpose(Y)
F_statistics, pvalues = sklearn.feature_selection.f_classif(X,Y)
fstat = pandas.DataFrame( np.array([F_statistics, pvalues]), columns=proteins,
                      index = ['F statistic', 'P-values'])
In [6]:
Healthy_mean = Healthy.mean()
GBM_mean = GBM.mean()
Healthy_std = Healthy.std()
GBM_std = GBM.std()
h_n = Healthy["HMOX1"].count()
g_n = GBM["HMOX1"].count()
Healthy_mean
GBM_mean.index
def pooled_s(std1,std2, n1, n2):
    return math.sqrt( ( (n1 - 1)* math.pow(std1,2) + (n2 - 1)* math.pow(std2,2))/(n1 + n2 - 2)   )
Cohens_d = GBM_mean - Healthy_mean 
for prot in Cohens_d.index:
    Cohens_d[prot] = Cohens_d[prot] / pooled_s( GBM_std[prot], Healthy_std[prot], g_n, h_n )
if DEBUG:
    print Cohens_d
In [7]:
colormap = plt.cm.Dark2
plt_alpha = 1 #0.7
lw = 2
fig = plt.figure(figsize=(8.5,11))
fig.subplots_adjust( wspace=.17, hspace=.3 )
for pri, prot in enumerate(Cohens_d.index):
    ax = fig.add_subplot(2,2,pri + 1)
    alphas = np.logspace(-1, -10, num=10, endpoint=True, base=10.0)
    f = fstat[prot]['F statistic']
    p = fstat[prot]['P-values']
    alphas[0] = .05

    colors = [colormap(i) for i in np.linspace(0, 0.9, len(alphas))]
    for ii, alpha in enumerate(alphas):
        nobs_list = []
        power = []
        labels  = []
        for nobs in range(3,40):
            
            nobs_list.append( nobs )
            power.append(pwr.FTestAnovaPower().power(effect_size = abs(Cohens_d[prot]), nobs=nobs, alpha=alpha))
        ax.plot(nobs_list, power, label="%4.0E" % (alpha), alpha=plt_alpha, color=colors[ii],)
    alpha = fstat[prot]['P-values']
    nobs_list = []
    power = []
    labels  = []
    for nobs in range(3,40):    
        nobs_list.append( nobs )
        power.append(pwr.FTestAnovaPower().power(effect_size = abs(Cohens_d[prot]), nobs=nobs, alpha=alpha))
    ax.plot(nobs_list, power, label="%.2E\n (current)" % (alpha), alpha=plt_alpha, color='red', ls='--')

    handles, labels = ax.get_legend_handles_labels()
    #ax.legend(handles, labels, loc='lower right', title='P-value', framealpha=.9, ncol=3)
    ax.set_title('%s (effect size %4.2F)' % ( prot, Cohens_d[prot] ) )
    ax.set_ylabel('Power') #(Prob. of correctly\n rejecting a false null)')
    ax.set_xlabel('Number of paired observations')
fig.legend(handles, labels[:-1] + ['current'], loc='lower right', title='P-value', framealpha=.9, ncol=6)
fig.savefig('figures/power-analysis.pdf' )

plt.show()
In [7]: