Classification with TSP (kinda)

In order to do TSP with these proteins it is necessary to do standardize each of the proteins.

In [1]:
#Imports used
%matplotlib inline
import pandas
import pandas.io.parsers
from sklearn.lda import LDA
from sklearn.qda import QDA
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pandas.tools.plotting import bootstrap_plot
from matplotlib.mlab import PCA
import matplotlib.pyplot as plt
import sklearn.feature_selection
from itertools import combinations
from scipy import stats
import pylab as P
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
import warnings
import random
from sklearn.metrics import classification_report
import statsmodels.formula.api as sm
import brewer2mpl
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from collections import defaultdict
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from scipy import interp
from sklearn.metrics import accuracy_score
from IPython.display import HTML, display
DEBUG=False
In [2]:
def strat_map(y):
    """
    Returns permuted indices that maintain class
    Used in cross validation
    """
    smap = defaultdict(list)
    for i,v in enumerate(y):
        smap[v].append(i)
    for values in smap.values():
        random.shuffle(values)
    y_map = np.zeros_like(y)
    for i,v in enumerate(y):
        y_map[i] = smap[v].pop()
    return y_map
In [3]:
elisa_raw = pandas.io.parsers.read_csv('data/ELISA.csv')
#last 2 columns have garbage in them
if DEBUG:
    print "Original import"
    print elisa_raw.head()
elisa_raw =  elisa_raw[elisa_raw.columns[:-2]]
if DEBUG:
    print "Remove garbage columns"
    print elisa_raw.head()
elisa_raw.columns = ['name','HMOX1', 'TGFBI', 'VCAM1', 'CD44', 'AGE', 'GNDR', 'DISEASE']
if DEBUG:
    print "Give columns better names"
    print elisa_raw.head()
elisa_raw.set_index('name', inplace=True)
elisa = elisa_raw
#add binary healthy/not healthy vector
labels = map(lambda x: 0 if x[0] == 'H' else 1, elisa.index)
elisa['GBM'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
    print "Add binary column denoting GBM/not GBM"
    print elisa['GBM'].head()
#Add data source as integer vector
def source( name ):
    try:
        label, idt = name.split('-')
    except ValueError as e:
        return 2
    if label == 'Healthy':
        try:
            int(idt)
            return 0
        except ValueError:
            return 1
    raise Exception("Should not get here")
labels = map(source, elisa.index) 
elisa['grp'] = pandas.Series(labels, index=elisa.index)
if DEBUG:
    print "Add integer column denoting data source"
    print elisa.head()
proteins = ['HMOX1', 'TGFBI', 'VCAM1', 'CD44']
In [4]:
Y = np.array([0 if a else 1 for a in  elisa['GBM'] == 0])
y = np.transpose(Y)
X = elisa[proteins].values
ss = sklearn.preprocessing.StandardScaler()
X_new = ss.fit_transform(X)
h_X = X_new[y == 0, :]
g_X = X_new[y == 1, :]
display(HTML('<div id="ct">'))
for a,b in combinations( range(4), 2  ):
    gbm_a =  (g_X[:,a] < g_X[:,b]).sum()
    healthy_a = (h_X[:, a] < h_X[:, b]).sum()
    gbm_b = (g_X[:, a] > g_X[:, b]).sum()
    healthy_b = (h_X[:, a] > h_X[:, b]).sum()
    tbl = [[gbm_a, gbm_b], [healthy_a, healthy_b]]
    odds_ratio, p_value = stats.fisher_exact(tbl)
    df =  pandas.DataFrame(tbl, index=['GBM','Healthy'], columns=['%s < %s' % (proteins[a],proteins[b]) , 
                                                                  '%s > %s' % (proteins[a],proteins[b]) ])
    display(HTML('<div>' + df.to_html() + '</div>'))
    display(HTML("<div>fisher exact test %.2E</div>" % p_value))
display(HTML('</div>'))
HMOX1 < TGFBI HMOX1 > TGFBI
GBM 3 18
Healthy 20 1
fisher exact test 1.26E-07
HMOX1 < VCAM1 HMOX1 > VCAM1
GBM 4 17
Healthy 18 3
fisher exact test 3.21E-05
HMOX1 < CD44 HMOX1 > CD44
GBM 3 18
Healthy 18 3
fisher exact test 6.74E-06
TGFBI < VCAM1 TGFBI > VCAM1
GBM 14 7
Healthy 5 16
fisher exact test 1.22E-02
TGFBI < CD44 TGFBI > CD44
GBM 10 11
Healthy 8 13
fisher exact test 7.56E-01
VCAM1 < CD44 VCAM1 > CD44
GBM 6 15
Healthy 14 7
fisher exact test 2.94E-02
In [5]:
Y = np.array([0 if a else 1 for a in  elisa['GBM'] == 0])
y = np.transpose(Y)
X = elisa[proteins].values
iterations = 100
a = 0
b = 1
DEBUG = False
if DEBUG:
    print "Comparing %s and %s" % (proteins[a],proteins[b])

def innerTSP( X, y, p1, p2 ):
    """
    p1 - index of first protein to compare
    p2 - index of the second protein to compare
    """
    ss = sklearn.preprocessing.StandardScaler()
    X_new = ss.fit_transform(X)
    h_X = X_new[y == 0, :]
    g_X = X_new[y == 1, :]
    a,b = p1,p2
    gbm_a =  (g_X[:,a] < g_X[:,b]).sum()
    healthy_a = (h_X[:, a] < h_X[:, b]).sum()
    gbm_b = (g_X[:, a] > g_X[:, b]).sum()
    healthy_b = (h_X[:, a] > h_X[:, b]).sum()
    tbl = [[gbm_a, gbm_b], [healthy_a, healthy_b]]
    odds_ratio, p_value = stats.fisher_exact(tbl)
    return p_value

pvals = []
def calcPrange(X, y, N, remove,a,b): 
    pvals = []
    for _ in range(N):
        sel = [random.randint(0, len(y)-1) for i in range(len(y)-remove)]
        #sel = [i for i in range(len(y)) if i != rand]
    
        #print sel
        X_sub = X[sel, :]
        y_sub = y[sel]
        #print len(sel)
        #print len(y_sub)
        pvals.append(innerTSP(X_sub, y_sub, a,b))
    res = np.array(pvals)
    res.sort()
    if DEBUG:
        print(("Fisher Exact Test results for %s and %s at %i iterations, "
           "randomly removing %i sample(s) with replacement.") % ( 
            proteins[a], proteins[b], N, len(y) - len(sel))) 
        print("The average p-value %.2E +/- %.2E (95%% CI-analytical)" % (
            res.mean(), 2* res.std()))
        print("Empirical 95%% CI (%.2E, %.2E)" % (
            res[int(len(res) * .025)], res[int(len(res) * .975)]))
    return res
In [6]:
fig = plt.figure(figsize=(8,10), dpi=300)
n_tsp = len([a for a in combinations( range(4), 2  )])
ctr = 0
iterations = 1000
fig.subplots_adjust(hspace = .5, wspace=.4)
for a,b in combinations( range(4), 2  ):
    ctr+=1
    removed = []
    N = iterations
    pv = []
    pmin = []
    pmax = []
    p975= []
    p75 = []
    p25 = []
    p025= []
    
    pvl = []
    pminl = []
    pmaxl = []
    p975l= []
    p75l = []
    p25l = []
    p025l= []
    import brewer2mpl
    set1 = brewer2mpl.get_map('YlOrRd', 'sequential', 6).mpl_colors
    colors = set1[1:]
    warnings.simplefilter("ignore")
    pval_list = []
    with warnings.catch_warnings():
        #get annoying deprecation warning from scipy about pandas, ignore it
        warnings.filterwarnings('ignore', category=DeprecationWarning)
        for i in range(0, 15,2):
            pval_list.append((i, calcPrange(X,y,iterations,i, a,b)))
    
    ax = fig.add_subplot( n_tsp/2,2,ctr )
    for p in pval_list:
        removed.append(p[0])
        pv.append(np.percentile(p[1], 50))
        p025.append(np.percentile(p[1], 2.5))
        p975.append(np.percentile(p[1], 97.5))
        p25.append(np.percentile(p[1], 25))
        p75.append(np.percentile(p[1], 75))
        pmin.append(p[1].min())
        pmax.append(p[1].max())
    
    ax.set_title(('%s v %s') % (proteins[a], proteins[b]))
    sig_level = .05
    #plt.errorbar(removed,pv,yerr=[CIu, CIl],fmt=None, marker=None, mew=0)
    base_size=100
    
    ax50= ax.scatter(removed,pv, label="median", color=colors[2], marker='D', edgecolor='k')
    ax025 = ax.scatter(removed,p025, label="2.5th perc.", color=colors[0],marker='^')
    ax975 = ax.scatter(removed,p975, label="97.5th perc.", color=colors[4], marker='v')
    ax25 =  ax.scatter(removed,p25, label="25th perc.", color=colors[1], marker='s')
    ax75 =  ax.scatter(removed,p75, label="75th perc.", color=colors[3], marker='s')
    axmin = ax.scatter(removed,pmin, label="min",color = 'black', marker='_')
    axmax = ax.scatter(removed,pmax, label="max",color = 'black', marker='_')
    axsig = ax.axhline(y=sig_level,color='red')
    for_legend =(axmin, ax025, ax25, ax50, ax75, ax975, axmax, axsig) 
    
    ax.set_xlabel('k samples removed')
    ax.set_ylabel('p-value (Fisher)')
    ax.semilogy()
fig.legend(for_legend, ("min","2.5th perc.","25th perc.", "median", "75th perc.","97.5th perc.", "max", ".05 sig"),
              fontsize='8', framealpha=.5, ncol=8, loc="lower right")
fig.suptitle("Robustness of pairwise protein comparisons\n Data sampled w/ replacement %i times" % (iterations), fontsize=18)

fig.savefig("figures/TSP-robustness2.pdf")
display(HTML('<div id="rob">PDF in figures/TSP-robustness2.pdf</div>'))
PDF in figures/TSP-robustness2.pdf
In [6]: