Received validation data on July 23, 2014.
Data were munged on July 24, 2014.
The original source file is data/ELISA_Training_Validation_Combined.xlsx (md5:13fe02727dee37d3ba8e69103b01d8a1).
It was modified to be convertible to csv and saved as data/ELISA_Validation_PREP.xlsx (md5:6f3a3c7e3eb4c5eff7ac2f826185774e),
which was then exported to csv as data/ELISA_Validation_PREP.csv (md5:8aca344297b5eea7d9d8ad7b6e0ebed8).
import hashlib
import numpy as np
import pandas.io.parsers
def md5_for_file(f, block_size=2**20):
    """
    Return the hexadecimal MD5 digest of the data read from file object f.

    f is read from its current position until read() returns an empty
    result, at most block_size bytes per call, so the whole file is never
    held in memory at once.

    See: http://stackoverflow.com/questions/1131220/get-md5-hash-of-big-files-in-python
    """
    digest = hashlib.md5()
    chunk = f.read(block_size)
    # An empty read marks end of file and terminates the loop.
    while chunk:
        digest.update(chunk)
        chunk = f.read(block_size)
    return digest.hexdigest()
# Print the MD5 digest of each file in the data-preparation chain (original
# workbook, csv-convertible workbook, exported csv), one line per file, in
# the form "<file name> md5(<hex digest>)".
for data_file in ('ELISA_Training_Validation_Combined.xlsx',
                  'ELISA_Validation_PREP.xlsx',
                  'ELISA_Validation_PREP.csv'):
    # Binary mode so the digest is computed over the raw bytes on disk.
    with open('data/' + data_file, 'rb') as handle:
        # A parenthesised single-argument print is valid on Python 2 and 3;
        # the bare print statement is a SyntaxError on Python 3.
        print("%s md5(%s)" % (data_file, md5_for_file(handle)))
Now the data needs to be put into the format the validation script expects.
# Load the exported validation csv and key its rows by the 'Sample' column.
PREP_val = pandas.io.parsers.read_csv(
    'data/ELISA_Validation_PREP.csv'
).set_index('Sample')
# Bare expression: no effect in a plain script; presumably kept so the table
# is displayed when this is run as a notebook cell.
PREP_val
From 05-Independent Confirmation:
Data needs to be a table indexed by SID with the columns HMOX1, TGFBI, VCAM1 and CD44.
# Derive one SID per row of PREP_val, in row order. The sample name is split
# on '-'; names with a 'Cont' token get an 'H' prefix (presumably control
# samples -- confirm), all others 'G', followed by the last token of the name.
SID = []
for samp_name in PREP_val.index:
    parsed = samp_name.split('-')
    template = 'H%s' if 'Cont' in parsed else 'G%s'
    SID.append(template % parsed[-1])

# (sample name, SID) pairs, in the same order as PREP_val's rows.
Sample_SID_map = list(zip(PREP_val.index, SID))
# Build the table the validation script expects: one row per PREP_val sample,
# indexed by SID, with one float column per protein holding the mean of that
# protein's two replicate measurements.
col_names = ['SID', 'HMOX1', 'TGFBI', 'VCAM1', 'CD44']
proteins = col_names[1:]

# Name mismatch: the PREP columns for VCAM1 are labelled 'VCAM'; every other
# protein uses the same name in both tables.
prep_column_prefix = {'VCAM1': 'VCAM'}

replicate_means = {}
for protein in proteins:
    prot = prep_column_prefix.get(protein, protein)
    replicates = PREP_val[['%s-Replicate-1' % prot, '%s-Replicate-2' % prot]]
    # mean(axis=1) averages only the replicates that are present. The former
    # sum(axis=1)/2 skipped NaN while summing but still divided by two, which
    # halved the value of any sample with a single missing replicate.
    PREP_mean = replicates.mean(axis=1)
    # SID was built in PREP_val row order, so the means are attached by
    # position. Label-based cell assignment would write to every matching row
    # if a sample name or SID occurred more than once.
    replicate_means[protein] = PREP_mean.values

ind_test = pandas.DataFrame(
    replicate_means,
    index=pandas.Index(SID, name='SID'),
    columns=proteins,
)
ind_test
The table above is written to data/ind-test.csv.
# Persist the reformatted table for the validation script; with to_csv's
# default settings the SID index is written out as the first column.
ind_test.to_csv(path_or_buf='data/ind-test.csv')