"""
Original code for sub-challenge B translated from Mukesh Bansal.
Sub-challenge A is currently a wrapper around Perl code provided by Jim Costello.
"""
import os
import math
import random
from dreamtools.core.challenge import Challenge
from easydev import shellcmd
import pandas as pd
import numpy as np
[docs]class D7C4(Challenge):
    """A class dedicated to the D7C4 challenge.

    ::

        from dreamtools import D7C4
        s = D7C4()
        filename = s.download_template('A')
        s.score(filename, 'A')

    The sub-challenge name (``'A'`` or ``'B'``) is a required argument of
    :meth:`download_template`, :meth:`download_goldstandard` and
    :meth:`score`.

    Data and templates are downloaded from Synapse. You must have a login.

    ::

        # columns represent the probabilistic c-index of the given team for
        #   each drug.
        # following the columns of teams are 5 columns which are used for
        #   calculating the overall team score
        # |-> Test_data = the probabilistic c-index for the experimentally
        #   determined test data scored against itself
        # |-> Mean Null Distribution = a set of 10,000 random predictions
        #   were scored to create the null distribution, of which this column
        #   represents the mean
        # |-> SD Null Distribution = a set of 10,000 random predictions
        #   were scored to create the null distribution, of which this column
        #   represents the standard deviation
        # |-> z-score of test data to null = score of the test data minus
        #   the mean of the null distribution divided by the standard
        #   deviation of the null distribution
        # |-> weight of drug (normalized z-score) = the z-score normalized
        #   by the largest z-score across all 31 drugs.
        # to calculate your team overall score, simply multiply the score
        #   of each drug by the corresponding weight, then divide the sum of
        #   these weighted scores by the sum of the weights

    """
def __init__(self, verbose=True, download=True, **kargs):
    """.. rubric:: constructor

    :param verbose: forwarded unchanged to the :class:`Challenge`
        constructor.
    :param download: forwarded unchanged to the :class:`Challenge`
        constructor.
    :param kargs: any other keyword arguments, also forwarded to
        :class:`Challenge`.

    Scoring sub-challenge A runs a Perl script that needs a few extra Perl
    packages. Start by installing the cpanm tool
    (http://search.cpan.org/dist/App-cpanminus/). Under Fedora 23::

        sudo dnf install perl-App-cpanminus

    Then install the required dependencies::

        sudo cpanm install Math::Libm
        sudo cpanm install Algorithm::Pair::Best2
        sudo cpanm install Digest::SHA1
        sudo cpanm install Tk
        sudo cpanm install Games::Go::AGATourn

    Finally, install the Games-Go-GoPair-1.001.tar.gz package stored in the
    dreamtools github repository under dreamtools/dream7/D7C4/misc::

        cd dreamtools/dream7/D7C4/misc
        tar xvfz Games-Go-GoPair-1.001.tar.gz
        cd Games-Go-GoPair-1.001
        perl Makefile.PL
        make
        sudo make install

    """
    # The parent constructor runs first; the list of valid sub-challenge
    # names is assigned afterwards, exactly as before.
    super(D7C4, self).__init__('D7C4', verbose, download, **kargs)
    self.sub_challenges = ['A', 'B']
def _check_subname(self, subname):
    """Validate *subname* against ``self.sub_challenges``.

    The check itself is delegated to ``easydev.check_param_in_list``, which
    is called with the candidate name and the list of accepted names.

    :param subname: sub-challenge name to validate.
    """
    from easydev import check_param_in_list as _ensure_in_list

    valid_names = self.sub_challenges
    _ensure_in_list(subname, valid_names)
def download_template(self, subname):
    """Return the path of the template file of a sub-challenge.

    :param subname: sub-challenge name; it is first validated with
        :meth:`_check_subname`.
    :return: whatever ``self.getpath_template`` returns for the template
        file name of the requested sub-challenge.
    """
    self._check_subname(subname)

    # Choose the file name first, then resolve it with a single call.
    if subname == 'A':
        template_name = 'D7C4_template.csv'
    elif subname == 'B':
        template_name = 'D7C4_template_B.csv'
    return self.getpath_template(template_name)
def score(self, filename, subname):
    """Score a submission file for the requested sub-challenge.

    :param filename: path of the submission file, passed unchanged to the
        sub-challenge scorer.
    :param subname: sub-challenge name; it is first validated with
        :meth:`_check_subname`.
    :return: the value returned by :meth:`score_A` for ``'A'`` or by
        :meth:`score_B` for ``'B'``; ``None`` for any other name that
        passes validation.
    """
    self._check_subname(subname)

    if subname == 'A':
        return self.score_A(filename)
    if subname == 'B':
        return self.score_B(filename)
    return None
def download_goldstandard(self, subname):
    """Return the path of the gold standard file of a sub-challenge.

    For ``'A'`` the path is built with ``self._pj`` from the class path,
    the ``templates`` directory and ``D7C4_template.csv``. For ``'B'`` the
    path comes from ``self.getpath_gs``.

    :param subname: sub-challenge name; it is first validated with
        :meth:`_check_subname`.
    :return: the path of the gold standard file.
    """
    self._check_subname(subname)

    if subname == 'A':
        path_parts = [self.classpath, 'templates', 'D7C4_template.csv']
        gold_path = self._pj(path_parts)
    elif subname == 'B':
        gold_path = self.getpath_gs('D7C4_B_synergy_IC20.tsv')
    return gold_path
def score_A(self, filename):
    """Score a sub-challenge A submission with the bundled Perl script.

    The script ``weighted_average_concordance_index.pl`` is run on
    *filename*; it writes a tab-separated table with one row per drug into
    a temporary file. That table is summarised here, and a p-value is
    derived from the precomputed values stored in
    ``data/DREAM7_DrugSensitivity1_drug_zscores.txt``.

    :param filename: path of the submission file handed to the Perl
        script.
    :return: a dictionary with a single key ``'Results'`` holding a pandas
        Series with the mean ``probabilistic c-index``, the mean
        ``weighted probabilistic c-index``, the
        ``weight average probabilistic c-index`` (sum of the weighted
        c-indices divided by the sum of the z-scores) and its p-value.
    :raises SystemExit: if the output of the Perl script cannot be read,
        e.g. because Perl or one of its packages is missing.
    """
    import sys
    from easydev import TempFile

    script = self._pj([self.classpath,
                       'weighted_average_concordance_index.pl'])
    datadir = self._pj([self.classpath, 'data'])

    fh = TempFile()
    try:
        # NOTE(review): the arguments are interpolated unquoted, so paths
        # containing spaces or shell metacharacters will likely break the
        # command -- confirm how shellcmd executes it.
        cmd = "perl %s %s %s %s" % (script, filename, datadir, fh.name)
        shellcmd(cmd, verbose=True, ignore_errors=True)
        try:
            df = pd.read_csv(fh.name, sep='\t', header=None)
        except Exception:
            print("Something wrong in the Scoring while executing \n %s. " % cmd)
            print("\n The D7C4 challenge requires a Perl package to be installed")
            print("See D7C4 documentation e.g., on dreamtools.readthedocs.org")
            sys.exit(1)
    finally:
        # Remove the temporary file on the failure path as well.
        fh.delete()

    df.columns = ['DrugID', 'probabilistic c-index',
                  'weighted probabilistic c-index', 'zscores']
    df = df.set_index('DrugID')

    # Weighted average: sum of the weighted c-indices over the sum of the
    # z-scores. ``.loc`` replaces the ``.ix`` indexer, which recent pandas
    # no longer provides.
    column_sums = df.sum()
    ws = (column_sums.loc['weighted probabilistic c-index']
          / column_sums.loc['zscores'])

    results = df.mean()
    results['weight average probabilistic c-index'] = ws
    del results['zscores']

    # Finally compute pvalues based on precomputed scores
    precomp = pd.read_csv(self._pj([self.classpath, 'data',
                                    'DREAM7_DrugSensitivity1_drug_zscores.txt']),
                          sep='\t', skiprows=6, header=None)
    # Row 31 is presumably the overall summary row that follows the
    # per-drug rows -- TODO confirm against the data file.
    overall_mean = precomp.loc[31, 1]
    overall_var = precomp.loc[31, 2]
    pval = 1 - (.5 * (math.erf((ws - overall_mean) / (math.sqrt(2 * overall_var))) + 1))
    results['weight average probabilistic c-index p-value'] = pval
    return {'Results': results}
def score_B(self, filename):
    """Score a sub-challenge B submission (ranking of compound pairs).

    The gold standard is sorted by ``Excess over Bliss`` and turned into a
    pairwise probability matrix with :meth:`_probability_matrix`. The
    submitted ranks are aligned with the gold standard on the pair name
    and compared with :meth:`_concordance`. A p-value is estimated from
    10,000 random permutations of the ranks.

    As side effects, the probability matrix is stored in ``self.p_matrix``
    and the merged gold standard / prediction table in ``self.merged``.

    :param filename: comma-separated submission file with two columns
        (compound pair, rank). Compounds of a pair are separated by ``&``.
        Rows after the first N are ignored, N being the number of pairs
        in the gold standard.
    :return: a dictionary with a single key ``'Results'`` holding a pandas
        Series with the entries ``weighted cindex`` and ``pvalue``.
    :raises ValueError: if the submission does not provide exactly one
        rank for every pair of the gold standard.
    """
    gs_filename = self.download_goldstandard('B')
    gold = pd.read_csv(gs_filename, sep='\t')
    gold.columns = [x.strip() for x in gold.columns]

    # Pair identifier used to align the gold standard and the prediction.
    gold['pairs'] = gold['Cmpd A'] + "_" + gold['Cmpd B']

    # Sort by excess over Bliss.
    try:
        gold = gold.sort_values(by=['Excess over Bliss'])
    except AttributeError:
        # Older pandas releases without DataFrame.sort_values.
        gold = gold.sort(columns=['Excess over Bliss'])
    self.p_matrix = self._probability_matrix(gold['Excess over Bliss'],
                                             gold['SEM'])
    N = len(gold)

    # Read the prediction; it has to be comma separated. After the header
    # there is one row per compound pair plus one final row that reports
    # the first pair of the ranked list whose predicted activity is not
    # deemed significantly synergistic (excess over Bliss close to 0).
    prediction = pd.read_csv(filename, sep=',')
    # Rename the columns to agree with the gold standard for the merge.
    prediction.columns = ['pairs', 'Rank']
    # Keep the N pair rows only (skips the final row). The copy makes the
    # assignment below act on an independent frame.
    prediction = prediction.iloc[:N].copy()

    # In the submission the two drugs are in a single column separated by
    # an & sign; use "_" instead, as in the gold standard.
    prediction['pairs'] = [
        "_".join(part.strip() for part in name.split("&"))
        for name in prediction['pairs']
    ]

    # The gold standard is the left table, so its sorted order is kept.
    self.merged = pd.merge(gold, prediction, how='inner', on=['pairs'])
    if len(self.merged) != N:
        raise ValueError(
            "Expected one rank for each of the %s compound pairs of the "
            "gold standard but %s rows matched; check the pair names in %s"
            % (N, len(self.merged), filename))

    ranks = self.merged['Rank'].values.astype(float)
    reference = list(range(N))
    weighted_cindex = self._concordance(ranks, reference, self.p_matrix)

    # Null distribution from random permutations of the ranks.
    Nmax = 10000
    cindex_nulldist = np.zeros(Nmax)
    for i in range(Nmax):
        randx = list(range(N))
        random.shuffle(randx)  # in place
        cindex_nulldist[i] = self._concordance(randx, reference,
                                               self.p_matrix)

    # Fraction of random rankings scoring at least as well.
    pvalue = np.count_nonzero(cindex_nulldist >= weighted_cindex) / float(Nmax)

    results = pd.Series([weighted_cindex, pvalue],
                        index=['weighted cindex', 'pvalue'])
    return {'Results': results}
def _probability_matrix(self, x, x_std):
    """Return the matrix of pairwise ordering probabilities.

    Entry ``[i, j]`` is ``0.5 * (1 + erf(d / s))`` where ``d`` is
    ``x[i] - x[j]`` and ``s`` is
    ``sqrt(x_std[i] ** 2 + x_std[j] ** 2)``.

    :param x: sequence of N values.
    :param x_std: sequence of N spreads, one per value of *x*.
    :return: an N x N numpy array.
    """
    from scipy.special import erf

    count = len(x)

    # Column vectors: broadcasting against their transpose yields the full
    # N x N tables of pairwise differences and pooled spreads.
    value_col = np.array(x).reshape(count, 1)
    differences = value_col - value_col.transpose()

    squared_col = np.array(x_std).reshape(count, 1) ** 2
    pooled = np.sqrt(squared_col + squared_col.transpose())

    return 0.5 * (1 + erf(differences / pooled))
def _concordance(self, x, y, p_matrix):
    """Return the probability-weighted concordance between two rankings.

    Each pair ``(i, j)`` with ``i > j`` contributes
    ``1 - p_matrix[j, i]`` when *x* and *y* order the pair the same way
    (same sign of the difference) and ``p_matrix[j, i]`` otherwise. The
    contributions are summed and divided by ``N * (N - 1) / 2``.

    :param x: sequence of N values (e.g. predicted ranks).
    :param y: sequence of N reference values.
    :param p_matrix: N x N matrix of pairwise probabilities.
    :return: the weighted concordance index.
    """
    size = len(x)

    # Sign of every pairwise difference, via column-vector broadcasting.
    x_col = np.array(x).reshape(size, 1)
    y_col = np.array(y).reshape(size, 1)
    same_order = np.sign(x_col - x_col.transpose()) == np.sign(y_col - y_col.transpose())

    flipped = p_matrix.transpose()
    weights = np.where(same_order, 1 - flipped, flipped)

    # Only the strictly lower triangle counts, i.e. each pair once.
    below_diagonal = np.tril(weights, -1)
    total = sum(sum(below_diagonal))
    return total / float(size) / (size - 1.) * 2