"""D5C3 scoring function
Original matlab code from Gustavo A. Stolovitzky, Robert Prill, Ph.D.
sub challenge B original code in R from A. de la Fuente
"""
import os
import random
from dreamtools.core.challenge import Challenge
import pandas as pd
import numpy as np
from dreamtools.core.rocs import D3D4ROC
__all__ = ["D5C3"]
[docs]class D5C3(Challenge, D3D4ROC):
"""A class dedicated to D5C3 challenge
::
from dreamtools import D5C3
s = D5C3()
filename = s.download_template()
s.score(filename)
Data and templates are downloaded from Synapse. You must have a login.
3 subchallenges (A100, A300, A999) but also 3 others simpler with
B1, B2, B3
For A series, 5 networks are required. For B, 3 are needed.
"""
def __init__(self, verbose=True, download=True, **kargs):
""".. rubric:: constructor
"""
super(D5C3, self).__init__('D5C3', verbose, download, **kargs)
self._init()
self.sub_challenges = ['A100', 'A300', 'A999', 'B']
self.N_pvalues = 100
def _init(self):
# should download files from synapse if required.
# The templates
if self._standalone is True:
return
prefix = 'DREAM5_SysGen'
self._download_data(prefix + 'A100_myteam_Network1.txt', 'syn4561478')
self._download_data(prefix + 'A100_myteam_Network2.txt', 'syn4561482')
self._download_data(prefix + 'A100_myteam_Network3.txt', 'syn4561486')
self._download_data(prefix + 'A100_myteam_Network4.txt', 'syn4561489')
self._download_data(prefix + 'A100_myteam_Network5.txt', 'syn4561493')
# Get goldstandard and unpack zipped files
self._download_data('D5C3_goldstandard.zip', 'syn4561554')
self.unzip('D5C3_goldstandard.zip')
# the probabilities for sub challenge A
self._download_data('D5C3_probabilities.zip', 'syn4562041')
self.unzip('D5C3_probabilities.zip')
def _check_filename(self, filename):
end = filename[-5:]
if end[1] != '.':
raise ValueError("File must end with a suffix of 3 letters e.g. .csv")
end = end[0]
if end not in ['1', '2', '3', '4', '5']:
raise ValueError("File must end with a valid batch value in 1,2,3,4,5")
return end
def _check_subname(self, subname):
if subname not in self.sub_challenges:
raise ValueError("sub-challenge must be in %s" % self.sub_challenges)
def _check_sub_challenge_name(self, name):
assert name in self.sub_challenges
[docs] def download_template(self, subname):
# for B1, B2, B3, returns a single file
# for A100, A300, A999 as well but to indicate the network,
# append _1, _2, ..._5
# if not, the network _1 is returned only
# there is no A300 or a999 template
if subname in ['A300', 'A999']:
print("Warning:: there is no A300 or A999 template per se. " +
"Use the goldstandard instead")
subname = 'A100'
self._check_subname(subname)
if subname == 'B':
filenames = [self._pj([self.classpath, 'templates',
"DREAM5_SysGenB1_your_Predictions.txt" ])]
filenames.append(self._pj([self.classpath, 'templates',
"DREAM5_SysGenB2_your_Predictions.txt" ]))
filenames.append(self._pj([self.classpath, 'templates',
"DREAM5_SysGenB3_your_Predictions.txt" ]))
else:
filenames = []
for i in range(1, 6):
gs_filename = self.get_pathname('DREAM5_SysGen%s_myteam_Network%s.txt' % (subname, i))
filenames.append(gs_filename)
return filenames
[docs] def download_goldstandard(self, subname):
# for B1, B2, B3, returns a single file
# for A100, A300, A999 as well but to indicate the network, append _1, _2, ..._5
# if not, the network _1 is returned only
self._check_subname(subname)
if subname == 'B':
filenames = [self._pj([self.classpath, 'goldstandard',
"DREAM5_SysGenB1_TestPhenotypeData.txt" ])]
filenames.append(self._pj([self.classpath, 'goldstandard',
"DREAM5_SysGenB2_TestPhenotypeData.txt"]))
filenames.append(self._pj([self.classpath, 'goldstandard',
"DREAM5_SysGenB3_TestPhenotypeData.txt"]))
return filenames
else:
filenames = []
for i in range(1, 6):
gs_filename = self.get_pathname('DREAM5_SysGen%s_Edges_Network%s.tsv' % (subname, i))
filenames.append(gs_filename)
return filenames
def _load_network(self, filename):
df = pd.read_csv(filename, header=None, sep='[ \t]', engine='python')
df[0] = df[0].apply(lambda x: x.replace('g', '').replace('G', ''))
df[1] = df[1].apply(lambda x: x.replace('g', '').replace('G', ''))
df = df.astype(float) # imoprtant for later to check for equality
return df
[docs] def score(self, filename, subname):
self._check_subname(subname)
if subname in ['A100', 'A300', 'A999']:
return self._score_challengeA_bunch(filename, subname)
elif subname == 'B':
if isinstance(filename, list) is False:
raise TypeError('Challenge B expects 3 input files B1, B2, B3')
if len(filename)!=3:
raise TypeError('Challenge B expects 3 input files B1, B2, B3')
return self.score_challengeB(filename)
else:
raise ValueError('Sub challenge must be either A or B')
def _score_challengeA_bunch(self, filenames, subname):
from easydev import Progress
pb = Progress(5, 1)
pb.animate(0)
results = []
for i, filename in enumerate(filenames):
res = self.score_challengeA(filename, subname+"_" + str(i+1))
pb.animate(i+1)
results.append(res)
aupr_score = -np.mean(np.log10([x['p_auroc'] for x in results]))
auroc_score = -np.mean(np.log10([x['p_aupr'] for x in results]))
score = (aupr_score + auroc_score)/2.
df = pd.Series()
df['Overall Score'] = score
df['AUPR score (pval)'] = aupr_score
df['AUROC score (pval)'] = aupr_score
for i in range(1, 6):
df['AUPR Net %s' % i] = results[i-1]['aupr']
for i in range(1, 6):
df['AUROC Net %s' % i] = results[i-1]['auroc']
return df
[docs] def score_challengeA(self, filename, subname):
name1, name2 = subname.rsplit("_",1)
goldfile = self.download_goldstandard(name1)[int(name2)-1]
# gold standard edges only
predictionfile = filename
# precomputed probability densities for various metrics
pdffile_aupr = self.get_pathname(name1 + os.sep+ 'Network%s_AUPR.mat' % (name2))
pdffile_auroc = self.get_pathname(name1+os.sep+ 'Network%s_AUROC.mat'% (name2))
# load probability densities
pdf_aupr = self.loadmat(pdffile_aupr)
pdf_auroc = self.loadmat(pdffile_auroc)
self.pdf_auroc = self.loadmat(pdffile_auroc)
self.pdf_aupr = self.loadmat(pdffile_aupr)
# load gold standard
self.gold_edges = self._load_network(goldfile)
# load predictions
self.prediction = self._load_network(predictionfile)
# DISCOVERY
# In principle we could resuse ROCDiscovery class but
# here the pvalues were also computed. let us do it here for now
merged = pd.merge(self.gold_edges, self.prediction,
how='inner', on=[0,1])
self.merged = merged
TPF = len(merged)
# unique species should be 1000
N = len(set(self.gold_edges[0]).union(self.gold_edges[1]))
# positive
Pos = len(self.gold_edges)
# negative
Neg = N*N-N-Pos
# total
Ntot = Pos + Neg
L = len(self.prediction)
discovery = np.zeros(L)
values_gs = [tuple(x) for x in merged[[0,1]].values]
values_pred = [tuple(x) for x in self.prediction[[0,1]].values]
count = 0
for i in range(0, L):
if values_pred[i] in values_gs:
discovery[count] = 1
# else nothing to do (vector is filled with zeros
count += 1
TPL = sum(discovery)
self.discovery = discovery
if L < Ntot:
p = (Pos - TPL) / float(Ntot - L)
else:
p = 0
random_positive_discovery = [p] * (Ntot - L)
random_negative_discovery = [1-p] * (Ntot - L)
# append discovery + random using lists
positive_discovery = np.array(list(discovery)
+ random_positive_discovery)
negative_discovery = np.array(list(1-discovery)
+ random_negative_discovery)
# true positives (false positives) at depth k
TPk = np.cumsum(positive_discovery)
FPk = np.cumsum(negative_discovery)
# metrics
TPR = TPk / float(Pos)
FPR = FPk / float(Neg)
REC = TPR # same thing
PREC = TPk / range(1, Ntot+1)
from dreamtools.core.rocs import ROCBase
roc = ROCBase()
auroc = roc.compute_auc(roc={'tpr':TPR, 'fpr':FPR})
aupr = roc.compute_aupr(roc={'precision':PREC, 'recall':REC})
# normalise by max possible value
aupr /= (1.-1./Pos)
p_aupr = self._probability(pdf_aupr['X'][0], pdf_aupr['Y'][0], aupr)
p_auroc = self._probability(pdf_auroc['X'][0], pdf_auroc['Y'][0],
auroc)
results = {'auroc':auroc, 'aupr':aupr, 'p_auroc':p_auroc,
'p_aupr':p_aupr}
return results
[docs] def score_challengeB(self, filenames):
# Ideally provide 3 filenames but if only 1 is given, try
# to infer the names of the 2 others
cor_pheno1 = []
cor_pheno2 = []
pval_pheno1 = []
pval_pheno2 = []
scores = []
from dreamtools.core.rtools import RTools
rtool = RTools(verboseR=False)
assert len(filenames) == 3, "Must provide 3 files"
self.golds = []
self.preds = []
gold_filenames = self.download_goldstandard('B')
print("Warning: your 3 submissions should be ordered as B1, B2, B3 files")
for tag in [1, 2, 3]:
#assumeing data and gs are sorted in the same way !!
gold = pd.read_csv(gold_filenames[tag-1], sep='[ \t]',
engine='python')
self.golds.append(gold)
#filename = 'DREAM5_SysGenB%s_your_Predictions.txt' % tag
#filename = self._pj([self.classpath, 'data', filename])
filename = filenames[tag-1]
pred1 = pd.read_csv(filename, sep='[ \t]', engine='python')
self.preds.append(pred1)
# correlation gs versus predicted
rtool.session.t = pred1.ix[0].values
rtool.session.g = gold.ix[0].values
rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')")
T1 = rtool.session.results.copy()
rtool.session.t = pred1.ix[1].values
rtool.session.g = gold.ix[1].values
rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')")
T2 = rtool.session.results.copy()
cor_pheno1.append(T1['estimate'])
cor_pheno2.append(T2['estimate'])
pval_pheno1.append(T1['p.value'])
pval_pheno2.append(T2['p.value'])
score = -(np.log(T1['p.value']) + np.log(T2['p.value']))
scores.append(score)
self.corp1 = cor_pheno1
self.corp2 = cor_pheno2
self.pval1 = pval_pheno1
self.pval2 = pval_pheno2
self.scores = scores
# This part now compute the pvalues using random prediction
random_scores = {0:[],1:[],2:[]}
from easydev import Progress
pb = Progress(self.N_pvalues, interval=1)
for ii in range(1, self.N_pvalues):
for tag in [0,1,2]:
#generate random coordinates
coord = random.sample(['RIL%s' % i for i in range(1,31)], 30)
coord2 = random.sample(['RIL%s' % i for i in range(1,31)], 30)
# Obtaining random scores
rtool.session.t = self.preds[tag].ix[0].ix[coord].values
rtool.session.g = self.golds[tag].ix[0].values
rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')")
T1 = rtool.session.results.copy()
rtool.session.t = self.preds[tag].ix[1].ix[coord2].values
rtool.session.g = self.golds[tag].ix[1].values
rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')")
T2 = rtool.session.results.copy()
random_scores[tag].append(-(np.log(T1['p.value']) + np.log(T2['p.value'])))
pb.animate(ii+1)
self.random_scores = random_scores
#Obtaining p-values
pvals = [sum(self.random_scores[k]>= self.scores[k])/float(self.N_pvalues)
for k in [0,1,2]]
self.pvals = pvals
df = pd.DataFrame({'scores':self.scores,
'correlation_phenotype1':cor_pheno1,
'correlation_phenotype2':cor_pheno2,
'pvalues_phenotype1':pval_pheno1,
'pvalues_phenotype2':pval_pheno2,
'pvalues':self.pvals})
df= df.T
df.columns = ['SysGenB1', 'SysGenB2', 'SysGenB3']
return df
def _probability(self, X, Y, x):
dx = X[1] - X[0]
return sum(Y[X >= x]) * dx