Source code for dreamtools.dream4.D4C3.scoring

"""D4C3 scoring function

Based on Matlab script available on
https://www.synapse.org/#!Synapse:syn2825304, which is an
original code from  Gustavo A. Stolovitzky and Robert Prill.
"""
import os
from dreamtools.core.challenge import Challenge
import pandas as pd
import numpy as np
import pylab


[docs]class D4C3(Challenge): """A class dedicated to D4C3 challenge :: from dreamtools import D4C3 s = D4C3() filename = s.download_template() s.edge_count = 20 s.score(filename) Data and templates are inside Dreamtools. .. note:: A parameter called cost_per_link is hardcoded for the challenge. It was compute as min {Prediction Score / Edge Count} amongst all submissions. For this scoring function, :attr:`cost_per_link` is set to 0.0827 and may be changed by the user. """ def __init__(self, verbose=True, download=True, edge_count=None, cost_per_link=0.0827, **kargs): """.. rubric:: constructor :param int edge_count: if not provided, a prompt will ask for its value. :param float cost_per_link: a cost """ super(D4C3, self).__init__('D4C3', verbose, download, **kargs) # cost_per_link Was determined empirically from all the teams' submissions # as r = min(prediction_score / edge_count). self.cost_per_link = cost_per_link self.edge_count = edge_count self.species = ['AKT', 'ERK12', 'Ikb', 'JNK12', 'p38', 'HSP27', 'MEK12'] self._load_gold_standard() self._fetch_normalisation()
[docs] def download_goldstandard(self): filename = self._pj([self.classpath, 'goldstandard', 'D4C3_goldstandard.csv']) return filename
def _load_gold_standard(self): filename = self.download_goldstandard() df = pd.read_csv(filename) df.replace('NOT AVAILABLE', np.nan, inplace=True) self.goldstandard = df.copy() pass
[docs] def download_template(self): filename = self._pj([self.classpath, 'templates', 'D4C3_templates.csv']) return filename
def _load_prediction(self, filename): df = pd.read_csv(filename) df.replace('NOT AVAILABLE', np.nan, inplace=True) self.prediction = df.copy() pass
[docs] def score(self, filename): """Compute the score See synapse page for details about the scoring function. """ self._load_prediction(filename) # compute error and pval for each molecular species # some columns have NA that were originally string (NOT AVAILABLE) so # those columns are not float type. We need to cast them to float. # Besides, we care only about species and time 30 T = self.prediction[self.species][self.prediction.Time==30].astype(float) G = self.goldstandard[self.species][self.goldstandard.Time==30].astype(float) self.errors = {} self.x_norm_log = {} self.y_log = {} self.pvals = {} for i, species in enumerate(self.species): m, b, rho = self._fit_line(species) t = T[species] g = G[species] # zero counting correction prevents log(0) x_log = pylab.log10(t + 1.) y_log = pylab.log10(g + 1.) # normalise x_norm_log = m * x_log + b # transform back to original space x_norm_lin = 10 ** x_norm_log y_lin = 10 ** y_log # the same prediction score from DREAM3 numerator = (y_lin - x_norm_lin) ** 2 # cancels +1 correction denominator = 300**2 + (0.08*g) ** 2 error_individual = numerator / denominator # add the individual scores self.errors[species] = error_individual.sum() # load prob density function import scipy.io filename = self._pj([self.classpath, 'data', 'pdf_score_%s.mat' % str(i+1)]) proba = scipy.io.loadmat(filename) X, Y, C = proba['X'][0], proba['Y'][0], proba['C'][0] # save some information self.x_norm_log[species] = x_norm_log.copy() self.y_log[species] = y_log.copy() self.pvals[species] = self._probability(X, Y, self.errors[species]) # cast to list is required in py3 self.prediction_score = -pylab.mean(pylab.log10( list(self.pvals.values()))) if self.edge_count is None: try: input = raw_input except NameError: pass edge_count = int(input("Please enter an edge count values\n" + "It should be number of edges in your network (20 for the template):")) assert edge_count >= 0 else: edge_count = self.edge_count self.overall_score = self.prediction_score - self.cost_per_link * edge_count df = pd.DataFrame() df['Edge_count'] = [edge_count] df['Overall_score'] = [self.overall_score] df['Prediction_score'] = [self.prediction_score] for species in self.species: df[species+'_pvalue'] = [self.pvals[species]] return df
#return a final data dataframe
[docs] def plot(self): """Plots prediction versus gold standard for each species .. plot:: :include-source: :width: 80% from dreamtools import D4C3 s = D4C3() filename = s.download_template() s.edge_count = 20 s.score(filename) s.plot() """ if hasattr(self, 'x_norm_log') is False: print('Call score() method first. Nothing to plot.') return pylab.clf() for i, species in enumerate(self.species): pylab.subplot(4, 2, i+1) pylab.plot(self.x_norm_log[species], self.y_log[species], '.') pylab.grid(True) #pylab.axis('equal') Xlim = np.array(pylab.xlim()) Xlim[1] = max([Xlim[1], self.y_log[species].max()]) Xlim[0]-= 0.5 Xlim[1]+=0.5 pylab.plot(Xlim, Xlim, 'k--') X = self.x_norm_log[species] mask = X.isnull() == False N = mask.sum() Y = self.y_log[species] b, m = np.linalg.lstsq(np.vstack([np.ones(N), X[mask]]).T, Y[mask])[0] ax = pylab.plot(Xlim, np.array(Xlim*m+b), 'g-') pylab.xlim(Xlim) pylab.ylim(Xlim) pylab.title(species) pylab.subplot(4,2,8) pylab.text(.05,.5, '- Prediction (y-axis) versus gold \n' +\ " standard (x-axis).\n" +\ '- Dashed lines shows the y=x best fit') pylab.axis('off') pylab.tight_layout()
def _probability(self, X, Y, x): dx = X[1]-X[0] P = sum(Y[X <= x])*dx return P def _fetch_normalisation(self): # x is training # y is gold filename = self._pj([self.classpath, 'data', 'common_training.csv']) training = pd.read_csv(filename) #filename = self._pj([self.classpath, 'data', 'common_gold.csv']) self.download_goldstandard() goldfile = pd.read_csv(filename) #"""function [m b rho] = fetch_normalization(jj) #%% jj is the index of the molecule (1-7) colnames = self.species self.norm_training = pylab.log10(training[colnames] + 1) self.norm_gold = pylab.log10(goldfile[colnames] + 1) #which one is x/y? #[m b rho] = fit_line(x,y); def _fit_line(self, species): x = self.norm_training[species].dropna().values y = self.norm_gold[species].dropna().values N = len(x) # equivalent to X/y in matlab [solution, residuals, ranks, s] = np.linalg.lstsq(np.vstack([np.ones(N), x]).T, y) b,m = solution rho = np.corrcoef(x,y)[1,0] return m, b, rho