Source code for dreamtools.dream7.D7C1.scoring

# -*- python -*-
#
#  This file is part of DREAMTools software
#
#  Copyright (c) 2014-2015 - EBI-EMBL
#
#  File author(s): Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: http://github.org/dreamtools
#
##############################################################################
"""DREAM7 Challenge 1 (Parameter estimation and network topology prediction)

:References:
    * http://dreamchallenges.org/project-list/dream7-2012/
    * https://www.synapse.org/#!Synapse:syn2821735/wiki/

:Publications: http://www.biomedcentral.com/1752-0509/8/13/abstract

"""
import os
import glob

import pandas as pd
import numpy as np

from dreamtools.core.challenge import Challenge


__all__ = ['D7C1']


class D7C1(Challenge):
    """DREAM 7 - Network Topology and Parameter Inference Challenge

    Here is a quick example on calling the scoring methods::

        from dreamtools import D7C1
        s = D7C1()
        s.score_model1_timecourse(filename)
        s.score_model1_parameters(filename)
        s.score_topology(filename)

    This class provides 3 main scoring functions:

    #. :meth:`score_topology`
    #. :meth:`score_model1_timecourse`
    #. :meth:`score_model1_parameters`

    Each takes as input a valid submission as described on the official
    `synapse page <https://www.synapse.org/#!Synapse:syn2821735/wiki/>`_.
    Templates are also provided within the source code on
    `github dreamtools <https://github.com/dreamtools>`_ in the directory
    dreamtools/dream7/D7C1/templates.

    The D7C1 scoring functions are also included in the standalone
    application **dreamtools-scoring**.

    For the details of the scoring functions, please refer to the paper (see
    module documentation). Some details are also provided in the methods
    themselves.

    There are other methods (starting with leaderboard) that should not be
    used. Those are draft versions used to compute p-values and report scores
    as in the final leaderboard.

    .. note:: the scoring functions were implemented following Pablo Meyer's
        matlab code **score_dream7_c1s1.m**

    For admin only: put the submissions in the ./submissions/ directory and
    call the :meth:`leaderboard` method.
    """
    def __init__(self, verbose=True, download=True, path='submissions',
                 **kargs):
        """.. rubric:: constructor

        :param path: path to a directory containing submissions
        :return:
        """
        Challenge.__init__(self, 'D7C1', verbose, download, **kargs)
        self.sub_challenges = ['parameter', 'topology', 'timecourse']

        self.path = path
        teams = glob.glob(path + os.sep + '*')
        self.teams = [this for this in teams if os.path.isdir(this) is True]
        self.N = len(self.teams)

        self.distances = np.zeros((self.N, 4))
        self.pvalues = np.zeros((self.N, 4))
        self.scores = {}

        self._load_gold_standard()
        #self.load_submissions()
        #self.compute_score_distance_model1()
        #self.compute_score_parameter_prediction_model1()
        #self.compute_score_topology()

        # data structure to store null distances
        self.rdistance_pred1 = []
        self.rdistance_param1 = []

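    # Admin-mode usage sketch (assumes a ./submissions/ directory laid out as
    # described in :meth:`load_submissions`): score every stored submission
    # at once and get one row per team::
    #
    #     s = D7C1(path='submissions')
    #     df = s.leaderboard()
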
    def download_template(self, name):
        """Return the filename of a template

        :param str name: one of 'topology', 'parameter', 'timecourse'
        """
        if name == 'parameter':
            return self.getpath_template('model1_parameters_alphabeta.txt')
        elif name == 'topology':
            return self.getpath_template('network_topology_alphabeta.txt')
        elif name == 'timecourse':
            return self.getpath_template('model1_timecourse_alphabeta.txt')
        else:
            raise ValueError("Incorrect challenge name. Use one of %s" %
                             self.sub_challenges)

    def load_submissions(self):
        """Load a bunch of submissions to be found in the submissions directory

        The directory name is defined in :attr:`path`.

        :return: nothing. Populates :attr:`data` attribute and
            :attr:`team_names`.
        """
        tag_param1 = 'dream7_netparinf_parameters_model_1'
        tag_param2 = 'dream7_netparinf_parameters_model_2'
        tag_pred1 = 'dream7_netparinf_timecourse_model_1'
        tag_topo2 = 'dream7_netparinf_networktopo_model_2'

        self.data = {}
        self.data['param1'] = {}
        self.data['param2'] = {}
        self.data['pred1'] = {}
        self.data['topo2'] = {}

        self.team_names = []
        for team in self.teams:
            team_name = team.split(os.sep)[1]
            self.team_names.append(team_name)

            filename = team + os.sep + tag_param1 + '_' + team_name + '.txt'
            self.data['param1'][team_name] = self._read_df(filename, 'param')

            filename = team + os.sep + tag_param2 + '_' + team_name + '.txt'
            self.data['param2'][team_name] = self._read_df(filename, 'param')

            filename = team + os.sep + tag_pred1 + '_' + team_name + '.txt'
            self.data['pred1'][team_name] = self._read_df(filename, 'pred')

            filename = team + os.sep + tag_topo2 + '_' + team_name + '.txt'
            self.data['topo2'][team_name] = self._read_df(filename, 'topo')

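    # Expected on-disk layout for the admin mode (an assumption inferred from
    # the filename tags in :meth:`load_submissions`), e.g. for a team called
    # "alphabeta"::
    #
    #     submissions/alphabeta/dream7_netparinf_parameters_model_1_alphabeta.txt
    #     submissions/alphabeta/dream7_netparinf_parameters_model_2_alphabeta.txt
    #     submissions/alphabeta/dream7_netparinf_timecourse_model_1_alphabeta.txt
    #     submissions/alphabeta/dream7_netparinf_networktopo_model_2_alphabeta.txt
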
    def _read_df(self, filename, mode, sep=r'\s+'):

        def error_message(msg):
            txt = 'Error while parsing %s\n' % filename
            txt += msg
            return txt

        if mode == 'param':
            df = pd.read_csv(filename, sep=sep, index_col=0, header=None,
                             names=['values'])
        elif mode == 'pred':
            df = pd.read_csv(filename, sep=sep, index_col=0)
            assert all(df.columns == ['p3', 'p5', 'p8']), \
                error_message('Header columns must be p3 p5 p8. Found %s'
                              % df.columns)
        elif mode == 'topo':
            df = pd.read_csv(filename, sep=sep, index_col=None, header=None,
                             names=['regulator', 'sign1', 'g1', 'sign2', 'g2'])
        return df

    ###########################################################
    # standalone scoring functions

    def score(self, filename, subname=None):
        """Return the score for a given sub-challenge

        :param str filename: input filename.
        :param str subname: name of a sub-challenge. See the
            :attr:`sub_challenges` attribute.
        :return: dictionary with the score, p-value and overall score.
        """
        if subname == 'parameter':
            score = self.score_model1_parameters(filename)
            pvalue = self.get_pvalues_parameter(score)
            over = -np.log10(pvalue)
            return {'score': score, 'pvalue': pvalue, 'overallScore': over}
        elif subname == 'timecourse':
            score = self.score_model1_timecourse(filename)
            pvalue = self.get_pvalues_timecourse(score)
            over = -np.log10(pvalue)
            return {'score': score, 'pvalue': pvalue, 'overallScore': over}
        elif subname == 'topology':
            print("Note: the null distribution used to compute the p-value "
                  "and overall score is stochastic but should be close to "
                  "the one used in the official LB.\n")
            score = self.score_topology(filename)
            pvalue = self.get_pvalues_topology(score)
            over = -np.log10(pvalue)
            return {'score': score, 'pvalue': pvalue, 'overallScore': over}
        else:
            raise ValueError("Incorrect challenge name. Use one of %s" %
                             self.sub_challenges)

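    # Usage sketch (assumes the template files shipped with DREAMTools can be
    # downloaded): any sub-challenge can be scored through the generic
    # :meth:`score` entry point above::
    #
    #     s = D7C1()
    #     results = s.score(s.download_template('parameter'), subname='parameter')
    #     # results is a dictionary with 'score', 'pvalue' and 'overallScore' keys
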
    def score_model1_parameters(self, filename):
        r"""Return distance between a submission and the gold standard for
        the parameter challenge (model1)

        :param filename: a valid submission (see templates)
        :return: score (distance)

        ::

            >>> from dreamtools import D7C1
            >>> s = D7C1()
            >>> filename = s.download_template('parameter')
            >>> s.score_model1_parameters(filename)
            0.022867555017785129

        The score is the mean, over all parameters, of the squared log10
        ratio between the user prediction :math:`X` and the gold standard:

        .. math::

            S = \overline{\left(\log_{10}
                \frac{X}{X_{\rm{gold\;standard}}}\right)^2}
        """
        data = self._read_df(filename, mode='param')
        distance = self._compute_score_model1_parameters(data)
        return distance

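    # Minimal sketch (not used by the class) of the parameter metric above,
    # assuming `x` is the predicted parameter vector and `x_gs` the gold
    # standard, both strictly positive numpy arrays of the same length::
    #
    #     import numpy as np
    #     def parameter_distance_sketch(x, x_gs):
    #         # mean of the squared log10 ratio; 0 means a perfect prediction
    #         return np.mean(np.log10(x / x_gs) ** 2)
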
    def score_model1_timecourse(self, filename):
        r"""Return distance between a prediction and the gold standard (model1)

        :param filename: a valid submission (see templates)
        :return: score (distance)

        ::

            >>> from dreamtools import D7C1
            >>> s = D7C1()
            >>> filename = s.download_template('timecourse')
            >>> s.score_model1_timecourse(filename)
            0.0024383612676804048

        There are 3 time courses to be predicted. The score of each point in
        a time course is

        .. math::

            S_i = \frac{(X_i - \hat{X_i})^2}{0.01 + 0.04 X_i^2}

        where :math:`X` is the gold standard and :math:`\hat{X}` the
        prediction. The final score is the average of :math:`S_i` over the
        time points of the 3 time courses.
        """
        data = self._read_df(filename, mode='pred')
        distance = self._compute_score_timecourse_model1(data, 10, 39)
        return distance

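    # Minimal sketch (not used by the class) of the timecourse metric above,
    # assuming `x_gs` and `x_hat` are numpy arrays of shape (time points, 3)
    # holding the gold standard and the predicted time courses::
    #
    #     import numpy as np
    #     def timecourse_distance_sketch(x_gs, x_hat):
    #         # normalised squared error, averaged over points and time courses
    #         chi2 = (x_gs - x_hat) ** 2 / (0.01 + 0.04 * x_gs ** 2)
    #         return chi2.mean()
    #
    # Note that the class additionally restricts the time points to indices
    # 10 to 39 (see :meth:`_compute_score_timecourse_model1`).
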
    def score_topology(self, filename):
        """Standalone version of the network topology scoring

        :param str filename: a valid submission (see templates)

        ::

            >>> from dreamtools import D7C1
            >>> s = D7C1()
            >>> filename = s.download_template('topology')
            >>> s.score_topology(filename)
            12

        :scoring details: The challenge requests predictions for 3 missing
            links. Knowing that a gene can regulate up to two genes when they
            are in the same operon, 6 gene interactions have to be indicated
            by the participants (3 links * 2 genes), as well as whether these
            interactions are activating (+) or repressing (-).

            For each of the predicted links i=1,2,3, we define a score
            :math:`S_i^{link} = L_i + N_i` where :math:`L_i` is 6 if the
            nature of the regulation is correct (that is, the source gene,
            the sign of the connection, and the destination gene are all
            correct) and :math:`L_i = 12` if the link regulates an operon
            composed of two genes and both connections are correct. If
            :math:`L_i > 0` then :math:`N_i = 0`.

            In case a link is NOT correctly predicted (:math:`L_i = 0`),
            :math:`N_i` is defined as follows: it is increased by 1 for each
            correctly regulated gene, by 2 if the regulated gene and the
            nature of the regulation (i.e. +/-) are correct, and by 1 if the
            regulator gene is correct.

            The gold standard contains 3 lines similar to::

                5 + 7 + 11

            It means gene 5 positively regulates gene 7 and gene 11. If a
            prediction is::

                5 + 7 + 2

            then L = 6. If the prediction is::

                2 + 7 + 2

            then L = 0, so N may be updated. Here the regulator (2) is not
            correct; however, one gene (7) is correctly predicted with the
            good sign, so N = 2.
        """
        data = self._read_df(filename, mode='topo')
        distance = self._compute_score_topology(data)
        return distance

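    # End-to-end sketch with hypothetical link values (made up for
    # illustration): a topology submission is a plain text file with one line
    # per predicted link, formatted as "regulator sign1 g1 sign2 g2", where a
    # regulated gene set to 0 means no second gene is regulated::
    #
    #     with open('my_topology.txt', 'w') as fh:
    #         fh.write("5 + 7 + 11\n")
    #         fh.write("1 - 2 + 0\n")
    #         fh.write("9 + 4 - 0\n")
    #     score = D7C1().score_topology('my_topology.txt')
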
    ###########################################################
    # Load gold standard files

    def download_goldstandard(self, name):
        if name == 'parameter':
            return self._get_gs('model1_parameters_answer.txt')
        elif name == 'timecourse':
            return self._get_gs('model1_prediction_answer.txt')
        elif name == 'topology':
            return self._get_gs('model2_topology_answer.txt')
        else:
            raise ValueError("Incorrect challenge name. Use one of %s" %
                             self.sub_challenges)

    def _get_gs(self, filename):
        filename = self.getpath_gs(filename)
        return filename

    def _load_gold_standard(self):
        self.gs = {}
        self.gs['param1'] = self._read_df(
            self._get_gs("model1_parameters_answer.txt"), mode='param')
        self.gs['param2'] = self._read_df(
            self._get_gs("model2_parameters_answer.txt"), mode='param')
        self.gs['pred1'] = self._read_df(
            self._get_gs("model1_prediction_answer.txt"), mode='pred')
        self.gs['topo2'] = self._read_df(
            self._get_gs("model2_topology_answer.txt"), mode='topo')

    ###########################################################
    # compute all submissions from the challenge

    def leaderboard_compute_score_topology(self):
        """Computes all scores (topology) for loaded submissions

        For the metric, see :meth:`score_topology`.

        :return: fills :attr:`scores`.

        .. seealso:: :meth:`load_submissions`
        """
        scores = {}
        for team in self.team_names:
            data = self.data['topo2'][team]
            score = self._compute_score_topology(data, team=team)
            scores[team] = score

        df = pd.TimeSeries(scores)
        df = pd.DataFrame({'scores': df, 'rank': df.rank()})
        self.scores['topo2'] = df.sort(columns='rank')

    def leaderboard_compute_score_parameters_model1(self):
        """Computes all scores (parameters model1)

        :return: Nothing but fills :attr:`scores`.

        For the metric, see :meth:`score_model1_parameters`.

        .. seealso:: :meth:`load_submissions`
        """
        # parameter model1
        scores = {}
        for team in self.team_names:
            data = self.data['param1'][team]
            score = self._compute_score_model1_parameters(data)
            scores[team] = score

        df = pd.TimeSeries(scores)
        df = pd.DataFrame({'scores': df, 'rank': df.rank()})
        self.scores['param1'] = df.sort(columns='rank')

    def leaderboard_compute_score_timecourse_model1(self, startindex=10,
                                                    endindex=39):
        """Computes all scores (timecourse model1)

        :return: Nothing but fills :attr:`scores`.

        For the metric, see :meth:`score_model1_timecourse`.

        Note that *endindex* is set to 39 so that the last value at time=20
        is not taken into account. This is to be in agreement with the
        implementation used for the final leaderboard
        https://www.synapse.org/#!Synapse:syn2821735/wiki/71062

        If you want to take the final point into account, set *endindex*
        to 40.
        """
        scores = {}
        for team in self.team_names:
            data = self.data['pred1'][team]
            scores[team] = self._compute_score_timecourse_model1(
                data, startindex, endindex)

        self.scores['pred1'] = pd.TimeSeries(scores)
        self.scores['pred1'].sort()
        self.scores['pred1'] = self.scores['pred1'].to_frame()
        self.scores['pred1'].columns = ['scores']

    def _compute_score_model1_parameters(self, data):
        diff = data / self.gs['param1']
        diff = (np.log10(diff)**2).mean()
        score = diff.values[0]  # should be a single float
        return score

    def _compute_score_timecourse_model1(self, data, startindex, endindex):
        d1 = (self.gs['pred1'] - data) ** 2
        d1 /= (0.01 + 0.04 * self.gs['pred1'].values**2)

        # let us ignore the first 10 points
        # faster to use numpy arrays and indices
        data = d1.values[startindex:endindex+1, :]
        N = endindex - startindex + 1.
        distance = np.sum(data) / (3 * N)  # normalisation
        return distance

    ###########################################################
    # NULL distributions (draft, do not use)

    def _get_random_parameters_model1(self, N=10000):
        """Null distribution for the model1 parameters

        :param N: number of random samples
        :return: a dataframe with null distributions

        In the official challenge, 12 submissions were provided. Here, we use
        only the 9 best submissions, as in the paper.

        .. note:: submissions are loaded and scored before creating the null
            distributions.
        """
        self.load_submissions()
        self.leaderboard_compute_score_parameters_model1()

        # select only the best 9 as in the paper
        Nbest = 9
        best_teams = list(self.scores['param1'].ix[0:Nbest].index)

        # create a dataframe to hold all teams
        p = pd.DataFrame(dict([(key, self.data['param1'][key].T.values[0])
                               for key in best_teams]))

        nulls = []
        for k in xrange(0, 45):
            null = p.ix[k][np.random.randint(0, Nbest, N)]
            nulls.append(null)

        indexnames = list(self.gs['param1'].index)
        df = pd.DataFrame(dict([(name, nulls[i].values)
                                for i, name in enumerate(indexnames)]))
        df = df[self.gs['param1'].index]
        return df

    def get_null_parameters_model1(self, N=10000):
        """Returns score distribution (parameter model1)"""
        df = self._get_random_parameters_model1(N=N)
        distances = []
        from easydev import progress_bar
        pb = progress_bar(N)
        for i in xrange(0, N):
            df1 = df.ix[i].to_frame(name='values')
            distance = self._compute_score_model1_parameters(df1)
            distances.append(distance)
            pb.animate(i+1)
        return distances

    def get_pvalues_parameter(self, score):
        filename = self._pj([self.classpath, 'data', 'D7C1_param_proba.npy'])
        data = np.load(filename)
        X = data[:, 0]
        Y = data[:, 1]
        return sum(Y[X < score]) * (X[2] - X[1])

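    # The .npy file above appears to store a tabulated null density on a
    # regular grid of scores: column 0 is the grid X, column 1 the density Y.
    # The p-value is then the left-tail integral of that density (smaller
    # distances are better), approximated by a Riemann sum. Equivalent sketch
    # with such hypothetical arrays and a uniform grid spacing::
    #
    #     dx = X[1] - X[0]
    #     pvalue = Y[X < score].sum() * dx
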
    def _get_random_timecourse_model1(self, N=10000):
        """Null distributions for the model1 timecourse challenge

        :param N: number of random models
        :return: an array with the null distribution
        """
        # In the official LB, 12 submissions were provided. Here, we use only
        # the best submissions (Nbest below), as in the paper.
        self.load_submissions()
        self.leaderboard_compute_score_timecourse_model1()
        Nbest = 11
        best_teams = list(self.scores['pred1'].ix[0:Nbest].index)

        # data mangling to extract random values easily
        p3 = pd.DataFrame(dict([(key, self.data['pred1'][key]['p3'])
                                for key in best_teams]))
        p5 = pd.DataFrame(dict([(key, self.data['pred1'][key]['p5'])
                                for key in best_teams]))
        p8 = pd.DataFrame(dict([(key, self.data['pred1'][key]['p8'])
                                for key in best_teams]))

        df = self.gs['pred1'].copy()
        data = np.zeros((41, 3, N))
        for ik, k in enumerate(list(self.gs['pred1'].index)):
            data[ik, 0] = p3.ix[k][np.random.randint(0, Nbest, N)]
            data[ik, 1] = p5.ix[k][np.random.randint(0, Nbest, N)]
            data[ik, 2] = p8.ix[k][np.random.randint(0, Nbest, N)]
        return data

    def get_null_timecourse_model1(self, N=10000):
        data = self._get_random_timecourse_model1(N=N)
        distances = []
        from easydev import progress_bar
        pb = progress_bar(N)
        for i in xrange(0, N):
            df = data[:, :, i]
            # FIXME: those values 10, 39 should not be hardcoded
            distance = self._compute_score_timecourse_model1(df, 10, 39)
            distances.append(distance)
            pb.animate(i)
        return distances

    def get_pvalues_timecourse(self, score):
        # The code that computes the p-values does not give the same results
        # as in the LB, so for now we use the LB scores and p-values and
        # interpolate.
        scores = np.array([0.002438361, 0.016023721, 0.035404398, 0.047495432,
                           0.09791128, 0.198785197, 0.356429217, 0.362463945,
                           0.817972877, 3.222767988, 14.77443631, 19.32326868])
        pvalues = np.array([1.21E-25, 3.39E-18, 4.45E-15, 6.28E-14, 4.01E-11,
                            1.93E-08, 2.53E-06, 2.90E-06, 1.34E-03, 6.90E-01,
                            1.00E+00, 1.00E+00])
        return np.interp(score, scores, pvalues)

    def _leaderboard_compute_overall_score(self, N=100):
        """Based on NULL distribution, compute overall score of model1

        Not finalised.
        """
        self._compute_pvalues_pred1(N=N)
        self._compute_pvalues_param1(N=N)

        import fitter
        fit_param1 = fitter.Fitter(self.rdistance_param1)
        fit_param1.distributions = ['beta']
        fit_param1.fit()
        fit_pred1 = fitter.Fitter(self.rdistance_pred1)
        fit_pred1.distributions = ['beta']
        fit_pred1.fit()

        import scipy.stats
        self.pvalues_param1 = scipy.stats.beta.cdf(
            self.scores['param1'].scores, *fit_param1.fitted_param['beta'])
        self.pvalues_pred1 = scipy.stats.beta.cdf(
            self.scores['pred1'].scores, *fit_pred1.fitted_param['beta'])

        self.scores['pred1']['pvalues'] = self.pvalues_pred1
        self.scores['param1']['pvalues'] = self.pvalues_param1

    def leaderboard(self):
        """Computes all scores for all submissions and returns a dataframe

        :return: dataframe with the scores of each submission for model1
            (parameter and timecourse) and model2 (topology)
        """
        self.load_submissions()
        self.leaderboard_compute_score_parameters_model1()
        self.leaderboard_compute_score_topology()
        self.leaderboard_compute_score_timecourse_model1(10, 39)

        df = pd.merge(self.scores['param1'], self.scores['pred1'],
                      left_index=True, right_index=True,
                      suffixes=['_parameter', '_timecourse'])
        df = pd.merge(df, self.scores['topo2'], left_index=True,
                      right_index=True)
        df.columns = [c if c != 'scores' else 'scores_topology'
                      for c in df.columns]
        return df

    ###########################################################
    # Topology related

    def _compute_score_topology(self, data, team=''):
        """See :meth:`score_topology` for details"""
        data = data.copy()  # make sure we do not change the data, so use copy()
        gs = self.gs['topo2'].copy()

        Li = np.array([0, 0, 0])
        Ni = np.array([0, 0, 0])

        # loop over the solution
        cols1 = ['regulator', 'sign1', 'g1']
        cols2 = ['regulator', 'sign2', 'g2']

        # build the list of existing genes and their signs from the GS
        # order does not matter, this is for the counting of Ni
        genes = np.hstack([gs.g1.values, gs.g2.values])
        signs = np.hstack([gs.sign1.values, gs.sign2.values])

        # make sure the regulators are unique
        regulators_data = set(data.regulator)

        for i in range(0, 3):
            for j in range(0, 3):
                # if all 5 values are correct, L = 12 and stops there.
                # Note that if a regulated gene is zero, it means it does not
                # exist, so it is ignored.
                if all(gs.values[i][[0, 1, 2]] == data.values[j][[0, 1, 2]]) is True:
                    # g1 should be different from 0
                    # the regulator is tested as well although the GS has no such case
                    if gs.ix[i]['g1'] != 0 and gs.ix[i]['regulator'] != 0:
                        Li[i] += 6
                        data.iloc[j, 0] = 0  # index 0 is the first column, that is the regulator
                        data.iloc[j, 2] = 0  # index 2 is gene1
                if all(gs.values[i][[0, 3, 4]] == data.values[j][[0, 3, 4]]) is True:
                    # g2 should be different from 0
                    # the regulator is tested as well although the GS has no such case
                    if gs.ix[i]['g2'] != 0 and gs.ix[i]['regulator'] != 0:
                        Li[i] += 6
                        data.iloc[j, 0] = 0  # index 0 is the first column, that is the regulator
                        data.iloc[j, 4] = 0  # index 4 is gene2

        # any regulator correctly predicted ?
        for reg in set(data.regulator):
            if reg != 0 and reg in gs.regulator.values:
                Ni[i] += 1

        genes_data = np.hstack([data.g1.values, data.g2.values])
        signs_data = np.hstack([data.sign1.values, data.sign2.values])

        import collections
        dgenes = collections.defaultdict(list)
        for gene, sign in zip(genes, signs):
            dgenes[gene].append(sign)
        dgenes_data = collections.defaultdict(list)
        for gene, sign in zip(genes_data, signs_data):
            dgenes_data[gene].append(sign)

        # Here this is ambiguous: a gene may have 2 opposite signs... To be
        # in agreement with the original code from Pablo that was used in the
        # DREAM challenge, we keep the latest values found in the submissions.
        # This is arbitrary though.
        #for key in dgenes_data.keys():
        #    if dgenes_data.

        # Why do we get rid of gene 0? Genes are encoded as values from 1 to
        # 11 and 0 means no interaction.
        try:
            del dgenes[0]
        except:
            pass
        try:
            del dgenes_data[0]
        except:
            pass

        # any gene and sign found in the GS?
        for gene, signs in dgenes_data.items():
            if gene in dgenes.keys():
                Ni[i] += 1
                if signs[-1] in dgenes[gene]:
                    Ni[i] += 1

        Si = Li + Ni
        return sum(Si)

    def get_random_topology(self):
        # Create a random topology submission
        data = pd.DataFrame()
        sign1 = ["+" if x == 1 else "-" for x in np.random.randint(0, 2, 3)]
        sign2 = ["+" if x == 1 else "-" for x in np.random.randint(0, 2, 3)]
        regulator = np.random.randint(0, 12, 3)
        g1 = np.random.randint(0, 12, 3)
        g2 = np.random.randint(0, 12, 3)
        data['regulator'] = regulator
        data['sign1'] = sign1
        data['g1'] = g1
        data['sign2'] = sign2
        data['g2'] = g2
        return data

    ###########################################################
    # NULL distribution and p-values for the topology challenge

    def get_null_topology(self, N=10000):
        """Return null distribution of the topology score"""
        nulls = [self._compute_score_topology(self.get_random_topology())
                 for i in range(0, N)]
        return nulls

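    # Illustrative sketch (not part of the class API): an empirical p-value
    # for a given topology score could be estimated directly from this null
    # distribution::
    #
    #     nulls = s.get_null_topology(N=10000)
    #     pvalue = sum(1 for n in nulls if n >= my_score) / float(len(nulls))
    #
    # :meth:`get_pvalues_topology` below plays the same role using a
    # histogram precomputed with a much larger N.
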
    def get_pvalues_topology(self, x):
        """Return the p-value of a given score (topology challenge)"""
        # The p-values are hardcoded; they were computed using N=100000
        # values from :meth:`get_null_topology`:
        #     nulls = s.get_null_topology(100000)
        #     Y, X = np.histogram(nulls, bins=range(0, 20), normed=True)
        #     X = X[1:]
        Y = np.array([1.65600000e-02, 7.03700000e-02, 1.54750000e-01,
                      2.15260000e-01, 2.12310000e-01, 1.53090000e-01,
                      8.32200000e-02, 3.86100000e-02, 2.12100000e-02,
                      1.48600000e-02, 9.95000000e-03, 5.78000000e-03,
                      2.32000000e-03, 8.60000000e-04, 3.90000000e-04,
                      3.00000000e-04, 1.00000000e-04, 4.00000000e-05,
                      2.00000000e-05])
        X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                      16, 17, 18, 19])
        return sum(Y[X >= x] * 1)