# Source code for dreamtools.dream3.D3C3.scoring

"""D3C3 scoring function

Original matlab version (Gustavo A. Stolovitzky, Ph.D. Robert Prill) translated
into Python by Thomas Cokelaer.
"""
from dreamtools.core.challenge import Challenge
import pandas as pd
import numpy as np



class D3C3(Challenge):
    """A class dedicated to D3C3 challenge

    ::

        from dreamtools import D3C3
        s = D3C3()
        filename = s.download_template()
        s.score(filename)

    Data and templates are downloaded from Synapse. You must have a login.

    .. note:: the Spearman p-values are computed using R and are slightly
        different from the official code that used matlab. The reason
        being that the 2 implementations are different. Please see cor.test
        in R and corr() function in matlab for details.
        The scipy ``spearmanr`` function has a very different implementation
        for small size cases.

    """
    # Number of leading rows / columns of the data matrices (i.e. the file
    # content once its first two columns are dropped) that are scored.
    _N_ROWS = 50
    _N_COLS = 8

    # R command executed for every row and every column. It expects the two
    # vectors to be available in the R session under the names ``t`` and ``g``
    # and stores the outcome of the test in the R variable ``results``.
    _R_SPEARMAN_TEST = ("results = cor.test(t, g, method='spearman', "
                        "alternative='greater', exact=F)")

    def __init__(self, verbose=True, download=True, **kargs):
        """.. rubric:: constructor

        :param verbose: forwarded as is to the :class:`Challenge` constructor
        :param download: forwarded as is to the :class:`Challenge` constructor
        :param kargs: extra keyword arguments, also forwarded to
            :class:`Challenge`
        """
        super(D3C3, self).__init__('D3C3', verbose, download, **kargs)

    def download_goldstandard(self):
        """Return the location of the gold standard file.

        The location is assembled with ``self._pj`` from ``self.classpath``,
        the ``goldstandard`` directory and the file name.
        """
        return self._pj([self.classpath, 'goldstandard',
            'D3C3_goldstandard.txt'])

    @staticmethod
    def _spearman_greater(rtool, prediction, goldstandard):
        """Run one one-sided Spearman test in R.

        :param rtool: object exposing an R session as ``rtool.session``
        :param prediction: vector pushed to R under the name ``t``
        :param goldstandard: vector pushed to R under the name ``g``
        :return: tuple ``(rho, pvalue)`` read from the ``estimate`` and
            ``p.value`` entries of the R ``cor.test`` result
        """
        # Copies are pushed (as in the original implementation) rather than
        # the row/column slices themselves.
        rtool.session.t = prediction.copy()
        rtool.session.g = goldstandard.copy()
        rtool.session.run(D3C3._R_SPEARMAN_TEST)
        # Fetch the R object once and read both fields from it.
        results = rtool.session.results
        return results['estimate'], results['p.value']

    def score(self, filename):
        """Score a submission against the gold standard.

        The gold standard and the submission are read with
        :meth:`_read_challenge` and their first two columns are dropped.
        A Spearman test (R ``cor.test``, ``alternative='greater'``) is then
        run between prediction and gold standard for each of the first 50
        rows and each of the first 8 columns.

        The p-values of the rows (resp. columns) are summarised as
        ``exp(nansum(log(p)) / N)`` with N = 50 (resp. 8); NaN p-values are
        ignored by the sum but N stays fixed. The final score is the mean of
        ``-log10`` of these two summaries.

        Side effects: sets ``self.G``, ``self.T``, ``self.rho_row``,
        ``self.pval_row``, ``self.rho_col`` and ``self.pval_col``, and prints
        a warning about the R/matlab p-value discrepancy.

        :param filename: submission file (tab-separated, same layout as the
            template)
        :return: dictionary with a single key ``'score'``
        """
        self.G = self._read_challenge(self.download_goldstandard())
        self.T = self._read_challenge(filename)

        G = self.G[self.G.columns[2:]].values
        T = self.T[self.T.columns[2:]].values

        # Why R and not scipy.stats.spearmanr: according to the original
        # author, scipy only returns a 2-tailed p-value and, more importantly,
        # uses a rough approximation whose results can be far from matlab's.
        # R results also differ from matlab but are much closer (1-2%).

        # Function-level import kept from the original code (presumably so
        # that importing this module does not require the R bridge).
        from dreamtools.core.rtools import RTools
        rtool = RTools(verboseR=False)

        rho_row = []
        pval_row = []
        for i in range(self._N_ROWS):
            rho, pval = self._spearman_greater(rtool, T[i, :], G[i, :])
            rho_row.append(rho)
            pval_row.append(pval)

        rho_col = []
        pval_col = []
        for i in range(self._N_COLS):
            rho, pval = self._spearman_greater(rtool, T[:, i], G[:, i])
            rho_col.append(rho)
            pval_col.append(pval)

        print("""
WARNING: the spearman correlation pvalue are computed using R. Pvalues are
slightly different from those computed using matlab and therefore the final
values may differ by a few percents to the pvlues reported in the original
challenge. \n""")

        self.rho_col = rho_col
        self.pval_col = pval_col
        self.rho_row = rho_row
        self.pval_row = pval_row

        # Geometric-mean style summaries of the p-values (fixed denominators).
        score1 = np.exp(np.nansum(np.log(pval_row)) / float(self._N_ROWS))
        score2 = np.exp(np.nansum(np.log(pval_col)) / float(self._N_COLS))

        score = sum(-np.log10([score1, score2]))/2.
        return {'score': score}

    def download_template(self):
        """Return the location of the submission template file.

        The location is assembled with ``self._pj`` from ``self.classpath``,
        the ``templates`` directory and the file name.
        """
        return self._pj([self.classpath, 'templates', 'D3C3_template.txt'])

    def _read_challenge(self, filename):
        """Read a tab-separated file and return it as a pandas DataFrame.

        :param filename: path (or any input accepted by
            :func:`pandas.read_csv`) of the file to read
        """
        return pd.read_csv(filename, sep='\t')