# Source code for dreamtools.dream6.D6C3.scoring

"""D6C3 scoring function

Based on Pablo Meyer's Matlab code.
"""
import os
from dreamtools.core.challenge import Challenge
import pandas as pd
import numpy as np
import glob

class D6C3(Challenge):
    """A class dedicated to D6C3 challenge

    ::

        from dreamtools import D6C3
        s = D6C3()
        filename = s.download_template()
        s.score(filename)

    The absolute score is the Pearson coefficient; the other scores
    (chi-square and rank-based R-square) are normalised using the
    predictions of the 21 participants.

    Pearson and Spearman give the same values as in the final LB but X2 and
    R2 are slightly different. The results are the same as in the original
    Matlab scripts, so the difference with the LB is probably coming from a
    different set of prediction files, which is stored in ./data/predictions
    and was found in http://genome.cshlp.org/content/23/11/1928/suppl/DC1

    The final score in the official leaderboard computed the p-values for
    each score (chi-square, r-square, spearman and pearson correlation
    coefficient) and took -0.25 log ( product of p-values) as the final score.

    """
    # Number of rows of each prediction / gold standard file that are scored.
    _N_ROWS = 53
    # Number of participant prediction files expected in data/predictions.
    _N_PARTICIPANTS = 21

    def __init__(self, verbose=True, download=True, **kargs):
        """.. rubric:: constructor

        :param verbose: forwarded to :class:`Challenge`
        :param download: forwarded to :class:`Challenge`
        :param kargs: extra keyword arguments forwarded to :class:`Challenge`

        """
        super(D6C3, self).__init__('D6C3', verbose, download, **kargs)
        self.sub_challenges = []

    def score(self, filename):
        """Score a prediction file against the gold standard

        Both files are read as tab-separated values without header; their
        first column becomes the row index and the next column holds the
        values. Rows are addressed by position (``.iloc``), which replaces
        the ``.ix`` indexer removed from pandas.
        NOTE(review): this matches ``.ix`` only if the row labels are
        non-integer or 0..n-1 -- confirm against the data files.

        As side effects, the attributes :attr:`alldata`,
        :attr:`goldstandard` and :attr:`prediction` are (re)populated.

        :param str filename: path to the prediction file
        :return: a dictionary with a single key ``'results'`` whose value is
            a :class:`pandas.Series` with the entries ``chi2``,
            ``R-square``, ``Spearman(Sp)`` and ``Pearson(Cp)``.
        """
        # scipy is only needed for the Spearman coefficient; keep the import
        # local, as in the original implementation.
        import scipy.stats

        self.read_all_participants()

        gs_filename = self.download_goldstandard()
        self.goldstandard = pd.read_csv(
            gs_filename, sep='\t', header=None).set_index(0)
        self.prediction = pd.read_csv(
            filename, sep='\t', header=None).set_index(0)

        # some aliases
        pred = self.prediction
        gs = self.goldstandard

        pearson = self._pearson(pred, gs)
        # scipy.stats.pearsonr(pred, gs) could replace _pearson and would
        # also provide the p-values.
        spearman = scipy.stats.spearmanr(pred, gs)[0]
        # Same results as Kelly's score but different from LB on synapse
        chi2 = self._chi_square(pred, gs)
        r2 = self._rank_score(pred, gs)

        # Build the Series in one go (growing an empty Series is deprecated)
        # while keeping the original keys and their order.
        results = pd.Series(
            [chi2, r2, spearman, pearson],
            index=['chi2', 'R-square', 'Spearman(Sp)', 'Pearson(Cp)'])
        return {'results': results}

    def _pearson(self, pred, gs):
        """Return the Pearson coefficient between *pred* and *gs*

        Sums are normalised by the fixed number of rows (53), as in the
        original code.
        """
        n = float(self._N_ROWS)
        sum_pred = pred.sum()
        sum_gs = gs.sum()
        ss_pred = ((pred - sum_pred / n) ** 2).sum()   # variance participant
        ss_gs = ((gs - sum_gs / n) ** 2).sum()         # variance GS
        cov = (pred * gs).sum() - sum_pred * sum_gs / n
        return (cov / np.sqrt(ss_pred * ss_gs)).values[0]

    def _chi_square(self, pred, gs):
        """Return the chi-square like score of *pred*

        For each row, the squared error of the prediction is divided by the
        mean squared error of all participants for that row; the ratios are
        then summed over the rows.
        """
        n = self._N_ROWS
        pred_col = pred.iloc[:n, 0].values
        gs_col = gs.iloc[:n, 0].values
        # Row-wise mean over the participants; DataFrame.mean skips NaN, as
        # np.mean applied to a pandas Series did in the original loop.
        all_sq_err = self.alldata.iloc[:n].sub(gs_col, axis=0) ** 2
        denom = all_sq_err.mean(axis=1).values
        return ((pred_col - gs_col) ** 2 / denom).sum()

    def _rank_score(self, pred, gs):
        """Return the rank-based score of *pred* (reported as ``R-square``)

        Same construction as the chi-square score but computed on ranks;
        tied values get their average rank (pandas default).
        """
        n = self._N_ROWS
        pt = self._N_PARTICIPANTS
        gs_rank = gs.rank().iloc[:n, 0].values
        # The prediction ranks do not depend on the row: compute them once.
        pred_rank = pred.rank().iloc[:n, 0].values
        all_rank = self.alldata.rank().iloc[:n, :pt].values
        # Plain numpy sum so that NaN propagates, as in the original loop.
        mean_sq = ((all_rank - gs_rank[:, None]) ** 2).sum(axis=1) / float(pt)
        return ((pred_rank - gs_rank) ** 2 / mean_sq).sum()

    def read_all_participants(self):
        """Load the participants' predictions into :attr:`alldata`

        Every file matching ``*txt`` in ``data/predictions`` (relative to
        :attr:`classpath`) is read as a headerless tab-separated file and its
        column ``1`` becomes one column of :attr:`alldata`; the columns are
        labelled 0 to 20.

        :raises AssertionError: if the number of files found is not 21
        """
        path = self._pj([self.classpath, 'data', 'predictions'])
        # glob returns files in arbitrary order; sort them so that the column
        # assignment is reproducible across platforms.
        filenames = sorted(glob.glob(path + os.sep + '*txt'))
        assert len(filenames) == self._N_PARTICIPANTS, \
            "expected %s prediction files in %s but found %s" % (
                self._N_PARTICIPANTS, path, len(filenames))
        data = [pd.read_csv(this, sep='\t', header=None)[1]
                for this in filenames]
        self.alldata = pd.DataFrame(data).T
        self.alldata.columns = range(0, self._N_PARTICIPANTS)

    def download_template(self):
        """Return the path of the template file ``D6C3_template.txt``"""
        return self.getpath_template('D6C3_template.txt')

    def download_goldstandard(self):
        """Return the path of the gold standard ``D6C3_goldstandard.txt``"""
        return self.getpath_gs('D6C3_goldstandard.txt')