Source code for dreamtools.dream5.D5C3.scoring

"""D5C3 scoring function

Original MATLAB code from Gustavo A. Stolovitzky and Robert Prill, Ph.D.
Sub-challenge B original code in R from A. de la Fuente.

"""
import os
import random
from dreamtools.core.challenge import Challenge
import pandas as pd
import numpy as np
from dreamtools.core.rocs import D3D4ROC


__all__ = ["D5C3"]


class D5C3(Challenge, D3D4ROC):
    """A class dedicated to the D5C3 challenge

    ::

        from dreamtools import D5C3
        s = D5C3()
        filename = s.download_template('A100')
        s.score(filename, 'A100')

    Data and templates are downloaded from Synapse. You must have a login.

    There are 3 sub-challenges (A100, A300, A999) as well as 3 simpler ones
    (B1, B2, B3). For the A series, 5 networks are required; for B, 3 are
    needed.

    """
    def __init__(self, verbose=True, download=True, **kargs):
        """.. rubric:: constructor

        """
        super(D5C3, self).__init__('D5C3', verbose, download, **kargs)
        self._init()
        self.sub_challenges = ['A100', 'A300', 'A999', 'B']
        self.N_pvalues = 100

    def _init(self):
        # should download files from synapse if required.
        # The templates
        if self._standalone is True:
            return

        prefix = 'DREAM5_SysGen'
        self._download_data(prefix + 'A100_myteam_Network1.txt', 'syn4561478')
        self._download_data(prefix + 'A100_myteam_Network2.txt', 'syn4561482')
        self._download_data(prefix + 'A100_myteam_Network3.txt', 'syn4561486')
        self._download_data(prefix + 'A100_myteam_Network4.txt', 'syn4561489')
        self._download_data(prefix + 'A100_myteam_Network5.txt', 'syn4561493')

        # Get goldstandard and unpack zipped files
        self._download_data('D5C3_goldstandard.zip', 'syn4561554')
        self.unzip('D5C3_goldstandard.zip')

        # the probabilities for sub-challenge A
        self._download_data('D5C3_probabilities.zip', 'syn4562041')
        self.unzip('D5C3_probabilities.zip')

    def _check_filename(self, filename):
        end = filename[-5:]
        if end[1] != '.':
            raise ValueError("File must end with a suffix of 3 letters e.g. .csv")
        end = end[0]
        if end not in ['1', '2', '3', '4', '5']:
            raise ValueError("File must end with a valid batch value in 1,2,3,4,5")
        return end

    def _check_subname(self, subname):
        if subname not in self.sub_challenges:
            raise ValueError("sub-challenge must be in %s" % self.sub_challenges)

    def _check_sub_challenge_name(self, name):
        assert name in self.sub_challenges
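    # Illustration (comment only, not executed): how ``_check_filename`` reads
    # the network index from the last five characters of a prediction filename.
    # The filename below is a hypothetical example following the A-series
    # naming scheme used in ``_init``:
    #
    #   "DREAM5_SysGenA100_myteam_Network3.txt"[-5:]  ->  "3.txt"
    #   end[1] == '.' and end[0] == '3'               ->  returns "3"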
    def download_template(self, subname):
        # Returns 3 template filenames for sub-challenge B (B1, B2, B3) and
        # 5 filenames (Network1..Network5) for the A series.
        # There is no dedicated A300 or A999 template; the A100 templates are
        # returned instead.
        if subname in ['A300', 'A999']:
            print("Warning:: there is no A300 or A999 template per se. " +
                  "Use the goldstandard instead")
            subname = 'A100'

        self._check_subname(subname)
        if subname == 'B':
            filenames = [self._pj([self.classpath, 'templates',
                "DREAM5_SysGenB1_your_Predictions.txt"])]
            filenames.append(self._pj([self.classpath, 'templates',
                "DREAM5_SysGenB2_your_Predictions.txt"]))
            filenames.append(self._pj([self.classpath, 'templates',
                "DREAM5_SysGenB3_your_Predictions.txt"]))
        else:
            filenames = []
            for i in range(1, 6):
                gs_filename = self.get_pathname(
                    'DREAM5_SysGen%s_myteam_Network%s.txt' % (subname, i))
                filenames.append(gs_filename)
        return filenames
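    # Sketch of the expected return values (paths are illustrative; the actual
    # locations depend on the package ``templates`` directory and the Synapse
    # cache):
    #
    #   s.download_template('B')     -> 3 paths, one per SysGenB1/B2/B3 template
    #   s.download_template('A100')  -> 5 paths, one per Network1..Network5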
    def download_goldstandard(self, subname):
        # Returns 3 gold-standard filenames for sub-challenge B (B1, B2, B3)
        # and 5 filenames (Network1..Network5) for the A series.
        self._check_subname(subname)
        if subname == 'B':
            filenames = [self._pj([self.classpath, 'goldstandard',
                "DREAM5_SysGenB1_TestPhenotypeData.txt"])]
            filenames.append(self._pj([self.classpath, 'goldstandard',
                "DREAM5_SysGenB2_TestPhenotypeData.txt"]))
            filenames.append(self._pj([self.classpath, 'goldstandard',
                "DREAM5_SysGenB3_TestPhenotypeData.txt"]))
            return filenames
        else:
            filenames = []
            for i in range(1, 6):
                gs_filename = self.get_pathname(
                    'DREAM5_SysGen%s_Edges_Network%s.tsv' % (subname, i))
                filenames.append(gs_filename)
            return filenames
    def _load_network(self, filename):
        df = pd.read_csv(filename, header=None, sep='[ \t]', engine='python')
        df[0] = df[0].apply(lambda x: x.replace('g', '').replace('G', ''))
        df[1] = df[1].apply(lambda x: x.replace('g', '').replace('G', ''))
        df = df.astype(float)  # important for later to check for equality
        return df
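    # Example of the normalisation above (values are illustrative): the gene
    # labels in the first two columns, e.g. ("G12", "g45"), become the floats
    # (12.0, 45.0), so gold-standard and predicted edges can be matched
    # numerically later on.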
    def score(self, filename, subname):
        self._check_subname(subname)
        if subname in ['A100', 'A300', 'A999']:
            return self._score_challengeA_bunch(filename, subname)
        elif subname == 'B':
            if not isinstance(filename, list):
                raise TypeError('Challenge B expects 3 input files B1, B2, B3')
            if len(filename) != 3:
                raise TypeError('Challenge B expects 3 input files B1, B2, B3')
            return self.score_challengeB(filename)
        else:
            raise ValueError('Sub challenge must be either A or B')
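    # Usage sketch (the filenames below are hypothetical placeholders; the
    # files must exist locally and follow the template formats):
    #
    #   s = D5C3()
    #   s.score(list_of_5_network_files, 'A100')        # A series: 5 files
    #   s.score(['B1.txt', 'B2.txt', 'B3.txt'], 'B')    # B: exactly 3 files, ordered B1..B3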
    def _score_challengeA_bunch(self, filenames, subname):
        from easydev import Progress
        pb = Progress(5, 1)
        pb.animate(0)
        results = []
        for i, filename in enumerate(filenames):
            res = self.score_challengeA(filename, subname + "_" + str(i + 1))
            pb.animate(i + 1)
            results.append(res)

        aupr_score = -np.mean(np.log10([x['p_aupr'] for x in results]))
        auroc_score = -np.mean(np.log10([x['p_auroc'] for x in results]))
        score = (aupr_score + auroc_score) / 2.

        df = pd.Series()
        df['Overall Score'] = score
        df['AUPR score (pval)'] = aupr_score
        df['AUROC score (pval)'] = auroc_score
        for i in range(1, 6):
            df['AUPR Net %s' % i] = results[i - 1]['aupr']
        for i in range(1, 6):
            df['AUROC Net %s' % i] = results[i - 1]['auroc']

        return df
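    # Aggregation performed above, written out: each meta-score is the average
    # of -log10(p) over the five networks, and the overall score is their mean:
    #
    #   aupr_score  = -mean_i log10(p_aupr_i)     for networks i = 1..5
    #   auroc_score = -mean_i log10(p_auroc_i)
    #   overall     = (aupr_score + auroc_score) / 2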
    def score_challengeA(self, filename, subname):
        name1, name2 = subname.rsplit("_", 1)
        goldfile = self.download_goldstandard(name1)[int(name2) - 1]

        # gold standard edges only
        predictionfile = filename

        # precomputed probability densities for various metrics
        pdffile_aupr = self.get_pathname(name1 + os.sep + 'Network%s_AUPR.mat' % (name2))
        pdffile_auroc = self.get_pathname(name1 + os.sep + 'Network%s_AUROC.mat' % (name2))

        # load probability densities
        pdf_aupr = self.loadmat(pdffile_aupr)
        pdf_auroc = self.loadmat(pdffile_auroc)

        self.pdf_auroc = self.loadmat(pdffile_auroc)
        self.pdf_aupr = self.loadmat(pdffile_aupr)

        # load gold standard
        self.gold_edges = self._load_network(goldfile)

        # load predictions
        self.prediction = self._load_network(predictionfile)

        # DISCOVERY
        # In principle we could reuse the ROCDiscovery class but here the
        # p-values are also computed, so let us do it here for now.
        merged = pd.merge(self.gold_edges, self.prediction, how='inner', on=[0, 1])
        self.merged = merged

        TPF = len(merged)

        # unique species should be 1000
        N = len(set(self.gold_edges[0]).union(self.gold_edges[1]))

        # positive
        Pos = len(self.gold_edges)

        # negative
        Neg = N * N - N - Pos

        # total
        Ntot = Pos + Neg

        L = len(self.prediction)

        discovery = np.zeros(L)
        values_gs = [tuple(x) for x in merged[[0, 1]].values]
        values_pred = [tuple(x) for x in self.prediction[[0, 1]].values]
        count = 0
        for i in range(0, L):
            if values_pred[i] in values_gs:
                discovery[count] = 1
            # else nothing to do (the vector is already filled with zeros)
            count += 1

        TPL = sum(discovery)
        self.discovery = discovery

        if L < Ntot:
            p = (Pos - TPL) / float(Ntot - L)
        else:
            p = 0

        random_positive_discovery = [p] * (Ntot - L)
        random_negative_discovery = [1 - p] * (Ntot - L)

        # append discovery + random using lists
        positive_discovery = np.array(list(discovery) + random_positive_discovery)
        negative_discovery = np.array(list(1 - discovery) + random_negative_discovery)

        # true positives (false positives) at depth k
        TPk = np.cumsum(positive_discovery)
        FPk = np.cumsum(negative_discovery)

        # metrics
        TPR = TPk / float(Pos)
        FPR = FPk / float(Neg)
        REC = TPR  # same thing
        PREC = TPk / np.arange(1, Ntot + 1)

        from dreamtools.core.rocs import ROCBase
        roc = ROCBase()
        auroc = roc.compute_auc(roc={'tpr': TPR, 'fpr': FPR})
        aupr = roc.compute_aupr(roc={'precision': PREC, 'recall': REC})

        # normalise by the maximum possible value
        aupr /= (1. - 1. / Pos)

        p_aupr = self._probability(pdf_aupr['X'][0], pdf_aupr['Y'][0], aupr)
        p_auroc = self._probability(pdf_auroc['X'][0], pdf_auroc['Y'][0], auroc)

        results = {'auroc': auroc, 'aupr': aupr, 'p_auroc': p_auroc, 'p_aupr': p_aupr}
        return results
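    # Rationale for the padding step above: when only the first L gene pairs
    # are ranked, the remaining Ntot - L pairs are treated as if they were in
    # random order, so each is assigned the expected discovery probability
    #
    #   p = (Pos - TPL) / (Ntot - L)
    #
    # i.e. the fraction of gold-standard edges not yet recovered among the
    # unranked pairs; 1 - p is used for the negative counterpart.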
    def score_challengeB(self, filenames):
        # Three filenames (B1, B2, B3) must be provided, in that order.
        cor_pheno1 = []
        cor_pheno2 = []
        pval_pheno1 = []
        pval_pheno2 = []
        scores = []

        from dreamtools.core.rtools import RTools
        rtool = RTools(verboseR=False)

        assert len(filenames) == 3, "Must provide 3 files"

        self.golds = []
        self.preds = []
        gold_filenames = self.download_goldstandard('B')
        print("Warning: your 3 submissions should be ordered as B1, B2, B3 files")

        for tag in [1, 2, 3]:
            # assuming data and gs are sorted in the same way !!
            gold = pd.read_csv(gold_filenames[tag - 1], sep='[ \t]', engine='python')
            self.golds.append(gold)

            #filename = 'DREAM5_SysGenB%s_your_Predictions.txt' % tag
            #filename = self._pj([self.classpath, 'data', filename])
            filename = filenames[tag - 1]
            pred1 = pd.read_csv(filename, sep='[ \t]', engine='python')
            self.preds.append(pred1)

            # correlation gs versus predicted
            rtool.session.t = pred1.ix[0].values
            rtool.session.g = gold.ix[0].values
            rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')")
            T1 = rtool.session.results.copy()

            rtool.session.t = pred1.ix[1].values
            rtool.session.g = gold.ix[1].values
            rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')")
            T2 = rtool.session.results.copy()

            cor_pheno1.append(T1['estimate'])
            cor_pheno2.append(T2['estimate'])
            pval_pheno1.append(T1['p.value'])
            pval_pheno2.append(T2['p.value'])

            score = -(np.log(T1['p.value']) + np.log(T2['p.value']))
            scores.append(score)

        self.corp1 = cor_pheno1
        self.corp2 = cor_pheno2
        self.pval1 = pval_pheno1
        self.pval2 = pval_pheno2
        self.scores = scores

        # This part now computes the p-values using random predictions
        random_scores = {0: [], 1: [], 2: []}
        from easydev import Progress
        pb = Progress(self.N_pvalues, interval=1)
        for ii in range(self.N_pvalues):
            for tag in [0, 1, 2]:
                # generate random coordinates
                coord = random.sample(['RIL%s' % i for i in range(1, 31)], 30)
                coord2 = random.sample(['RIL%s' % i for i in range(1, 31)], 30)

                # Obtaining random scores
                rtool.session.t = self.preds[tag].ix[0].ix[coord].values
                rtool.session.g = self.golds[tag].ix[0].values
                rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')")
                T1 = rtool.session.results.copy()

                rtool.session.t = self.preds[tag].ix[1].ix[coord2].values
                rtool.session.g = self.golds[tag].ix[1].values
                rtool.session.run("results = cor.test(t, g, method='spearman', alternative='greater')")
                T2 = rtool.session.results.copy()

                random_scores[tag].append(-(np.log(T1['p.value']) + np.log(T2['p.value'])))
            pb.animate(ii + 1)
        self.random_scores = random_scores

        # Obtaining p-values
        pvals = [sum(np.array(self.random_scores[k]) >= self.scores[k]) / float(self.N_pvalues)
                 for k in [0, 1, 2]]
        self.pvals = pvals

        df = pd.DataFrame({
            'scores': self.scores,
            'correlation_phenotype1': cor_pheno1,
            'correlation_phenotype2': cor_pheno2,
            'pvalues_phenotype1': pval_pheno1,
            'pvalues_phenotype2': pval_pheno2,
            'pvalues': self.pvals})
        df = df.T
        df.columns = ['SysGenB1', 'SysGenB2', 'SysGenB3']
        return df
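    # Null model used above, in short: for each of the three sub-challenges the
    # submitted phenotype predictions are re-scored against randomly permuted
    # RIL orderings (N_pvalues draws), and the empirical p-value is
    #
    #   pval_k = #{ random_score >= score_k } / N_pvalues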
    def _probability(self, X, Y, x):
        dx = X[1] - X[0]
        return sum(Y[X >= x]) * dx
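    # ``_probability`` approximates the right-tail probability of the
    # precomputed null density (X, Y), assuming a uniform grid of spacing
    # dx = X[1] - X[0]:
    #
    #   P(metric >= x)  ~  sum over X_i >= x of Y_i * dx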