Source code for dreamtools.dream7.D7C4.scoring

"""


Original code for sub-challenge B translated from code by Mukesh Bansal.
Sub-challenge A is currently a wrapper around Perl code provided by Jim Costello.

"""
import os
import math
import random

from dreamtools.core.challenge import Challenge
from easydev import shellcmd
import pandas as pd

import numpy as np


[docs]class D7C4(Challenge):
    """A class dedicated to D7C4 challenge

    ::

        from dreamtools import D7C4
        s = D7C4()
        filename = s.download_template()
        s.score(filename)

    Data and templates are downloaded from Synapse. You must have a login.

    ::

        # columns represent the probabilistic c-index of the given team for
          each drug.
        # following the columns of teams are 5 columns which are used for
          calculating the overall team score
        # |-> Test_data = the probabilistic c-index for the experimentally
          determined test data scored against itself
        # |-> Mean Null Distribution = a set of 10,000 random predictions
          were scored to create the null distribution, of which this column
          represents the mean
        # |-> SD Null Distribution = a set of 10,000 random predictions
          were scored to create the null distribution, of which this column
          represents the standard deviation
        # |-> z-score of test data to null = score of the test data minus
          the mean of the null distribution divided by the standard deviation
          of the null distribution
        # |-> weight of drug (normalized z-score) = the z-score normalized
          by the largest z-score across all 31 drugs.
        # to calculate your team's overall score, simply multiply the score
          of each drug by the corresponding weight.  Divide the sum of these
          weighted scores by the sum of the weights

    """
    def __init__(self, verbose=True, download=True, **kargs):
        """.. rubric:: constructor

        :param verbose: passed through unchanged to the :class:`Challenge`
            constructor.
        :param download: passed through unchanged to the :class:`Challenge`
            constructor.
        :param kargs: extra keyword arguments, also forwarded to
            :class:`Challenge`.

        Sub-challenge A relies on a Perl script that requires specific
        packages.

        First, you need the cpanm tool
        (http://search.cpan.org/dist/App-cpanminus/). Under Fedora 23::

            sudo dnf install perl-App-cpanminus

        Then, install the dependencies that will be required::

            sudo cpanm install Math::Libm
            sudo cpanm install Algorithm::Pair::Best2
            sudo cpanm install Digest::SHA1
            sudo cpanm install Tk
            sudo cpanm install Games::Go::AGATourn

        Finally, install the Games-Go-GoPair tarball stored in the dreamtools
        github repository in dreamtools/dream7/D7C4/misc::

            cd dreamtools/dream7/D7C4/misc
            tar xvfz Games-Go-GoPair-1.001.tar.gz
            cd Games-Go-GoPair-1.001
            perl Makefile.PL
            make
            sudo make install

        """
        challenge_id = 'D7C4'
        super(D7C4, self).__init__(challenge_id, verbose, download, **kargs)
        # Names accepted by every method that takes a ``subname`` argument.
        self.sub_challenges = ['A', 'B']

    def _check_subname(self, subname):
        """Validate *subname* against :attr:`sub_challenges`.

        The check itself is delegated to
        :func:`easydev.check_param_in_list`; whatever that helper does for an
        unknown value (presumably raising an error -- see the easydev
        documentation) is what callers observe.
        """
        import easydev
        easydev.check_param_in_list(subname, self.sub_challenges)

[docs]    def download_template(self, subname):
        self._check_subname(subname)
        if subname == 'A':
            filename = self.getpath_template('D7C4_template.csv')
        elif subname == 'B':
            filename = self.getpath_template('D7C4_template_B.csv')
        return filename

[docs]    def score(self, filename, subname):
        self._check_subname(subname)
        if subname == 'A':
            return self.score_A(filename)
        elif subname == 'B':
            return self.score_B(filename)

[docs]    def download_goldstandard(self, subname):
        self._check_subname(subname)
        if subname == 'A':
            filename = self._pj([self.classpath, 'templates',
                    'D7C4_template.csv'])
        elif subname == 'B':
            filename = self.getpath_gs('D7C4_B_synergy_IC20.tsv')
        return filename

[docs]    def score_A(self, filename):
        from easydev import TempFile
        fh = TempFile()
        script = self._pj([self.classpath,
            'weighted_average_concordance_index.pl'])
        datadir = self._pj([self.classpath, 'data'])
        cmd = "perl %s %s %s %s"
        cmd = cmd % (script, filename, datadir , fh.name)

        shellcmd(cmd, verbose=True, ignore_errors=True)
        try:
            df = pd.read_csv(fh.name, sep='\t', header=None)
        except:
            print("Something wrong in the Scoring while executing \n  %s. " % cmd)
            print("\n The D7C4 challenge requires a Perl package to be installed")
            print("See D7C4 documentation e.g., on dreamtools.readthedocs.org")
            import sys
            sys.exit(1)
        df.columns = ['DrugID', 'probabilistic c-index',
        'weighted probabilistic c-index', 'zscores']
        df = df.set_index('DrugID')
        fh.delete()

        ws = (df.sum() / df.sum().ix['zscores'])
        ws = ws.ix['weighted probabilistic c-index']

        results = df.mean()
        results['weight average probabilistic c-index'] = ws

        del results['zscores']

        # Finally compute pvalues based on precomputed scores
        precomp = pd.read_csv(self._pj([self.classpath, 'data',
            'DREAM7_DrugSensitivity1_drug_zscores.txt']), sep='\t',
            skiprows=6,  header=None)

        overall_mean = precomp.ix[31][1]
        overall_var = precomp.ix[31][2]

        pval = 1 -  (.5 * (math.erf((ws - overall_mean)/(math.sqrt(2*overall_var))) + 1))

        results['weight average probabilistic c-index p-value'] = pval

        return {'Results': results}

[docs]    def score_B(self, filename):
        gs_filename = self.download_goldstandard('B')
        gold = pd.read_csv(gs_filename, sep='\t')
        gold.columns = [x.strip() for x in gold.columns]

        unique_drugs = list(set(gold['Cmpd A']))

        # build a new columns
        pairs = gold['Cmpd A'] + "_" + gold['Cmpd B']
        gold['pairs'] = pairs

        # sort by excess of over Bliss
        try:
            gold = gold.sort_values(by=['Excess over Bliss'])
        except:
            gold = gold.sort(columns=['Excess over Bliss'])


        self.p_matrix = self._probability_matrix(gold['Excess over Bliss'], gold['SEM'])

        # noew read the predicition. Note that here it has to be comma separated
        # There are 91 rows + 1 . THe lsat gives the
        # The 93rd row (i.e., the last row after the header line and the 91 pairs) should report the
        # first compound pair in the ranked list whose predicted activity is not deemed to have a
        # significant synergistic effect, i.e. excess over Bliss close to 0 (see Scoring Metrics section
        # for the definition of excess over Bliss).
        prediction = pd.read_csv(filename, sep=',')
        # we rename first column to agree with gold for a later merge
        prediction.columns = ['pairs', 'Rank']
        #skip last row
        prediction = prediction.ix[0:90]

        #In the submission, drugs already within a single column where drug A and B are seprated by a & sign.
        # let us replace the & by a _ like in the gold standard
        newnames = prediction['pairs'].apply(lambda x : "_".join([y.strip() for y in x.split("&")]))
        newnames = list(newnames)
        prediction['pairs'] = newnames

        # not that the fold is sorted so it should be first
        self.merged = pd.merge(gold, prediction, how='inner', on=['pairs'])

        N = 91
        ranks = self.merged['Rank'].values.astype(float)
        weighted_cindex = self._concordance(ranks, range(0,N), self.p_matrix)

        Nmax = 10000
        cindex_nulldist =np.zeros(Nmax)
        for i in range(0, Nmax):
            # cast to list for py3
            randx = list(range(N))
            random.shuffle(randx) # in place
            cindex_nulldist[i] = self._concordance(randx, range(0,N), self.p_matrix)

        pvalues = sum(cindex_nulldist>= weighted_cindex)/float(Nmax)

        results = pd.Series()
        results['weighted cindex'] = weighted_cindex
        results['pvalue'] = pvalues

        return {'Results': results}

    def _probability_matrix(self, x, x_std):
        from scipy.special import erf
        N = len(x)
        X = np.repeat(np.array(x),N).reshape(N,N)
        X = X - X.transpose()
        X_std = np.repeat(np.array(x_std),N).reshape(N,N)
        X_std = np.sqrt(X_std**2+X_std.transpose()**2)
        p_matrix = 0.5*(1 + erf(X/X_std))
        return p_matrix

    def _concordance(self, x, y, p_matrix):
        N = len(x)
        X = np.repeat(np.array(x),N).reshape(N, N)
        Y = np.repeat(np.array(y),N).reshape(N, N)

        C = np.sign(X - X.transpose()) == np.sign(Y - Y.transpose())
        C = C * (1 - p_matrix.transpose()) + (1-C) * p_matrix.transpose()

        C = sum(sum(np.tril(C, -1))) / float(N) / (N - 1.) * 2
        return C