
# -*- python -*-
# -*- coding: utf-8 -*-
#
#  This file is part of DREAMTools software
#
#  Copyright (c) 2015, DREAMTools Development Team
#  All rights reserved
#
#  Distributed under the BSD 3-Clause License.
#  See accompanying file LICENSE distributed with this software
#
#  File author(s): Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#  website: http://github.com/dreamtools
#
##############################################################################
"""Provides tools related to Receiver Operating Characteristic (ROC).

Those codes were directly translated from Perl or matlab
codes. We should be using scikit-learn in the future."""
import numpy as np


__all__ = ['ROC', 'ROCDiscovery', 'D3D4ROC', 'MCC']


class BinaryClassifier(object):
    """The roc module contains lots of duplicated code that will be cleaned
    up in the future. To do so, we will use this class, which is used
    e.g. in D6C1.

    """
    def __init__(self):
        self.recall = []
        self.precision = []
        self.fpr = []
        self.tpr = []

    def roc(self):
        pass

    def compute_auc(self):
        """Compute AUC given a ROC data set (recomputed otherwise)

        :param str roc: The roc data structure must be a dictionary with "tpr"
            key. Could be a variable returned by :meth:`get_statistics`. If not
            provided, compute the ROC.
        :return: the AUC
        """
        import scipy.integrate
        value = scipy.integrate.trapz(self.tpr, x=self.fpr)
        return value

    def compute_aupr(self):
        """Compute AUPR given a ROC data set (recomputed otherwise)

        :param str roc: The roc data structure must be a dictionary with "tpr"
            key. Could be a variable returned by :meth:`get_statistics`. If not
            provided, compute the ROC.
        :return: the AUPR
        """
        import scipy.integrate
        value = scipy.integrate.trapz(self.precision, x=self.recall)
        return value
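

# A minimal usage sketch (not part of the original API; the helper name below
# is illustrative only): :class:`BinaryClassifier` does not compute its own
# curves yet, so this example fills ``fpr``/``tpr`` and ``recall``/``precision``
# by hand before calling the trapezoidal integrators.
def _example_binary_classifier():
    """Illustrative only: feed precomputed curves and integrate them."""
    bc = BinaryClassifier()
    # a perfect classifier: TPR jumps to 1 at FPR=0
    bc.fpr = [0.0, 0.0, 1.0]
    bc.tpr = [0.0, 1.0, 1.0]
    bc.recall = [0.0, 1.0, 1.0]
    bc.precision = [1.0, 1.0, 0.5]
    return bc.compute_auc(), bc.compute_aupr()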


class ROCBase(object):
    """An ABC class"""

    def __init__(self):
        pass

    def get_statistics(self):
        raise NotImplementedError

    def compute_auc(self, roc=None):
        """Compute AUC given a ROC data set (recomputed otherwise)

        :param str roc: The roc data structure must be a dictionary with "tpr"
            key. Could be a variable returned by :meth:`get_statistics`. If not
            provided, compute the ROC.
        :return: the AUC

        """
        if roc is None:
            roc = self.get_statistics()
        import scipy.integrate
        value = scipy.integrate.trapz(roc['tpr'], roc['fpr'])
        return value
        #return sum(roc['tpr']) / len(roc['tpr'])

    def compute_aupr(self, roc=None):
        """Compute AUPR given a ROC data set (recomputed otherwise)

        :param str roc: The roc data structure must be a dictionary with "tpr"
            key. Could be a variable returned by :meth:`get_statistics`. If not
            provided, compute the ROC.
        :return: the AUPR
        """
        if roc == None:
            roc = self.get_statistics()
        import scipy.integrate
        value = scipy.integrate.trapz(roc['precision'], x=roc['recall'])
        return value
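

# A minimal sketch (made-up values, illustrative helper name): the base-class
# integrators accept a precomputed ROC dictionary directly, which bypasses
# :meth:`get_statistics` (still abstract in ROCBase).
def _example_rocbase_from_dict():
    roc = {'fpr': [0.0, 0.5, 1.0], 'tpr': [0.0, 0.75, 1.0],
           'recall': [0.0, 0.5, 1.0], 'precision': [1.0, 0.8, 0.6]}
    base = ROCBase()
    return base.compute_auc(roc=roc), base.compute_aupr(roc=roc)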


class ROC(ROCBase):
    """A class to compute ROC, AUC and AUPR for a binary problem

    >>> r = ROC()   # scores and classes could also be provided as arguments
    >>> r.scores = [0.9, 0.8, 0.7, .6, .6, .3]
    >>> r.classes = [1, 0, 1, 0, 1, 1]
    >>> r.compute_auc()
    0.4375

    """
    def __init__(self, scores=None, classes=None):
        """.. rubric:: Constructor

        :param list scores: the scores
        :param list classes: binary classes made of 1 or 0 numerical values,
            also called labels in the literature.
        """
        super(ROC, self).__init__()
        self._scores = None
        self._classes = None
        self.scores = scores
        self.classes = classes

    def _set_scores(self, values):
        self._scores = values
    def _get_scores(self):
        return self._scores
    scores = property(_get_scores, _set_scores, doc="Read/Write the scores")

    def _set_classes(self, values):
        self._classes = values
    def _get_classes(self):
        return self._classes
    classes = property(_get_classes, _set_classes, doc="Read/Write the classes")

    def _compute_fpr_tpr(self, FP, TP, N, P):
        # false/true positive rates given the FP/TP counters and the total
        # number of negatives (N) and positives (P)
        roc = {"fpr": FP / float(N), "tpr": TP / float(P), "TP": TP, "FP": FP}
        return roc

    def get_roc(self):
        """See :meth:`get_statistics`"""
        return self.get_statistics()

    def get_statistics(self):
        """Compute the ROC curve X/Y vectors and some other metrics

        :return: a dictionary with different metrics such as FPR
            (false positive rate) and TPR (true positive rate).
        """
        scores = self.scores
        classes = list(self.classes)
        sorted_indices = np.argsort(1 - np.array(scores))
        sorted_scores = [scores[i] for i in sorted_indices]
        sorted_classes = [classes[i] for i in sorted_indices]

        P = float(classes.count(1))
        N = float(classes.count(0))
        FP = 0
        TP = 0

        roc = {"fpr": [], "tpr": [], "precision": [], "FP": [], "TP": [],
               "recall": [], "accuracy": [], "Fmeasure": [], 'threshold': []}

        fprev = 1.1  # scores are expected to be less than 1

        # an efficient algorithm to compute ROC based on T. Fawcett,
        # Pattern Recognition Letters 27, 2006
        i = 0
        while i < len(scores):
            cls = sorted_classes[i]
            score = sorted_scores[i]
            if score != fprev and cls is not None:
                thisroc = self._compute_fpr_tpr(FP, TP, N, P)
                for key in ['fpr', 'tpr', 'FP', 'TP']:
                    roc[key].append(thisroc[key])
                roc['threshold'].append(fprev)
                fprev = score
            if cls == 1:
                TP += 1
            elif cls == 0:
                FP += 1
            elif cls is None or np.isnan(cls):
                pass
            else:
                raise ValueError("found a class value different from 0, 1 or None")
            i += 1

        sroc = self._compute_fpr_tpr(FP, TP, N, P)
        for key in ['fpr', 'tpr', 'FP', 'TP']:
            roc[key].append(sroc[key])

        # force the last value of tpr/fpr to be one
        # TODO: this is a hack to behave correctly when the final fpr/tpr
        # are not (1, 1) as they should be.
        roc['fpr'][-1] = 1
        roc['tpr'][-1] = 1
        roc['threshold'].append(sorted_scores[-1])

        roc = self._compute_other_metrics(roc)
        return roc

    def _compute_other_metrics(self, roc):
        # Be aware that there is always a first value TP=0, FP=0.
        # This should be handled with care in the recall/precision
        # computation: this first value is used to compute fpr and tpr
        # but should not be used here.
        FP = np.array([float(x) for x in roc['FP']])
        TP = np.array([float(x) for x in roc['TP']])
        N = max(FP) + max(TP)

        # The first value of precision is undefined since FP+TP=0.
        # It does not matter since it will be overwritten as explained below.
        # TODO: to prevent a warning, set FP[0] to 100.
        FP[0] = 100
        precision = TP / (FP + TP)
        recall = TP / float(max(TP))

        # The second value may also be an issue. There are 2 cases:
        # 1. TP > 0
        # 2. TP == 0
        # In the second case, TP=0 with FP=0 should not happen; FP > 0
        # instead, in which case recall=0 and precision=0. No ambiguity.
        # In the first case, if TP>0 then precision>0 and recall>0, so we
        # end up without a defined value at recall=0. We therefore set, at
        # recall=0, the same precision as the first value where recall>0.
        if TP[1] > 0:
            precision[0] = precision[1]
        else:
            precision[0] = 0

        # TODO
        #roc['accuracy'] = (TP+FP)/(P+N)
        #if roc['recall'] > 0 and roc['precision'] != np.nan and roc['precision'] > 0:
        #    roc['Fmeasure'] = 2./(1./roc['precision']+1./roc['recall'])
        #else:
        #    roc['Fmeasure'] = np.nan

        roc['precision'] = [float(x) for x in precision]
        roc['recall'] = [float(x) for x in recall]
        return roc

    def plot_roc(self, roc=None):
        """Plot ROC curves

        .. plot::
            :include-source:
            :width: 80%

            from dreamtools.core.rocs import ROC
            r = ROC()
            r.scores = [.9, .5, .6, .7, .1, .2, .6, .4, .7, .9, .2]
            r.classes = [1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1]
            r.plot_roc()

        """
        if roc is None:
            roc = self.get_statistics()
        from pylab import plot, xlim, ylim, grid, title, xlabel, ylabel
        x = roc['fpr']
        plot(x, roc['tpr'], '-o')
        plot([0, 1], [0, 1], 'r')
        ylim([0, 1])
        xlim([0, 1])
        grid(True)
        title("ROC curve (AUC=%s)" % self.compute_auc(roc))
        xlabel("FPR")
        ylabel("TPR")
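

# A minimal sketch with made-up scores (the same data as in the plot_roc
# example above; the helper name is illustrative only): get_statistics returns
# the full dictionary (fpr, tpr, precision, recall, thresholds, ...), which can
# then be passed to compute_auc/compute_aupr to avoid recomputing the curve.
def _example_roc_usage():
    r = ROC(scores=[.9, .5, .6, .7, .1, .2, .6, .4, .7, .9, .2],
            classes=[1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1])
    roc = r.get_statistics()
    return r.compute_auc(roc=roc), r.compute_aupr(roc=roc)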


class ROCDiscovery(ROCBase):
    """A variant of ROC statistics

    Used in the D5C2 challenge.

    .. note:: Does not work if any NA is found.

    """
    def __init__(self, discovery):
        """.. rubric:: constructor

        :param discovery: a list of 0/1 where 1 means positive
        """
        super(ROCDiscovery, self).__init__()
        try:
            self.discovery = discovery.values   # a pandas Series ?
        except AttributeError:
            self.discovery = np.array(discovery)
        # sanity checks
        assert set(np.unique(self.discovery)) == set([0, 1]), \
            'ERROR: discovery is only allowed to have (0, 1) entries.'

    def get_statistics(self):
        """Return dictionary with FPR, TPR and other metrics"""
        discovery = self.discovery     # an alias
        T = len(discovery)             # total
        P = np.sum(discovery)          # positives
        N = T - P                      # negatives

        # true positives (false positives) at depth k
        TPk = np.cumsum(discovery)
        FPk = np.cumsum(1 - discovery)

        # metrics
        TPR = TPk / float(P)
        FPR = FPk / float(N)
        REC = TPR                          # recall
        PREC = TPk / np.arange(1., T + 1)  # precision at depth k (TPk / k)
        roc = {"fpr": FPR, "tpr": TPR, "precision": PREC, "FP": [], "TP": [],
               "recall": REC, "accuracy": [], "Fmeasure": [], 'threshold': []}
        return roc

    def compute_aupr(self, roc=None):
        """Return AUPR normalised by (1 - 1/P), P being the number of positives"""
        AUPR = super(ROCDiscovery, self).compute_aupr(roc=roc)
        P = np.sum(self.discovery)
        return AUPR / (1 - 1. / P)
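

# A minimal sketch with a made-up discovery vector (illustrative helper name):
# each entry is 1 if the prediction at that rank is a true positive, and
# get_statistics returns the cumulative TPR/FPR/precision at each depth.
def _example_roc_discovery():
    r = ROCDiscovery([1, 1, 0, 1, 0, 0])
    roc = r.get_statistics()
    return r.compute_auc(roc=roc), r.compute_aupr(roc=roc)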


class D3D4ROC(ROCBase):
    def __init__(self):
        super(D3D4ROC, self).__init__()

    def plot(self):
        metrics = self.metrics
        import pylab
        fpr = metrics['fpr']
        tpr = metrics['tpr']
        rec = metrics['rec']
        prec = metrics['prec']

        pylab.figure(1)
        pylab.subplot(1, 2, 1)
        pylab.plot(fpr, tpr)
        pylab.title('ROC')
        pylab.xlabel('FPR')
        pylab.ylabel('TPR')

        pylab.subplot(1, 2, 2)
        pylab.plot(rec, prec)
        pylab.title('P-R')
        pylab.xlabel('Recall')
        pylab.ylabel('Precision')

    def get_statistics(self, gold_data, test_data, gold_index):
        T = len(gold_data)            # total potential edges n(n-1)
        try:
            P = gold_data[2].sum()    # positives
        except Exception:
            P = gold_data[1].sum()    # positives
        N = T - P                     # negatives
        L = len(test_data)            # length of the prediction list

        # counters
        k = 0
        Ak = 0
        TPk = 0
        FPk = 0

        rec = []
        prec = []
        tpr = []
        fpr = []

        while k < L:
            k = k + 1
            # index of the kth predicted edge
            #idx = test.index(k)
            #if gold[idx] == 1:
            if gold_index[k-1] == 1:
                # the edge IS present in the gold standard: increment TPk
                TPk = TPk + 1.
                # update the area under the precision-recall curve
                if k == 1:
                    delta = 1. / P
                else:
                    delta = (1. - FPk * np.log(k/(k-1.))) / P
                Ak = Ak + delta
            else:
                # the edge is NOT present in the gold standard: increment FPk
                # but do NOT update the area under the P-R curve
                FPk = FPk + 1.

            rec.append(TPk/float(P))
            prec.append(TPk/float(k))
            tpr.append(rec[k-1])
            fpr.append(FPk/float(N))

        # Done with the positive predictions.
        # If the number of predictions (L) is less than the total possible
        # edges (T), we assume that the remaining ones would achieve the
        # accuracy of the null model (random guessing).
        TPL = TPk

        # rho
        if L < T:
            rh = (P-TPL)/float(T-L)
        else:
            rh = 0.

        # recall at L
        if L > 0:
            recL = rec[L-1]   # note -1 to use python indexing
        else:
            recL = 0

        # the remaining positives would eventually be predicted
        while TPk < P:
            k = k + 1
            TPk = TPk + 1.
            rec.append(TPk/float(P))
            if ((rec[k-1]-recL) * P + L * rh) != 0:
                prec.append(rh * P * rec[k-1]/((rec[k-1]-recL)*P + L * rh))
            else:
                prec.append(0)
            tpr.append(rec[k-1])
            FPk = TPk * (1-prec[k-1])/prec[k-1]
            fpr.append(FPk/float(N))

        # Now, update the area under the P-R curve
        # rh = (P-TPk)/(T-L)  # BP: removed; it is a logic error to redefine rh here.
        AL = Ak
        #if ~isnan(rh) and rh != 0 and L != 0:
        if rh != 0 and L != 0:
            AUC = AL + rh * (1.-recL) + rh * (recL - L * rh / P) * \
                np.log((L * rh + P * (1-recL)) / (L * rh))
        elif L == 0:
            AUC = P/float(T)
        else:
            AUC = Ak

        # Integrate the area under the ROC curve
        lc = fpr[0] * tpr[0] / 2.
        for n in range(1, int(L+P-TPL-1 + 1)):
            lc = lc + (fpr[n] + fpr[n-1]) * (tpr[n]-tpr[n-1]) / 2.
        AUROC = 1. - lc

        return AUC, AUROC, prec, rec, tpr, fpr
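

# A minimal sketch with synthetic data, not taken from any real challenge
# (the helper name and the toy edge labels are illustrative): the gold
# standard is a DataFrame whose third column holds the 0/1 labels of all
# potential edges, while ``gold_index`` flags, for each entry of the ranked
# prediction list, whether that edge is present in the gold standard.
def _example_d3d4roc():
    import pandas as pd
    gold_data = pd.DataFrame({0: ['A'] * 6, 1: ['B'] * 6,
                              2: [1, 0, 1, 0, 1, 0]})  # 6 potential edges, 3 positives
    test_data = [0, 1, 2, 3]       # four ranked predictions (only the length is used)
    gold_index = [1, 0, 1, 0]      # hit/miss flag for each prediction
    roc = D3D4ROC()
    AUC, AUROC, prec, rec, tpr, fpr = roc.get_statistics(gold_data, test_data, gold_index)
    return AUC, AUROC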


class DREAM2(object):
    def __init__(self):
        pass

    def compute_specific_precision_values(self, P, rec):
        spec_prec = {}
        for x in [1, 2, 5, 20, 100, 500, 1000]:
            if x > P:
                break
            rec0 = x / float(P)
            i = rec.index(rec0)
            spec_prec[x] = rec[i]
        return spec_prec


def MCC(TP, TN, FP, FN):
    """Matthews correlation coefficient"""
    return (TP*TN - FP*FN) / np.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
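

# A quick sanity check with made-up counts (illustrative helper name): a
# confusion matrix with no false calls gives an MCC of 1, and the score
# decreases towards 0 as the off-diagonal counts grow.
def _example_mcc():
    perfect = MCC(TP=50, TN=50, FP=0, FN=0)   # -> 1.0
    noisy = MCC(TP=40, TN=40, FP=10, FN=10)   # -> 0.6
    return perfect, noisy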