# -*- python -*-
# -*- coding: utf-8 -*-
#
# This file is part of DREAMTools software
#
# Copyright (c) 2015, DREAMTools Development Team
# All rights reserved
#
# Distributed under the BSD 3-Clause License.
# See accompanying file LICENSE distributed with this software
#
# File author(s): Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
# website: http://github.com/dreamtools
#
##############################################################################
"""Provides tools related to Receiver Operating Characteristic (ROC).
Those codes were directly translated from Perl or matlab
codes. We should be using scikit-learn in the future."""
import numpy as np
__all__ = ['ROC', 'ROCDiscovery', 'D3D4ROC', 'MCC']
class BinaryClassifier(object):
    """Simple container for binary-classification curves.

    The roc module contains lots of duplicated code that will be cleaned
    up in the future. To do so, we will use this class, which is used
    e.g., in D6C1.
    """
    def __init__(self):
        # Parallel lists to be filled by the user (or a subclass)
        # before calling the compute_* methods.
        self.recall = []      # recall values (x-axis of the P-R curve)
        self.precision = []   # precision values (y-axis of the P-R curve)
        self.fpr = []         # false positive rates (x-axis of the ROC curve)
        self.tpr = []         # true positive rates (y-axis of the ROC curve)

    def roc(self):
        # Placeholder: computing the curves is not implemented yet.
        pass

    def compute_auc(self):
        """Compute the area under the ROC curve.

        Integrates :attr:`tpr` against :attr:`fpr` with the trapezoidal
        rule; both attributes must be filled beforehand.

        :return: the AUC
        """
        import scipy.integrate
        # trapz was renamed trapezoid in modern SciPy; support both.
        trapz = getattr(scipy.integrate, "trapezoid", None)
        if trapz is None:
            trapz = scipy.integrate.trapz
        return trapz(self.tpr, x=self.fpr)

    def compute_aupr(self):
        """Compute the area under the precision-recall curve.

        Integrates :attr:`precision` against :attr:`recall` with the
        trapezoidal rule; both attributes must be filled beforehand.

        :return: the AUPR
        """
        import scipy.integrate
        # trapz was renamed trapezoid in modern SciPy; support both.
        trapz = getattr(scipy.integrate, "trapezoid", None)
        if trapz is None:
            trapz = scipy.integrate.trapz
        return trapz(self.precision, x=self.recall)
class ROCBase(object):
    """An ABC class gathering the AUC/AUPR computations.

    Subclasses must implement :meth:`get_statistics` and return a
    dictionary holding at least the keys ``tpr``, ``fpr``, ``precision``
    and ``recall``.
    """
    def __init__(self):
        pass

    def get_statistics(self):
        """Return the ROC statistics dictionary (subclass responsibility)."""
        raise NotImplementedError

    @staticmethod
    def _trapezoid(y, x):
        # scipy renamed trapz to trapezoid; support both generations.
        import scipy.integrate
        func = getattr(scipy.integrate, "trapezoid", None)
        if func is None:
            func = scipy.integrate.trapz
        return func(y, x=x)

    def compute_auc(self, roc=None):
        """Compute AUC given a ROC data set (recomputed otherwise)

        :param dict roc: dictionary with ``tpr`` and ``fpr`` keys, e.g. a
            value returned by :meth:`get_statistics`. If not provided,
            the statistics are recomputed.
        :return: the AUC
        """
        if roc is None:
            roc = self.get_statistics()
        return self._trapezoid(roc['tpr'], roc['fpr'])

    def compute_aupr(self, roc=None):
        """Compute AUPR given a ROC data set (recomputed otherwise)

        :param dict roc: dictionary with ``precision`` and ``recall``
            keys, e.g. a value returned by :meth:`get_statistics`. If not
            provided, the statistics are recomputed.
        :return: the AUPR
        """
        if roc is None:
            roc = self.get_statistics()
        return self._trapezoid(roc['precision'], roc['recall'])
class ROC(ROCBase):
    """A class to compute ROC, AUC and AUPRs for a binary problem

    >>> r = ROC() # could provide scores and labels as arguments
    >>> r.scores = [0.9,0.8,0.7,.6,.6,.3]
    >>> r.classes = [1,0,1,0,1,1]
    >>> r.compute_auc()
    0.4375

    """
    def __init__(self, scores=None, classes=None):
        """.. rubric:: Constructor

        :param list scores: the scores
        :param list classes: binary class made of 1 or 0 numerical values. Also
            called labels in the literature.
        """
        super(ROC, self).__init__()
        self._scores = None
        self._classes = None
        self.scores = scores
        self.classes = classes

    def _set_scores(self, values):
        self._scores = values
    def _get_scores(self):
        return self._scores
    scores = property(_get_scores, _set_scores, doc="Read/Write the scores")

    def _set_classes(self, values):
        self._classes = values
    def _get_classes(self):
        return self._classes
    classes = property(_get_classes, _set_classes, doc="Read/Write the classes")

    def _compute_fpr_tpr(self, FP, TP, N, P):
        # One ROC point given the current false/true positive counts (FP,
        # TP) and the total number of negatives (N) and positives (P).
        return {"fpr": FP / float(N), "tpr": TP / float(P),
                "TP": TP, "FP": FP}

    def get_roc(self):
        """See :meth:`get_statistics`"""
        return self.get_statistics()

    def get_statistics(self):
        """Compute the ROC curve X/Y vectors and some other metrics

        :return: a dictionary with different metrics such as FPR (false
            positive rate), TPR (true positive rate), precision, recall
            and thresholds.
        """
        scores = self.scores
        classes = list(self.classes)

        # Sort items by decreasing score (argsort of 1-score).
        sorted_indices = np.argsort(1 - np.array(scores))
        sorted_scores = [scores[i] for i in sorted_indices]
        sorted_classes = [classes[i] for i in sorted_indices]

        P = float(classes.count(1))  # number of positives
        N = float(classes.count(0))  # number of negatives

        FP = 0
        TP = 0
        roc = {"fpr": [], "tpr": [], "precision": [], "FP": [], "TP": [],
               "recall": [], "accuracy": [], "Fmeasure": [], 'threshold': []}

        fprev = 1.1  # sentinel above any score (scores are less than 1)

        # Efficient algorithm to compute ROC based on T. Fawcett, Pattern
        # Recognition Letters 27, 2006: emit one curve point each time the
        # score changes, so tied scores share a single point.
        for cls, score in zip(sorted_classes, sorted_scores):
            if score != fprev and cls is not None:
                thisroc = self._compute_fpr_tpr(FP, TP, N, P)
                for key in ['fpr', 'tpr', "FP", "TP"]:
                    roc[key].append(thisroc[key])
                roc['threshold'].append(fprev)
                fprev = score
            if cls == 1:
                TP += 1
            elif cls == 0:
                FP += 1
            elif cls is None or np.isnan(cls):
                pass  # missing labels are ignored
            else:
                raise ValueError("found a class value different from 0,1 or None")

        # Final curve point once all items have been consumed.
        # BUG FIX: the original computed this point (sroc) but appended the
        # previous point (thisroc) again, so FP/TP never reached the final
        # totals and the recall list never reached 1.
        sroc = self._compute_fpr_tpr(FP, TP, N, P)
        for key in ['fpr', 'tpr', "FP", "TP"]:
            roc[key].append(sroc[key])

        # force the last value of tpr/fpr to be one
        # TODO this is a hack to behave correctly when the final fpr/tpr
        # are not (1,1) as they should be.
        roc['fpr'][-1] = 1
        roc['tpr'][-1] = 1
        roc['threshold'].append(sorted_scores[-1])

        roc = self._compute_other_metrics(roc)
        return roc

    def _compute_other_metrics(self, roc):
        # Be aware that there is always a first point with TP=0, FP=0.
        # It is needed for fpr/tpr but its precision is undefined
        # (FP+TP=0), so it is patched separately below.
        FP = np.array([float(x) for x in roc['FP']])
        TP = np.array([float(x) for x in roc['TP']])

        # The first value of precision is undefined since FP+TP=0. It does
        # not matter since it is overwritten below; setting FP[0] to a
        # dummy non-zero value merely silences the division warning.
        FP[0] = 100
        precision = TP / (FP + TP)
        recall = TP / float(max(TP))

        # The second point disambiguates the first:
        # - if TP[1] > 0, precision and recall are both > 0 there, so that
        #   precision is reused for the otherwise-undefined recall==0 point;
        # - otherwise FP > 0 while TP == 0, so precision is 0 unambiguously.
        if TP[1] > 0:
            precision[0] = precision[1]
        else:
            precision[0] = 0

        # TODO: accuracy and Fmeasure are declared in the dictionary but
        # not computed yet.
        roc['precision'] = [float(x) for x in precision]
        roc['recall'] = [float(x) for x in recall]
        return roc

    def plot_roc(self, roc=None):
        """Plot ROC curves

        .. plot::
            :include-source:
            :width: 80%

            from dreamtools.core.rocs import ROC
            r = ROC()
            r.scores = [.9,.5,.6,.7,.1,.2,.6,.4,.7,.9, .2]
            r.classes = [1,0,1,0,0,1,1,0,0,1,1]
            r.plot_roc()
        """
        if roc is None:
            roc = self.get_statistics()
        from pylab import plot, xlim, ylim, grid, title, xlabel, ylabel
        x = roc['fpr']
        plot(x, roc['tpr'], '-o')
        plot([0, 1], [0, 1], 'r')  # diagonal = random classifier
        ylim([0, 1])
        xlim([0, 1])
        grid(True)
        title("ROC curve (AUC=%s)" % self.compute_auc(roc))
        xlabel("FPR")
        ylabel("TPR")
class ROCDiscovery(ROCBase):
    """A variant of ROC statistics

    Used in D5C2 challenge.

    .. note:: Does not work if any NA are found.
    """
    def __init__(self, discovery):
        """.. rubric:: constructor

        :param discovery: a list of 0/1 where 1 means positives
        """
        super(ROCDiscovery, self).__init__()
        try:
            # a pandas Series exposes .values
            self.discovery = discovery.values
        except AttributeError:
            self.discovery = np.array(discovery)
        # sanity checks
        assert set(np.unique(self.discovery)) == set([0, 1]), \
            'ERROR: discovery is only allowed to have (0,1) entries.'

    def get_statistics(self):
        """Return dictionary with FPR, TPR and other metrics"""
        discovery = self.discovery  # an alias
        T = len(discovery)          # Total
        P = np.sum(discovery)       # positives
        N = T - P                   # negatives

        # true positives (false positives) at depth k
        TPk = np.cumsum(discovery)
        FPk = np.cumsum(1 - discovery)

        # metrics
        TPR = TPk / float(P)
        FPR = FPk / float(N)
        REC = TPR  # recall
        # Precision at depth k is TP_k / k for k = 1..T.
        # BUG FIX: the original divided by np.linspace(1, T+1, T), whose
        # step is T/(T-1), so every denominator past the first was wrong.
        PREC = TPk / np.arange(1., T + 1.)

        roc = {"fpr": FPR, "tpr": TPR, "precision": PREC, "FP": [], "TP": [],
               "recall": REC, "accuracy": [], "Fmeasure": [], 'threshold': []}
        return roc

    def compute_aupr(self, roc=None):
        """Returns AUPR normalised by (1-1/P), P being the number of positives"""
        AUPR = super(ROCDiscovery, self).compute_aupr(roc=roc)
        P = np.sum(self.discovery)
        return AUPR / (1 - 1. / P)
class D3D4ROC(ROCBase):
    """AUROC/AUPR scoring used in the D3 and D4 network-inference challenges.

    :meth:`get_statistics` follows the original DREAM matlab
    implementation, which completes a partial prediction list with a
    null-model (random-guessing) assumption.
    """
    def __init__(self):
        super(D3D4ROC, self).__init__()

    def plot(self):
        """Plot the ROC and precision-recall curves side by side.

        NOTE(review): reads ``self.metrics`` (keys ``fpr``, ``tpr``,
        ``rec``, ``prec``), which is not set anywhere in this class --
        presumably assigned by the caller; confirm against callers.
        """
        metrics = self.metrics
        import pylab
        fpr = metrics['fpr']
        tpr = metrics['tpr']
        rec = metrics['rec']
        prec = metrics['prec']

        pylab.figure(1)
        pylab.subplot(1, 2, 1)
        pylab.plot(fpr, tpr)
        pylab.title('ROC')
        pylab.xlabel('FPR')
        pylab.ylabel('TPR')

        pylab.subplot(1, 2, 2)
        pylab.plot(rec, prec)
        pylab.title('P-R')
        pylab.xlabel('Recall')
        pylab.ylabel('Precision')

    def get_statistics(self, gold_data, test_data, gold_index):
        """Compute AUPR/AUROC and the underlying curves.

        :param gold_data: the gold standard; its last column (index 2,
            falling back to index 1) must sum to the number of positives.
        :param test_data: the (possibly partial) list of predictions;
            only its length is used.
        :param gold_index: sequence of 0/1 flags, one per prediction,
            where 1 means the predicted edge is in the gold standard.
        :return: tuple (AUC, AUROC, prec, rec, tpr, fpr) where AUC is the
            area under the P-R curve and AUROC the area under the ROC.
        """
        T = len(gold_data)  # Total potential edges n(n-1)
        try:
            P = gold_data[2].sum()  # positives (3-column gold standard)
        except Exception:
            P = gold_data[1].sum()  # positives (2-column fallback)
        N = T - P           # negatives
        L = len(test_data)  # length of prediction list

        # counters
        k = 0
        Ak = 0    # running area under the precision-recall curve
        TPk = 0
        FPk = 0
        rec = []
        prec = []
        tpr = []
        fpr = []
        while k < L:
            k = k + 1
            # is the kth predicted edge present in the gold standard?
            if gold_index[k-1] == 1:
                # the edge IS present: increment TPk and update the area
                # under the precision-recall curve
                TPk = TPk + 1.
                if k == 1:
                    delta = 1. / P
                else:
                    delta = (1. - FPk * np.log(k/(k-1.))) / P
                Ak = Ak + delta
            else:
                # the edge is NOT present in the gold standard: increment
                # FPk; do NOT update the area under P-R
                FPk = FPk + 1.
            # remember the current curve point
            rec.append(TPk/float(P))
            prec.append(TPk/float(k))
            tpr.append(rec[k-1])
            fpr.append(FPk/float(N))

        # Done with the positive predictions.
        # If the number of predictions (L) is less than the total possible
        # edges (T), we assume that they would achieve the accuracy of the
        # null model (random guessing).
        TPL = TPk
        # rho: expected density of positives among the unpredicted edges
        if L < T:
            rh = (P-TPL)/float(T-L)
        else:
            rh = 0.

        # recall at L
        if L > 0:
            recL = rec[L-1]  # note -1 to use python syntax
        else:
            recL = 0

        # the remaining positives would eventually be predicted
        while TPk < P:
            k = k + 1
            TPk = TPk + 1.
            rec.append(TPk/float(P))
            if ((rec[k-1]-recL) * P + L * rh) != 0:
                prec.append(rh * P * rec[k-1]/((rec[k-1]-recL)*P + L * rh))
            else:
                prec.append(0)
            tpr.append(rec[k-1])
            FPk = TPk * (1-prec[k-1])/prec[k-1]
            fpr.append(FPk/float(N))

        # Now, update the area under the P-R curve
        # (rh is deliberately NOT redefined here; doing so was an error in
        # logic in an earlier version of the matlab code)
        AL = Ak
        if rh != 0 and L != 0:
            AUC = AL + rh * (1.-recL) + rh * (recL - L * rh / P) * \
                np.log((L * rh + P * (1-recL))/(L * rh))
        elif L == 0:
            AUC = P/float(T)
        else:
            AUC = Ak

        # Integrate area under ROC: trapezoidal rule on fpr over tpr gives
        # the area to the LEFT of the curve, hence AUROC = 1 - that area.
        lc = fpr[0] * tpr[0] / 2.
        for n in range(1, int(L+P-TPL-1 + 1)):
            lc = lc + (fpr[n] + fpr[n-1]) * (tpr[n]-tpr[n-1]) / 2.
        AUROC = 1. - lc

        return AUC, AUROC, prec, rec, tpr, fpr
class DREAM2(object):
    """Helper used for the DREAM2 challenge scoring."""
    def __init__(self):
        pass

    def compute_specific_precision_values(self, P, rec, prec=None):
        """Return values at specific numbers of true positives.

        For each x in (1, 2, 5, 20, 100, 500, 1000) with x <= P, the first
        index i where ``rec[i] == x / P`` is located.

        :param P: total number of positives
        :param list rec: recall values
        :param list prec: optional precision values parallel to *rec*.
            When given, the returned dictionary maps x to ``prec[i]``, the
            precision at that recall -- the "specific precision" the
            method name promises. When omitted, the historical behaviour
            is kept and ``rec[i]`` is returned; note that this equals
            ``x / P`` itself, which looks like a bug in the original
            implementation (a precision list was probably intended).
        :return: dictionary mapping each x to the selected value
        :raises ValueError: if ``x / P`` is not an exact element of *rec*
        """
        spec_prec = {}
        for x in [1, 2, 5, 20, 100, 500, 1000]:
            if x > P:
                break  # larger thresholds cannot be reached
            rec0 = x / float(P)
            i = rec.index(rec0)
            spec_prec[x] = rec[i] if prec is None else prec[i]
        return spec_prec
def MCC(TP, TN, FP, FN):
    """Return the Matthews correlation coefficient.

    :param TP: number of true positives
    :param TN: number of true negatives
    :param FP: number of false positives
    :param FN: number of false negatives
    :return: MCC in [-1, 1]. When the denominator vanishes (any row or
        column of the confusion matrix is empty) the usual convention is
        followed and 0 is returned instead of producing nan or raising a
        ZeroDivisionError.
    """
    denominator = np.sqrt((TP+FP) * (TP+FN) * (TN+FP) * (TN+FN))
    if denominator == 0:
        return 0.0
    return (TP*TN - FP*FN) / denominator