Source code for dreamtools.dream5.D5C4.scoring

"""D5C4 scoring function

Based on the original MATLAB code by Gustavo A. Stolovitzky and Robert Prill.
"""

from dreamtools.core.challenge import Challenge
import pandas as pd
from dreamtools.core.rocs import D3D4ROC
import numpy as np

class D5C4(Challenge, D3D4ROC):
    """A class dedicated to D5C4 challenge

    ::

        from dreamtools import D5C4
        s = D5C4()
        filenames = s.download_template()
        s.score(filenames)

    Data and templates are downloaded from Synapse. You must have a login.

    """

    # Identifiers of the scored networks, in the order expected by score().
    _NETWORK_TAGS = (1, 3, 4)

    def __init__(self, verbose=True, download=True, **kargs):
        """.. rubric:: constructor

        :param verbose: forwarded to the parent constructor
        :param download: forwarded to the parent constructor
        :param kargs: extra keyword arguments forwarded to the parent
            constructor
        """
        super(D5C4, self).__init__('D5C4', verbose, download, **kargs)
        self._init()
        self.sub_challenges = []

    def _init(self):
        """Download and unzip the challenge archives from Synapse.

        Nothing is done when the instance runs in standalone mode.
        """
        if self._standalone is True:
            return

        # In order: gold standard, probabilities, templates.
        for synapse_id in ('syn4564722', 'syn4564719', 'syn4564726'):
            self._download_data('', synapse_id)
            self.unzip('')

    def _load_network(self, filename):
        """Read an edge list and return it as an all-float DataFrame.

        The file has no header and is split on single spaces or tabs. The
        first two columns hold gene identifiers; any ``g``/``G`` character
        is stripped from them before the numeric conversion.

        :param filename: path to the edge-list file
        :return: DataFrame with integer column labels and float values
        """
        df = pd.read_csv(filename, header=None, sep='[ \t]', engine='python')
        for column in (0, 1):
            # astype(str) keeps this working when the ids are already numeric
            df[column] = df[column].astype(str).apply(
                lambda x: x.replace('g', '').replace('G', ''))
        # a single dtype matters later, when gold standard and prediction
        # are merged on the two identifier columns
        return df.astype(float)

    def download_template(self):
        """Return the full paths of the template files (networks 1, 3, 4)."""
        return [
            self.get_pathname(
                "DREAM5_NetworkInference_myteam_Network%s.txt" % tag)
            for tag in self._NETWORK_TAGS
        ]

    def download_goldstandard(self):
        """Return the full paths of the gold-standard files (networks 1, 3, 4)."""
        return [
            self.get_pathname(
                "DREAM5_NetworkInference_Edges_Network%s.tsv" % tag)
            for tag in self._NETWORK_TAGS
        ]

    def score(self, filenames):
        """Score the three predicted networks.

        :param filenames: three prediction files ordered as network 1, 3
            and 4
        :return: pandas Series with the overall score, the two aggregated
            p-value scores (``-mean(log10(p))``) and, for each network, the
            AUPR, AUROC and their p-values
        """
        assert len(filenames) == 3
        print("Ther 3 files must be ordered as network 1, 3 and 4")

        results = [self.score_challengeA(filename, tag)
                   for filename, tag in zip(filenames, self._NETWORK_TAGS)]

        aupr = -np.mean(np.log10([res['p_aupr'] for res in results]))
        auroc = -np.mean(np.log10([res['p_auroc'] for res in results]))
        overall = np.mean([aupr, auroc])
        print(aupr, auroc, overall)

        df = pd.Series(dtype=float)
        df['Overall Score'] = overall
        df['AUPR (pvalue)'] = aupr
        # NOTE: the unbalanced parenthesis is kept on purpose, callers may
        # already rely on this exact key.
        df['AUROC pvalue)'] = auroc

        # Each label is filled from the matching result key. Previously the
        # 'p-auroc' entries were filled with the AUPR p-values.
        for label, key in (('AUPR', 'aupr'), ('AUROC', 'auroc'),
                           ('p-aupr', 'p_aupr'), ('p-auroc', 'p_auroc')):
            for tag, res in zip(self._NETWORK_TAGS, results):
                df['Net%s %s' % (tag, label)] = res[key]
        return df

    # FIXME: copied from D5C3; use classes to factorise the shared code
    def score_challengeA(self, filename, tag):
        """Score one predicted network against its gold standard.

        :param filename: prediction file (edge list) for the network
        :param tag: network identifier, one of 1, 3 or 4
        :return: dictionary with keys ``auroc``, ``aupr``, ``p_auroc`` and
            ``p_aupr``

        As side effects the loaded data are stored in ``pdf_aupr``,
        ``pdf_auroc``, ``gold_edges``, ``prediction``, ``merged`` and
        ``newpred``.
        """
        assert tag in [1, 3, 4]
        goldfile = self.download_goldstandard()[self._NETWORK_TAGS.index(tag)]

        # precomputed probability densities of the two metrics
        pdf_aupr = self.loadmat(self.get_pathname('Network%s_AUPR.mat' % tag))
        pdf_auroc = self.loadmat(
            self.get_pathname('Network%s_AUROC.mat' % tag))
        self.pdf_aupr = pdf_aupr
        self.pdf_auroc = pdf_auroc

        self.gold_edges = self._load_network(goldfile)
        self.prediction = self._load_network(filename)

        # gold-standard edges that are present in the prediction
        self.merged = pd.merge(self.gold_edges, self.prediction,
                               how='inner', on=[0, 1])

        print('Scanning gold standard')
        G = self._get_G(self.gold_edges)
        Pos = int(np.count_nonzero(G > 0))
        Neg = int(np.count_nonzero(G < 0))
        Ntot = Pos + Neg

        # keep only the predicted edges that the gold standard evaluates
        self.newpred = self._remove_edges_not_in_gs(self.prediction, G)
        L = len(self.newpred)

        # 1 where the predicted edge is a true positive, else 0
        # (identifiers are 1-based, G is 0-based)
        idx = self.newpred[[0, 1]].values.astype(int) - 1
        discovery = (G[idx[:, 0], idx[:, 1]] > 0).astype(int)
        TPL = int(discovery.sum())

        # Edges missing from the prediction are assumed to be discovered
        # at random: each one counts for a fraction p of a true positive.
        if L < Ntot:
            p = (Pos - TPL) / float(Ntot - L)
        else:
            p = 0
        n_missing = max(Ntot - L, 0)
        positive_discovery = np.concatenate(
            [discovery, np.full(n_missing, p, dtype=float)])
        negative_discovery = np.concatenate(
            [1 - discovery, np.full(n_missing, 1 - p, dtype=float)])

        # true positives (false positives) at depth k
        TPk = np.cumsum(positive_discovery)
        FPk = np.cumsum(negative_discovery)

        # metrics
        TPR = TPk / float(Pos)
        FPR = FPk / float(Neg)
        REC = TPR  # same thing
        PREC = TPk / np.arange(1, Ntot + 1, dtype=float)

        from dreamtools.core.rocs import ROCBase
        roc = ROCBase()
        auroc = roc.compute_auc(roc={'tpr': TPR, 'fpr': FPR})
        aupr = roc.compute_aupr(roc={'precision': PREC, 'recall': REC})

        # normalise by max possible value
        aupr /= (1. - 1. / Pos)

        p_aupr = self._probability(pdf_aupr['X'][0], pdf_aupr['Y'][0], aupr)
        p_auroc = self._probability(pdf_auroc['X'][0], pdf_auroc['Y'][0],
                                    auroc)

        return {'auroc': auroc, 'aupr': aupr,
                'p_auroc': p_auroc, 'p_aupr': p_aupr}

    def _probability(self, X, Y, x):
        """Return the mass of the density ``Y`` located at ``X >= x``.

        :param X: array of abscissae; the step is taken as ``X[1] - X[0]``
        :param Y: array of density values, same length as ``X``
        :param x: observed value of the metric
        :return: sum of ``Y`` over ``X >= x`` multiplied by the step
        """
        dx = X[1] - X[0]
        return np.sum(Y[X >= x]) * dx

    def _remove_edges_not_in_gs(self, prediction, G):
        """Drop the predicted edges that the gold standard does not evaluate.

        An edge is kept when both identifiers fall inside ``G`` and the
        corresponding entry of ``G`` is non-zero.

        :param prediction: DataFrame whose columns 0 and 1 hold 1-based
            gene identifiers
        :param G: 2D array as returned by :meth:`_get_G`
        :return: copy of the retained rows of ``prediction``
        """
        N, M = G.shape
        pairs = prediction[[0, 1]].values.astype(int)
        rows, cols = pairs[:, 0], pairs[:, 1]

        # identifiers are 1-based; anything below 1 would wrap around
        inside = (rows >= 1) & (rows <= N) & (cols >= 1) & (cols <= M)
        keep = np.zeros(len(pairs), dtype=bool)
        keep[inside] = G[rows[inside] - 1, cols[inside] - 1] != 0

        # positional selection (DataFrame.ix no longer exists in pandas)
        return prediction.iloc[np.flatnonzero(keep)].copy()

    def _get_G(self, gold):
        """Build the dense gold-standard matrix from the edge list.

        The matrix has one row per identifier up to the largest regulator
        (column 0 of ``gold``) and one column per identifier up to the
        largest value of column 1. Indices are the identifiers minus one.
        Entries are:

        * ``1`` for an edge listed in ``gold``
        * ``-1`` for any other (regulator, target) pair with distinct
          genes, where targets are all genes seen in either column
        * ``0`` elsewhere

        :param gold: DataFrame whose columns 0 and 1 hold 1-based gene
            identifiers
        :return: 2D float array
        """
        edges = gold[[0, 1]].values.astype(int)
        regulators = np.unique(edges[:, 0])
        targets = np.unique(edges)  # genes seen in either column
        N, M = int(edges[:, 0].max()), int(edges[:, 1].max())

        G = np.zeros((N, M))
        # every (regulator, target) pair starts as a negative ...
        G[np.ix_(regulators - 1, targets - 1)] = -1
        # ... except self-pairs, which are not evaluated ...
        G[regulators - 1, regulators - 1] = 0
        # ... and the listed edges, which are the positives
        G[edges[:, 0] - 1, edges[:, 1] - 1] = 1
        return G