Source code for dreamtools.dream8.D8C1.scoring

#-*- python -*-
# -*- coding: utf-8 -*-
#
#  This file is part of DREAMTools software
#
#  Copyright (c) 2013-2014 - EBI-EMBL
#
#  File author(s): Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: http://github.com/dreamtools
#
##############################################################################
"""This module provides utilities to compute scores related to HPN-Dream8

It can be used and should be used indepently of Synapse altough for testing,
data sets may be downloading from synapse if you don't have any local files to
play with.

Here is an example related to the Network subchallenge::

    >>> from dreamtools.dream8.D8C1 import scoring
    >>> s = scoring.HPNScoringNetwork()
    >>> s.compute_all_descendant_matrices()
    >>> s.compute_all_rocs()
    >>> s.compute_all_aucs()


https://www.synapse.org/#!Synapse:syn1720047/wiki/60530



"""
import csv
import os
import copy
import json
import zipfile

import pylab
import numpy as np

from dreamtools.core import rocs
from dreamtools.dream8.D8C1 import cython_scoring
from dreamtools.core.ziptools import ZIP
from dreamtools.core.rocs import ROC
from dreamtools.core.challenge import Challenge, LocalData
from dreamtools.dream8.D8C1 import commons


__all__ = ["HPNScoringNetwork", "HPNScoring", "HPNScoringNetworkInsilico",
    "HPNScoringNetwork_ranking", "HPNScoringPrediction",
    "HPNScoringPrediction_ranking", "ScoringError",
    "HPNScoringPredictionInsilico_ranking",
    "HPNScoringPredictionInsilico", "D8C1"]


[docs]class D8C1(Challenge): """Factory for the D8C1 (HPN-Breast challenge)""" def __init__(self, version=2, verbose=True, download=True, **kargs): super(D8C1, self).__init__('D8C1', verbose, download, **kargs) self.sub_challenges = ['SC1A', 'SC1B', 'SC2A', 'SC2B'] self._init() self.version = version # download (_init) only on request so that --info and --onweb # can run without connection to synapse def _init(self): if self._standalone is True: return self._download_data('experimental.zip', 'syn1920412') self._download_data('alphabeta-Network.zip', 'syn4939071') self._download_data('alphabeta-Network-Insilico.zip', 'syn4939076') self._download_data('alphabeta-Prediction.zip', 'syn4939077') self._download_data('alphabeta-Prediction-Insilico.zip', 'syn4939078') self._download_data('TruePrediction.zip', 'syn4939080') self._download_data('TruePredictionInsilico.zip', 'syn4939081') self._download_data('TruePredictionInsilico2.zip', 'syn4939079') self._download_data('TrueDescVectors.zip', 'syn4939082')
[docs] def download_template(self, subname): if subname == 'SC1A': filename = self.get_pathname('alphabeta-Network.zip') elif subname == 'SC1B': filename = self.get_pathname('alphabeta-Network-Insilico.zip') elif subname == 'SC2A': filename = self.get_pathname('alphabeta-Prediction.zip') elif subname == 'SC2B': filename = self.get_pathname('alphabeta-Prediction-Insilico.zip') else: raise ValueError('Invalid name. Use one of %s' % self.sub_challenges) return filename
[docs] def download_goldstandard(self, subname): if subname == 'SC1A': filename = self.get_pathname('TrueDescVectors.zip') elif subname == 'SC1B': filename = self.getpath_gs("TrueGraph.csv") elif subname == 'SC2A': filename = self.get_pathname('TruePrediction.zip') elif subname == 'SC2B': filename = self.get_pathname('TruePredictionInsilico2.zip') else: raise ValueError('Invalid name. Use one of %s' % self.sub_challenges) return filename
[docs] def score(self, filename, subname=None): self._init() if subname == 'SC1A': s = HPNScoringNetwork(filename) s.compute_all_aucs() auc = s.get_auc_final_scoring() return {'meanAUROC': auc} elif subname == 'SC1B': s = HPNScoringNetworkInsilico(filename) s.compute_score() return {'meanAUROC': s.auc} elif subname == 'SC2A': s = HPNScoringPrediction(filename, version=self.version) s.compute_all_rmse() return {'meanRMSE': s.get_mean_rmse()} elif subname == 'SC2B': s = HPNScoringPredictionInsilico(filename, version=self.version) s.compute_all_rmse() return {'meanRMSE': s.get_mean_rmse()} else: raise ValueError('Invalid name. Use one of %s. You provided %s.' % (self.sub_challenges, subname))
[docs]class ScoringError(Exception): """An exception class for scoring classes""" def __init__(self, value): self.value = value def __str__(self): return repr("ScoringError(HPN-DREAM8): %s " % self.value )
[docs]class HPNScoring( ZIP, LocalData): """Base class common to all scoring classes The HPN challenges use data from 32 types of combinaison of cell lines (4) and ligands (8). This class provides aliases to: * valid cell lines (:attr:`~dreamtools.dream8.D8C1.scoring.HPNScoring.valid_cellLines`) * valid ligands (:attr:`~dreamtools.dream8.D8C1.scoring.HPNScoring.valid_ligands`) * expected length of the vectors for each cell line (:attr:`~dreamtools.dream8.D8C1.scoring.HPNScoring.valid_length`) * indices of the row vectors containing the mTOR species within the descendancy matrices (:attr:`~dreamtools.dream8.D8C1.scoring.HPNScoring.mTor_index`) .. note:: all matrices and vectors are sorted according to a hard-coded list of species for a combinaison of cell line and ligand. The species are indeed sorted alphabetically following hhe same order as in the original CSV files containing the data sets. In addition, it the :attr:`score` attributes can be used to store the score computed by :meth:`compute_score` . All classes that need to compute scores require a data file submitted by a participant. We enforce the usage of ZIP file, which can be loaded by using :meth:`loadZIPFile`. """ def __init__(self, verbose=True): super(HPNScoring, self).__init__() LocalData.__init__(self) #: List of valid cell lines (e.g, BT20) self.valid_cellLines = commons.cellLines #: List of valid ligands (e.g, EGF) self.valid_ligands = commons.stimuli #: length of the vectors to be found within each cell line # ignoring TAZ_p89 and FOXO3a_pS318 # sometimes, like in the true descendants, the TAZ and FOX are inlcude # but set to None, which may be confusing self.valid_length = {'BT549': 44, 'MCF7': 39, 'UACC812': 44, 'BT20': 46} self.valid_length_extended = {'BT549': 45, 'MCF7': 41, 'UACC812': 45, 'BT20': 48} #: indices of the mTOR species in the different cell lines within # the true descendants vectors assuming length are 44, 39, 44, 46 that # is excludnig the taz and fox phosphos self.mTor_index = {'BT20': 23, 'UACC812': 21, 'MCF7':21, 'BT549':21} #indices are computed using: self.species_to_ignore = { 'MCF7': ['TAZ_pS89', 'FOXO3a_pS318_S321'], 'BT20': ['TAZ_pS89', 'FOXO3a_pS318_S321'], 'BT549': ['TAZ_pS89'], 'UACC812': ['TAZ_pS89'] } # We need to retrieve list of phosphos as coded in the # experimental data sets . self.cellLines_names_steven = ['UACC812', 'BT549', 'MCF7', 'BT20'] self.ligands_names_steven = ['Serum', 'PBS', 'EGF', 'Insulin', 'FGF1', 'HGF', 'NRG1', 'IGF1'] self._score = None self.verbose = verbose def _get_score(self): return self._score def _set_score(self, value): assert value>=0 and value<=1 self._score = value score = property(_get_score, _set_score, doc="R/W attribute to store the score (in [0,1] only)")
[docs] def error(self, message): """If you want to raise an error, use this method. It raises a ScoringException and set the :attr:`exception` attribute. The message is stored in exception.value If called, the :attr:` ` is set to "INVALID" and the score is set to 1 (worst score). """ print("ERROR:"+ message) ScoringError(message)
[docs] def load_species(self): """Loads names of the expected phospho names for each cell line from the synapse files provided to the users""" # need to refactor that... c = Challenge('D8C1') filename = c.get_pathname('experimental.zip') z = zipfile.ZipFile(filename) self.species = {} for cell in self.valid_cellLines: #print("experimental/MIDAS/MD_%s_main.csv" % cell) ## in py3, z.read returns a bytes hence the str cast thisdata = z.read("experimental/MIDAS/MD_%s_main.csv" % cell).decode() header = thisdata.splitlines()[0] phosphos = [x.split(":")[1] for x in header.split(",") if x.startswith("DV")] phosphos = [x for x in phosphos if x not in self.species_to_ignore[cell]] assert len(phosphos) == self.valid_length[cell], "wrong length (%s) for cell %s " % (len(phosphos), cell) self.species[cell] = phosphos
class HPNScoringNetworkBase(HPNScoring): test_synapse_id = "syn1971273" true_synapse_id = "syn1971278" def __init__(self, filename=None, verbose=True): HPNScoring.__init__(self, verbose=verbose) self.filename = filename self.load_species() def _validate_eda(self, filename): if self.verbose: print("Validating %s" % filename), f = self.zip_data.open(filename) reader = csv.reader(f) reader.next() # skip the header f.close() if self.verbose: print("ok") def _str_cleanup(self, text): # required to cleanup some submissions text = text.replace("\r", "\n") text = text.replace("\n\n", "\n") text.strip() return text def get_eda_file(self, filename): if self.verbose:print("Loading EDA scores from %s" % filename) # read the data and stores it into vectors #index = self.zip_filenames.index(filename) #tail = os.path.split(self.zip_filenames[index])[-1] zipdata = self.zip_data.read(filename).decode() # prevent issue with NULL byte from Boston team data_iter = [x.replace("\x00", "") for x in zipdata.splitlines()] data_iter = [x.replace("\t", " ") for x in data_iter] data_iter = [self._str_cleanup(x) for x in data_iter if len(x)] if len(data_iter) == 1 and "\n" in data_iter[0]: data_iter = data_iter[0].split("\n") # skip header: if "EdgeScore" in data_iter[0]: del data_iter[0] data = [] for i, datum in enumerate(data_iter): datum = datum.replace("=", " = ").replace("("," (").replace(")", ") ") datum = datum.split() if len(datum) == 5: data.append(datum) else: ValueError("EDA file (%s) should be made of 5 columns" % filename) node1 = [x[0] for x in data] node2 = [x[2] for x in data] edges = [x[1] for x in data] scores = [x[4] for x in data] return node1, node2, edges, scores
[docs]class HPNScoringNetwork(HPNScoringNetworkBase): """Class to compute the score of a Network submission A user will provide a ZIP file that contains 65 files: 32 EDA, The 32 files should be tagged with the 32 combos of cell lines and ligands. To create an instance of HPNScoringNetwork, type:: s = HPNScoringNetwork("TeamName-Network.zip") # or later s = HPNScoringNetwork() s.load_submission("TeamName-Network.zip") s.get_auc_final_scoring() # as in the challenge ignoring some regimes You then need to specifically load the EDA files. This may be done with :meth:`load_all_eda_files_from_zip`:: s.load_all_eda_files_from_zip() The content of the ZIP file can be validated using the :meth:`validation` method.:: s.validation() Each EDA and SIF file must be a complete graph where all species correspond to the CSV files provided on the synapse web page. The size of the network varies depending on the cell line. Each EDA file that contains score on each edge and first needs to be transformed into a descendancy matrix. This is achieve via :meth:`compute_descendant_matrix` and/or :meth:`compute_descendant_matrix` methods.:: s.compute_all_descendant_matrices() From each matrix, we'd like to compare a specific row (corresponding to mTOR) to the true scores that are expected. The true descendant for each combinaison of cell line and ligand are provided and loaded in the constructor via :meth:`load_true_descendants_from_zip`, which can be called at any time. """ test_synapse_id = "syn1971273" true_synapse_id = "syn1971278" def __init__(self, filename=None, verbose=False, true_descendants=None): """ :param str filename: """ super(HPNScoringNetwork, self).__init__(filename, verbose=verbose) self._init() if filename: self.load_submission(self.filename) self.robustness_testing = False self.masking = 0.1 if true_descendants is None: self._load_true_descendants_from_zip() else: self.true_descendants = copy.deepcopy(true_descendants)
[docs] def load_submission(self, filename): self.loadZIPFile(filename) self.load_all_eda_files_from_zip()
def _init(self): """Creates attributes to store edge scores from 32 EDA files""" self.edge_scores = dict([(x,{}) for x in self.valid_cellLines]) #self.species = dict([(x,{}) for x in self.valid_cellLines]) self.descendancy_matrices = dict([(x,{}) for x in self.valid_cellLines]) self.roc = dict([(x,{}) for x in self.valid_cellLines]) self.auc = dict([(x,{}) for x in self.valid_cellLines]) self.aupr = dict([(x,{}) for x in self.valid_cellLines])
[docs] def edge_score_to_eda_files(self, teamname): directory = teamname try:os.mkdir(directory) except:pass for k1 in self.edge_scores.keys(): species = self.species[k1] N = len(species) for k2 in self.edge_scores[k1].keys(): filename = teamname + "-" + k1 + "-" + k2 + "-Network.eda" fh = open(directory + os.sep + filename, "w") assert self.edge_scores[k1][k2].shape == (N, N) for i in range(0, N): for j in range(0, N): name1 = species[i] name2 = species[j] value = self.edge_scores[k1][k2][i][j] fh.write("{} (1) {} = {}\n".format(name1, name2,value)) fh.close()
[docs] def validation(self): """General validation * Check that there are 32 EDA files * For each EDA file, calls further check * format of the filename (correct cell line and ligand names) * format of the dataa = character with a RHS and LHS * LHS is made of 3 elements * skip the header """ # check that the number of files are correct. if self.verbose:print("Checking number of files and extensions..."), extensions = [os.path.splitext(x)[1] for x in self.zip_filenames if "MACOSX" not in x] # check that the contents of the SIF are 3 columns if self.verbose: print("ok\nChecking filename and contents of SIF files"), # check that the contents of the EDA files are correct. if self.verbose: print("All SIF files seem ok\nChecking filename and contents of EDA files"), for filename in self.zip_filenames: if filename.endswith(".eda"): self._check_filename(filename) try: self._validate_eda(filename) except: pass if self.verbose:print("All EDA files seem ok")
def _check_filename(self, filename): filename = os.path.split(filename)[1] if len(filename)==0: return try: team, cellLine, ligand, name = filename.split("-") if cellLine not in self.valid_cellLines: self.error(" * filename %s ill-formed. unrecognised Cell Line. must be in %s " % filename, self.valid_cellLines ) if ligand not in self.valid_ligands: self.error(" * filename %s ill-formed. unrecognised ligand. must be in %s " % filename, self.valid_ligands) except ValueError: self.error(" * filename %s is ill formed. Expects TeamName-CellLine-Ligand-Network.eda" % filename)
[docs] def compute_score(self, validation=True): """Computes the final score that is the average over the 32 AUCs This function compute the final score. First, il loads all EDA files provided by the participany (from the ZIP file). Then, it computes the 32 descendant matrices. Finally, it computes the 32 ROCS and AUCS. The scores is for now based on the z-score. Since scores must be between 0 and 1, where 0 is the best, we will need to normalise. :param bool validation: perform validation of the input ZIP file """ # First let us load all EDA edge scores into 32 matrices in edge_scores # together with the correponsind species self.compute_all_aucs() if len(self.auc['BT20']): self.score = self.get_average_auc()
[docs] def get_auc_final_scoring(self): """This function returns the mean AUC using only official ligands as used in final scoring and collaborative rounds. The individual AUCs must be computed first with :meth:`compute_all_aucs` or .:: >>> s = scoring.HPNScoringNetwork(filename) >>> auc = s.get_auc_final_scoring() """ self.compute_all_aucs() ligands = {'BT20': ['IGF1', 'PBS', 'Serum', 'NRG1', 'HGF', 'EGF', 'FGF1'], 'BT549': ['IGF1', 'PBS', 'Serum', 'Insulin', 'HGF', 'EGF', 'FGF1'], 'MCF7': ['IGF1', 'PBS', 'Serum', 'NRG1', 'Insulin', 'HGF', 'EGF', 'FGF1'], 'UACC812': ['PBS', 'Serum', 'NRG1', 'Insulin', 'HGF', 'EGF', 'FGF1', 'IGF1']} auc = np.mean([self.auc[k1][k2] for k1 in self.auc.keys() for k2 in self.auc[k1].keys() if k2 in ligands[k1]]) return auc
def _load_true_descendants_from_zip(self): """Reads true descendants provided by Steven 27 June 2013 The ZIP file should contains 32 vectors that contains the score for the edge positive (1) or negative (0) This method stores the data in the :attr:`true_descendants`. """ self.true_descendants = dict([(x,{}) for x in self.valid_cellLines]) c = Challenge('D8C1') filename = c.get_pathname("TrueDescVectors.zip") zipdata = zipfile.ZipFile(filename) if self.verbose: print("Loading all true descendants data set (%s)" % filename) for filename in zipdata.namelist(): try: teamName, cellLine, ligand = os.path.splitext(filename)[0].split("_") data = zipdata.read(filename).decode() data = data.splitlines()[0] # look at the header data = [int(x) if x!="NaN" else None for x in data.strip().split(',')] self.true_descendants[cellLine][ligand] = data[:] except Exception as err: print("note: skipping %s in your zipped file" % filename) pass
[docs] def load_all_eda_files_from_zip(self): """Loads all EDA file from a participant into :attr:`edge_scores`""" for filename in self.zip_filenames: if filename.endswith(".eda") and "__MACOS" not in filename: self.load_eda_file(filename) # Issue in github https://github.com/dreamtools/dreamtools/issues/16 M = max([self.edge_scores[cell][l].max() for cell in self.cellLines_names_steven for l in self.ligands_names_steven]) for cell in self.cellLines_names_steven: for lig in self.ligands_names_steven: self.edge_scores[cell][lig] /= float(M)
[docs] def load_eda_file(self, filename, local=False): """Loads scores from one EDA file :param filename: here filename should be one of the filename to be found within the ZIP file! This is not a standard file system (See note). Input data is EDA format that is:: A 1 B = 0.4 A 1 C = 0.5 It containts edges such that the final graph is complete and a matrix can be built with column1 as the rows and column2 as the columns. The values being tmade from the fifth column. Second and fourth are ignored. loaded data is stored in :attr:`data` as a numpy matrix. .. note:: to overwrite the input ZIP file, use :meth:`loadZIPFile` """ if self.verbose:print("Loading EDA scores from %s" % filename), if local == True: tail = os.path.split(filename)[-1] data = open(filename) data_iter = csv.reader(data, delimiter=" ") else: # read the data and stores it into vectors index = self.zip_filenames.index(filename) tail = os.path.split(self.zip_filenames[index])[-1] zipdata = self.zip_data.read(filename).decode() data_iter = zipdata.splitlines() if len(data_iter) and "EdgeScore" in data_iter[0]: del data_iter[0] try: teamName, cellLine, ligand, tag = tail.split("-") except: self.error(" * filename %s is ill formed. Expects TeamName-CellLine-Ligand-Network.eda" % filename) return # skip first row #header = data_iter.next() data = [] for i, datum in enumerate(data_iter): datum = datum.replace("=", " = ").replace("("," (").replace(")", ") ") datum = datum.split() if len(datum) == 5: data.append(datum) elif len(datum) == 0 or len(datum) == 1: pass #print("Found empty line in EDA file %s" % filename) else: self.error("- Line %s in %s is ill formed (expected 3 or 5 columns): %s" % (i,filename, datum)) break node1 = [x[0] for x in data] node2 = [x[2] for x in data] #edges = [x[1] for x in data] scores = [x[4] for x in data] N = self.valid_length[cellLine] species = self.species[cellLine] tempdata = np.zeros((N, N)) for x,y,z in zip(node1, node2, scores): if x in self.species_to_ignore[cellLine] or y in self.species_to_ignore[cellLine]: pass else: #update from july 16 . error in MIDAS and CSV PKC-delta should # have been PKC-delta_pS664 so some people use one or the other # similarly for PRKCD if x == "PKC-delta_pS664": x = "PKC-delta" mapping = {'c-RAF_pS338': "c-Raf_pS338", "YB-1_pS102": "YB-1_PS102", "MEK1_pS217_pS221": "MEK1_pS217_S221", "EGFR_pY922": "EGFR_pY992" } for k,v in mapping.items(): if x == k: print("Warning mapping required in the EDA") x = v if y == k: y = v if "SRC_" in x: x = x.replace("SRC_", "Src_") if "SRC_" in y: y = y.replace("SRC_", "Src_") if y == "PKC-delta_pS664": y = "PKC-delta" if "RB_" in x: x = x.replace("RB_", "Rb_") if "RB_" in y: y = y.replace("RB_", "Rb_") try: i = species.index(x) except: self.error("Invalid species(%s) found in the file %s " % (x, filename)) break try: j = species.index(y) except: self.error("Invalid species(%s) found in the file %s " % (y, filename)) break tempdata[i][j] = z self.edge_scores[cellLine][ligand] = abs(tempdata) M = self.edge_scores[cellLine][ligand].max()
#if M>1: # print("!!!!!!!!!!!!!!!!!!!!1, %s %s" % (cellLine, ligand)) # self.edge_scores[cellLine][ligand] /= float(M) # max is now computed across all networks # https://github.com/dreamtools/dreamtools/issues/16 # self.edge_scores[cellLine][ligand] /= float(M)
[docs] def compute_all_descendant_matrices(self): """Compute all descendancy matrices For each cell line and ligand, the matrix is stored in the :attr:`edge_scores` dictionary. .. seealso:: :meth:`compute_descendant_matrix` """ for c in self.valid_cellLines: for l in self.valid_ligands: if self.verbose:print("Computing descendancy matrix for (%s, %s)" % (c,l)) self.compute_descendant_matrix(c,l)
[docs] def compute_descendant_matrix(self, cellLine, ligand): """Computes the descendancy matrix for a given cell line and ligand :param str cellLine: a valid cell line :param str ligand: a valid ligand .. note:: we use a cython module to conmpute the matrix. This function is the bottle neck of the entire procedure to compute the score. This is especially important to estimate te null distribution of the AUCs. Using Cython does not improve much the performance (80%) but it improves it... .. seealso:: :meth:`compute_all_descendant_matrices` """ cython_scoring.compute_descendant_matrix(self, cellLine, ligand)
[docs] def compute_metrics(self, cellLine, ligand): scores = self._get_scores(cellLine, ligand) classes = self._get_classes(cellLine, ligand) classes = [x for x in classes if x!=None] sorted_indices = np.argsort(1-scores) sorted_scores = [scores[i] for i in sorted_indices] sorted_classes = [classes[i] for i in sorted_indices] P = float(classes.count(1)) N = float(classes.count(0)) roc = {"fpr":[], "tpr":[], "precision":[], "FP":[], "TP":[], "recall":[], "Fmeasure":[], 'threshold':[]} for i in range(0, len(scores)): threshold = sorted_scores[i] TP = float(sorted_classes[0:i].count(1)) FP = float(sorted_classes[0:i].count(0)) TPR = TP/P FPR = FP/N precision = TP / (TP+FP) recall = TP / P if recall >0 and precision >0 : Fmeasure = 2./(1./precision+1./recall) else: Fmeasure = np.nan roc["threshold"].append(threshold) roc["FP"].append(FP) roc["TP"].append(TP) roc['tpr'].append(TPR) roc['fpr'].append(FPR) roc['precision'].append(precision) roc['recall'].append(recall) roc['Fmeasure'].append(Fmeasure) return roc
def _compute_fpr_tpr(self, FP, TP, N, P): roc = {"fpr":[], "tpr":[], "FP":[], "TP":[]} roc['fpr'] = FP/float(N) roc['tpr'] = TP/float(P) roc['TP'] = TP roc['FP'] = FP return roc
[docs] def compute_roc(self, cellLine, ligand): """Compute the ROC curve :param scores: list of scores (probabilities) :param classes: list of classes (true values) .. seealso:: :meth:`compute_roc` """ scores = self.descendancy_matrices[cellLine][ligand][self.mTor_index[cellLine]] # remove the None otherwise classes and scores are different classes = [x for x in self.true_descendants[cellLine][ligand] if x!=None] if self.robustness_testing: N = len(scores) from random import sample indices = sample(range(0,N), int((1-self.masking)*N)) scores = np.array([s for i,s in enumerate(scores) if i in indices]) classes = [s for i,s in enumerate(classes) if i in indices] assert len(scores) == len(classes) sorted_indices = np.argsort(1-scores) sorted_scores = [scores[i] for i in sorted_indices] sorted_classes = [classes[i] for i in sorted_indices] P = float(classes.count(1)) N = float(classes.count(0)) FP = 0 TP = 0 # roc = {"fpr":[], "tpr":[], "precision":[], "FP":[], "TP":[], "recall":[], "Fmeasure":[], 'threshold':[]} fprev = 1.1 # scores are less than 1 # an efficient algorithm to compute ROC based on T.Fawcett Pattern # Recognition Letters 27, 2006 i = 0 while i<len(scores): cls = sorted_classes[i] score = sorted_scores[i] if score != fprev and cls!=None: thisroc = self._compute_fpr_tpr(FP, TP, N, P) for key in ['fpr', 'tpr', "FP", "TP"]: roc[key].append(thisroc[key]) roc['threshold'].append(fprev) fprev = score if cls == 1: TP += 1 elif cls == 0: FP += 1 elif cls == None: pass else: raise ValueError("found a class value different from 0,1 or None") i += 1 thisroc = self._compute_fpr_tpr(FP, TP, N, P) for key in ['fpr', 'tpr', "FP", "TP"]: roc[key].append(thisroc[key]) roc['threshold'].append(sorted_scores[-1]) roc = self.compute_other_metrics(roc) return roc
[docs] def compute_other_metrics(self, roc): #Be aware that there is alway a first value TP=0,FP=0 # this should be handled with care the recall/precision computatio # This first value is used to compute fpr and tpr but should not be used # here FP = np.array([float(x) for x in roc['FP']]) TP = np.array([float(x) for x in roc['TP']]) N = max(FP) + max(TP) # The first value of precision is undefined since FP+TP=0 # It does not matter since it will be overwritten as explained here # below precision = TP / (FP+TP) recall = TP/float(max(TP)) # The second value may also be be an issue. There are 2 cases. # 1. TP>0 # 2. TP == 0 # In the second case, TP=0 and FP==0 should not happen. FP > 0 instead. # In which case, recall = 0 and precision=0. No ambiguity # In the first case, if TP>0 then precision >0 and recall > 0. # Therefore we end up without a defined value at recall==0 # So we should set a recall=0 for which precision will be the same as # the one where recall >0 # somehow, precision and recall if TP[1]>0: precision[0] = precision[1] else: precision[0] = 0 roc['precision'] = [float(x) for x in precision] roc['recall'] = [float(x) for x in recall] return roc
[docs] def compute_auc(self, roc): """Compute AUC given a ROC data set :param str roc: The roc data structure must be a dictionary with "tpr" key. Could be an variable returned by :meth:`compute_roc`. """ import scipy.integrate value = scipy.integrate.trapz(roc['tpr'], roc['fpr']) return value
def _get_scores(self, cellLine, ligand): """Returns scores of a given descendancy matrix taking care of mTor index""" scores = self.descendancy_matrices[cellLine][ligand][self.mTor_index[cellLine]] return scores def _get_classes(self, cellLine, ligand): classes = self.true_descendants[cellLine][ligand] return classes
[docs] def compute_all_rocs(self): """Computes all ROC curves This function can be called once EDA files are loaded and all descendant matrices have been computed as well. .. seealso:: :meth:`load_all_eda_files_from_zip`, :meth:`compute_all_descendant_matrices` """ for c in self.valid_cellLines: for l in self.valid_ligands: roc = self.compute_roc(c, l ) self.roc[c][l] = roc.copy()
[docs] def compute_all_aucs(self): """Computes all AUC This function can be called once EDA files are loaded and all descendant matrices have been computed as well. In theory, one should compute ROC and then AUC but this function recomputes ROC since it is fast to compute. .. seealso:: :meth:`load_all_eda_files_from_zip`, :meth:`compute_all_descendant_matrices` """ self.validation() self.compute_all_descendant_matrices() for c in self.valid_cellLines: for l in self.valid_ligands: roc = self.compute_roc(c, l) self.roc[c][l] = roc.copy() auc = self.compute_auc(roc) self.auc[c][l] = auc
[docs] def compute_all_auprs(self): for c in self.valid_cellLines: for l in self.valid_ligands: roc = self.compute_roc(c, l ) self.roc[c][l] = roc.copy() aupr = self.compute_aupr(roc) self.aupr[c][l] = aupr
[docs] def compute_all_metrics(self): for c in self.valid_cellLines: for l in self.valid_ligands: roc = self.compute_metrics(c, l) self.roc[c][l] = roc.copy() auc = self.compute_aupr(roc) self.aupr[c][l] = auc
[docs] def compute_aupr(self, roc): import scipy.integrate value = scipy.integrate.trapz(roc['precision'], x=roc['recall']) return value
[docs] def get_aucs(self): """Returns all AUCs""" values = [self.auc[k1][k2] for k1 in self.auc.keys() for k2 in self.auc[k1].keys()] return values
[docs] def get_average_auc(self): """Returns mean of all AUCs""" values = self.get_aucs() return np.mean(values)
[docs] def plot_roc(self, cellLine, ligand, hold=False): """Plots a psecific ROC curve""" from pylab import plot, clf, grid, title x = self.roc[cellLine][ligand]['fpr'] y = self.roc[cellLine][ligand]['tpr'] auc = self.auc[cellLine][ligand] if hold == False: clf() else: from pylab import hold hold(True) plot(x, y, '-o') plot(x,x,'r') title("ROC for %s/%s (AUC=%s)" % (cellLine, ligand, auc)) grid(True)
[docs] def get_null_distribution(self, sample=100, cellLine="BT20", ligand="EGF", store_rocs=False, distr="uniform"): """Computes the null distribution for a given combinaison * Creates a uniformly distribution of a EDA file and stores it in the edge_score attribute. * recompute the corresponding descendancy matrix * Get the corresponding true prediction * compute the ROC and AUC :param int sample: number of distribution to compute :param cellLine: :param ligand: :param bool store_rocs: if set to True, save the rocs as well :return: rocs and aucs (rocs is set to [] for debugging) .. plot:: :include-source: from dreamtools.dream8.D8C1 import scoring from pylab import clf, plot, hist, grid, pi, exp, sqrt, mean, std s = scoring.HPNScoringNetwork() rocs, aucs, auprs = s.get_null_distribution(100) mu = mean(aucs) sigma = std(aucs) clf() res = hist(aucs,50, normed=True) plot(res[1], 1/(sigma * sqrt(2 * pi)) * exp( - (res[1] - mu)**2 / (2 * sigma**2) ), linewidth=2, color='r') grid() """ aucs = [] #rocs = [] auprs = [] N = self.valid_length[cellLine] c = cellLine l = ligand for i in range(0, sample): if distr == "inverse": x = [float(x)/(N*N) for x in range(1, N*N + 1)] np.random.shuffle(x) # in line shuffling #reshape and create numpy array self.edge_scores[c][l] = np.array(x).reshape(N,N) elif distr == "uniform": self.edge_scores[c][l] = np.random.uniform(size=N*N).reshape(N,N) self.compute_descendant_matrix(c,l) roc = self.compute_roc(c, l) auc = self.compute_auc(roc) aupr = self.compute_aupr(roc) if store_rocs: rocs.append(roc) aucs.append(auc) auprs.append(aupr) return rocs, aucs, auprs
[docs] def plot_all_rocs(self, cellLines=None): """Plots all 32 ROC once scores/rocs have been computed .. plot:: :include-source: :width: 80% from pylab import clf, plot, hist, grid from dreamtools.dream8.D8C1 import scoring import os s = scoring.HPNScoringNetwork() from dreamtools import D8C1 filename = D8C1().download_template('SC1A') s.load_submission(filename) s.compute_score() s.plot_all_rocs() """ if cellLines is None: cellLines = self.valid_cellLines else: cellLines = [cellLines] pylab.clf() for c in cellLines: for l in self.valid_ligands: x = self.roc[c][l]['fpr'] y = self.roc[c][l]['tpr'] #auc = self.auc[c][l] pylab.plot(x, y, '-o', label="%s-%s" % (c,l)) pylab.hold(True) pylab.title("32 ROCs") pylab.grid(True) pylab.plot(x, x, '--k', linewidth=2)
[docs] def get_mean_zscores(self, aucs=None): zscores = self.get_zscores(aucs) mu = np.mean([zscores[k1][k2] for k1 in zscores.keys() for k2 in zscores[k1].keys()]) return mu
[docs] def get_zscores(self, aucs=None): """ """ mu, sigma = self.get_mean_and_sigma_null_parameters() zscores = {} for c in self.valid_cellLines: zscores[c] = {} for l in self.valid_ligands: if aucs: zscores[c][l] = (aucs[c][l] - mu[c][l])/sigma[c][l] else: zscores[c][l] = (self.auc[c][l] - mu[c][l])/sigma[c][l] return zscores
[docs] def print_aucs(self): # the order here is defined to compare with steven results but could be # chnaged later on. self.compute_all_aucs() for c in self.cellLines_names_steven: #["UACC812","BT549","MCF7","BT20"]: print(c, "\t"), for l in self.ligands_names_steven: # ["Serum","PBS", "EGF", "Insulin", "FGF1", "HGF", "NRG1", "IGF1"]: roc = self.compute_roc(c, l) auc = self.compute_auc(roc) print('AUC',auc,"\t "), print()
[docs] def get_mean_and_sigma_null_parameters(self): """Retrieve mean and sigma for 32 combi from a null AUC distribution""" import sc1a_tools null = sc1a_tools.AUCnull(self.valid_cellLines, self.valid_ligands, verbose=False) filename = 'sc1a_null_aucs_mean_sigma.json' null.loadaucs(filename) mean = null.get_mean_dict() sigma = null.get_sigma_dict() return mean, sigma
[docs]class HPNScoringNetwork_ranking(HPNScoring): """This class is used to compute the ranks of the different participants based on an average rank over the 32 combinaisons of cell line and ligands. :: s = HPNScoringNetwork(filename="file1zip") s.compute_all_aucs() sall = HPNScoringNetwork_all() # s.aucs is a list where each element is a dictionary of sall.add_auc(s1.auc, "team1") # let us build aucs2 = copy.deepcopy(s.auc) for c in s.valid_cellLines: for l in s.valid_ligands: auc2[c][l] = numpy.random.uniform(0.5,0.7) sall.add_auc(s2.auc, "team2") sall.get_ranking() {'team1': 1.96875, 'team2': 1.03125} This class is independent of HPNSCoringNetwork. However, it takes as input the returned values of HPNScoringNetwork.compute_all_auc() """ def __init__(self): super(HPNScoringNetwork_ranking, self).__init__() self.aucs = [] self.participants = [] # we remove here the non-robust regime. # communication with steven the 4th Nov to inclde # UACC812,IGF1 The 3 to be removed are: # BT549,FGF1 BT549,NRG1 BT20,insulin # update from Nov 20th, BT549 FGF1 is now back into business self.valid_ligands_final = { 'UACC812': ['PBS', 'Serum', 'NRG1', 'Insulin', 'HGF', 'EGF', 'FGF1', 'IGF1'], 'BT20': ['IGF1', 'PBS', 'Serum', 'NRG1', 'HGF', 'EGF', 'FGF1'], 'BT549': ['IGF1', 'PBS', 'Serum', 'Insulin', 'HGF', 'EGF', 'FGF1'], 'MCF7': ['IGF1', 'PBS', 'Serum', 'NRG1', 'Insulin', 'HGF', 'EGF', 'FGF1'] }
[docs] def get_empty_auc(self): auc = {} for c in self.valid_cellLines: auc[c] = {} for l in self.valid_ligands_final[c]: auc[c][l] = 0 return auc
[docs] def add_auc(self, auc, participant_id): if participant_id in self.participants: raise ValueError("participant already added (%s)" % participant_id) if auc == None: auc = self.get_empty_auc() else: # copy only the combi for the final leaderboard. temp_auc = {} for c in self.valid_cellLines: temp_auc[c] = {} for l in self.valid_ligands_final[c]: temp_auc[c][l] = auc[c][l] self.aucs.append(temp_auc) self.participants.append(participant_id) self._compute_ranks()
def _rank_index(self, vector): sorted_vector = sorted(vector) return [sorted_vector.index(x) for x in vector] def _compute_ranks(self, invalid=False): """If there are some invalid submissions, this may not work anymore """ N = len(self.aucs) ranks = {} for c in self.valid_cellLines: ranks[c] = {} for l in self.valid_ligands_final[c]: # for final scores, some combi are removed # get this AUC for all participant aucs = [x[c][l] for x in self.aucs] M = len([x[c][l] for x in self.aucs if x[c][l]==0]) if M>0: print("what is M: ",M) # but use a reverse order hence 1-x for the sorting if invalid: indices = np.argsort([1-x for x in aucs]) else: indices = [x+1 for x in self._rank_index([1-x for x in aucs])] # need to append all participants that are invalid with same # rank that is maxRank if invalid: ranks[c][l] = [list(indices).index(i)+1 for i in range(0,len(indices))] # reset all final values to max rank maxRank = N-M+1 ranks[c][l] = [x if x<maxRank else maxRank for x in ranks[c][l]] else: ranks[c][l] = indices[:] self.ranks = copy.deepcopy(ranks)
[docs] def get_ranking(self): ranking = {} for k in self.participants: ranks = self.get_rank_participant(k) mu = np.mean([ranks[k1][k2] for k1 in ranks.keys() for k2 in ranks[k1].keys()]) ranking[k] = mu return ranking
[docs] def get_mean_ranks(self): return self.get_ranking()
[docs] def get_integer_ranks(self): ranking = self.get_ranking() integer_ranks = {} # get teams and ranks teams_ranks = list(ranking.items()) teams = [x[0] for x in teams_ranks] mean_ranks = [x[1] for x in teams_ranks] for i, index in enumerate(np.argsort(mean_ranks)): integer_ranks[teams[int(index)]] = i+1 return integer_ranks
[docs] def get_rank_participant(self, participant): ranks = {} if participant in self.participants: index = self.participants.index(participant) for c in self.valid_cellLines: ranks[c] = {} for l in self.valid_ligands_final[c]: ranks[c][l] = self.ranks[c][l][index] return ranks else: raise ValueError("unknown participant")
# duplicated from HPNScoringNetwork to update leadeboard easily
[docs] def get_mean_zscores(self): zscores = {} for i,k in enumerate(self.participants): zscore = self._get_zscores(self.aucs[i]) mu = np.mean([zscore[k1][k2] for k1 in zscore.keys() for k2 in zscore[k1].keys()]) zscores[k] = mu return zscores
# duplicated from HPNScoringNetwork to update leadeboard easily def _get_zscores(self, aucs): """ """ mean, sigma = self._get_mean_and_sigma_null_parameters() zscores = {} for c in self.valid_cellLines: zscores[c] = {} for l in self.valid_ligands_final[c]: zscores[c][l] = (aucs[c][l] - mean[c][l])/sigma[c][l] return zscores # duplicated from HPNScoringNetwork to update leadeboard easily def _get_mean_and_sigma_null_parameters(self): """Retrieve mean and sigma for 32 combi from a null AUC distribution""" from . import sc1a_tools null = sc1a_tools.AUCnull(self.valid_cellLines, self.valid_ligands, verbose=False) filename = 'sc1a_null_aucs_mean_sigma.json' filename = self.getpath_data(filename) null.loadaucs(filename) mean = null.get_mean_dict() sigma = null.get_sigma_dict() return mean, sigma
[docs]class HPNScoringNetworkInsilico(HPNScoringNetworkBase): """Scoring class for HPN DREAM8 Network Insilico sub challenge This class retrieves the true graph and a test example from synapse. :: from dreamtools.dream8.D8C1 import HPNScoringNetworkInsilico s = HPNScoringNetworkInsilico() import os filename = s.download_template("SC1B") s.read_file(filename) .. note:: If you want to test your own local file, provide a filename. """ test_synapse_id = "syn1973430" true_synapse_id = "syn1976597" def __init__(self, filename=None, verbose=False): super(HPNScoringNetworkInsilico, self).__init__(filename, verbose=verbose) self.read_file(filename) self.true_graph = self._load_true_graph_from_zip()
[docs] def read_file(self, filename): self.filename = filename try: self.loadZIPFile(self.filename) except Exception as err: print(err) # it not read permission loadZIP and get_eda will fail, self.error("Could not read the data (invalid ZIP ?)") try: self.user_graph = self._get_participant_eda() except Exception as err: print(err) # it not read permission loadZIP and get_eda will fail, self.error("Could not compute EDA")
def _load_true_graph_from_zip(self): """Reads true graph provided by Steven 10 July 2013 The ZIP file should contain a 20 by 20 matrix """ filename = self.getpath_gs("TrueGraph.csv") reader = csv.reader(open(filename, "r")) data = np.array(list(reader), dtype="float") return data def _get_participant_eda(self): """This function retrieves the EDA file from a participant :return: a numpy matrix with column and row names stored in :attr:`species` """ filenames = [x for x in self.zip_filenames if x.endswith(".eda") and "__MACOSX" not in x] assert len(filenames) == 1 filename = filenames[0] node1, node2, edges, scores = self.get_eda_file(filename) N = 20 species = ["AB"+str(i) for i in range(1,N+1)] tempdata = np.zeros((N, N)) for x,y,z in zip(node1, node2, scores): i = species.index(x.upper()) j = species.index(y.upper()) tempdata[i][j] = z tempdata = abs(tempdata) M = tempdata.max() if M>1: tempdata /= float(M) #self.species = species[:] return tempdata
[docs] def to_eda(self, filename): """EXports the user EDA file""" fh = open(filename, "w") fh.write("EdgeScore\n") for i in range(0,20): for j in range(0,20): fh.write("AB%s (1) AB%s = %s\n" % (i+1, j+1, self.user_graph[i][j])) fh.close()
[docs] def get_roc(self): """Gets a ROC instance using thegiven the user and true graphs as inputs""" roc = ROC() roc.scores = self.user_graph.flatten() roc.classes = self.true_graph.flatten() return roc
[docs] def get_auc(self): roc = ROC() roc.scores = self.user_graph.flatten() roc.classes = self.true_graph.flatten() roc_data = roc.get_roc() auc = roc.compute_auc(roc_data) return auc
def _get_auc(self): return self.get_auc() auc = property(_get_auc)
[docs] def get_null_auc_aupr(self, N): """Get null distribution of the AUCs and AUPRs :param int N: number of samples :return: tuple made of 2 lists: the AUCc ad AUPRs """ roc = rocs.ROC() roc.scores = self.user_graph.flatten() roc.classes = self.true_graph.flatten() aucs = [] auprs = [] for i in range(0, N): if divmod(i, 1000)[1] == 0 and i!=0: print("%s completed" % (i/float(N)*100)) roc.scores = np.random.uniform(0,1,400) roc_data = roc.get_roc() auc = roc.compute_auc(roc_data) aupr = roc.compute_aupr(roc_data) aucs.append(auc) auprs.append(aupr) return aucs, auprs
[docs] def plot_null_distribution(self, aucs=None, auprs=None, N=10000): """Plots the null distribution of the AUCs .. plot:: :include-source: :width: 80% from dreamtools.dream8.D8C1 import HPNScoringNetworkInsilico from dreamtools import D8C1 import os s = HPNScoringNetworkInsilico() filename = D8C1().download_template('SC1B') s.read_file(filename) aucs, auprs = s.get_null_auc_aupr(1000) s.plot_null_distribution(aucs) from pylab import xlim xlim([0.35,0.65]) """ if aucs == None and auprs == None: aucs, auprs = self.get_null_auc_aupr(N) if aucs: pylab.figure(1) pylab.clf() pylab.hist(aucs, bins=100, normed=True) pylab.grid(True) pylab.xlim([0,1]) pylab.title("AUCs null distribution") import scipy.stats mu, sigma = scipy.stats.norm.fit(aucs) X = pylab.linspace(0,1,100) pylab.plot(X, scipy.stats.norm.pdf(X, loc=mu, scale=sigma), "r") #except Exception: # raise Exception pylab.savefig("SC1B_aucs_null_distribution.png") if auprs: pylab.figure(2) pylab.clf() auprs2 = [x for x in auprs if np.isnan(x)==False] pylab.hist(auprs2, bins=100, normed=True) pylab.xlim([0,1]) pylab.grid(True) pylab.title("AUCs null distribution") pylab.savefig("SC1B_auprs_null_distribution.png")
[docs] def get_zscore(self): """Returns scores for the current submission :return: a single value based on the assumption that the distribution of the NULL AUC follows a gaussian distribution with parameters that are hardcoded as mu=0.497404 and std=0.037436. aucs2, auprs2 = s.get_null_auc_aupr(500000) scipy.stats.gamma.fit([x for x in auprs if numpy.isnan(x)==False]) scipy.stats.norm.fit(aucs) """ # based on 1,000,000 simu norm_params = {"mu": 0.4974677, "sigma": 0.0373782} #gamma_params = { # "a": 7.23283255, # "loc": 0.13995634, # ":scale": 0.00838379} roc = self.get_roc() roc_data = roc.get_roc() auc = roc.compute_auc(roc_data) #aupr = roc.compute_aupr(roc_data) return (auc-norm_params['mu'])/ norm_params['sigma']
[docs] def compute_score(self): """The official score for the SC1B challenge :return: zscore """ try: zscore = self.get_zscore() return zscore except: pass
class HPNScoringPredictionBase(HPNScoring): def __init__(self, filename=None, verbose=False): super(HPNScoringPredictionBase, self).__init__(verbose=verbose) self.times = [0, 5, 15, 30, 60, 120,240] self.load_species() self.filename = filename
[docs]class HPNScoringPrediction(HPNScoringPredictionBase): test_synapse_id = "syn2000886" true_synapse_id = "syn2009136" # for now it is local in the SC2A file. Could use # theMIDAS as well def __init__(self, filename=None, version=2, verbose=False): super(HPNScoringPrediction, self).__init__(filename) self.loadZIPFile(self.filename) # version 1 is the first LB before publication # version 2 is the corrected and final LB after publication assert version in [1, 2] self._version = version c = Challenge('D8C1') filename = c.get_pathname("TruePrediction.zip") self.true_desc_filename = filename # same as species_to_ignore + mTOR + target of the inhibitors email # steven hill 10/7/2013 self.phosphos_to_exclude = { 'MCF7': ['TAZ_pS89', 'FOXO3a_pS318_S321', 'mTOR_pS2448'], 'UACC812': ['TAZ_pS89','mTOR_pS2448'], 'BT20': ['TAZ_pS89', 'FOXO3a_pS318_S321', 'mTOR_pS2448'], 'BT549': ['TAZ_pS89','mTOR_pS2448'] } self.rmse = {} self.get_true_prediction() self.get_user_prediction()
[docs] def get_user_prediction(self): """ should be MIDAS files as in https://www.synapse.org/#!Synapse:syn1973835 """ # look only at inhibitor 3 self.user_prediction = {} filenames = [x for x in self.zip_filenames if x.endswith(".csv") if "TestInhib3" in x and "MACOS" not in x] if len(filenames) != 4: self.error("Got (%s) files with a correct pattern. Expected 4" % len(filenames)) print(filenames) if self.verbose: print("Loading MIDAS files") for filename in filenames: tail = os.path.split(filename)[-1] team, cell, ligand, other = tail.split("-") if cell not in self.valid_cellLines: self.error("filename with invalid cellLine %s " % cell) # not easy to find universal way of reading zip file and decode it # under max/linux/windows across py2 and py3... zipdata = self.zip_data.read(filename).decode() zipdata = zipdata.splitlines() #data_iter = csv.reader(zipdata, delimiter=",") # get the header header = zipdata[0] header = [x.strip() for x in header.split(",")] # There should be 56 lines exacly with the 8 stimuli times 7 times # for all phosphos if len(header) == 0: self.error("Cannot read the header : %s in %s" % (header, filename )) return if header[0] != "TR:%s:CellLine" % cell: self.error("Error in the header of %s (first row)" % filename ) stimuli = [x.split(":")[1] for x in header[1:9]] if sorted(stimuli) != sorted(self.valid_ligands): self.error("Error in the header of %s (invalid stimuli ?)" % filename ) if header[9] not in ["DA:ALL", "DA::ALL"]: self.error("Expected a DA:ALL column in the header at column 10, which was not found") phosphos = [x.split(":")[1] for x in header[10:]] if len(phosphos) != self.valid_length[cell] and len(phosphos) != self.valid_length_extended[cell]: print("WARNING: length don't seem correct") if "PKC-delta_pS664" in phosphos: index = phosphos.index("PKC-delta_pS664") phosphos[index] = "PKC-delta" self.user_prediction[cell] = {} for phospho in phosphos: self.user_prediction[cell][phospho] = {} for ligand in self.valid_ligands: self.user_prediction[cell][phospho][ligand] = [0,0,0,0,0,0,0] # 7 times # TODO: check that phosphos agree with the true prediction ? for iter, row in enumerate(zipdata[1:]): row = row.split(",")# get time index if len(row)==0: continue time = int(row[9]) itime = self.times.index(time) # get the stimuli name for that row this_stimuli = [int(x) for x in row[1:9]] if sum(this_stimuli) !=1: self.error("Error: found a row with 0 or more than 1 stimuli on row %s (cell %s)" % (iter, cell)) this_stimulus = self.valid_ligands[this_stimuli.index(1)] # get the unique ligand name for ip, phospho in enumerate(phosphos): datum = row[10 + ip] if datum == "NA": datum = np.nan else: try: datum = float(datum) except Exception: print(row) raise Exception self.user_prediction[cell][phospho][this_stimulus][itime] = datum
[docs] def get_true_prediction(self): """Reads true predcition from the 4 CSV files that contain the true prediction data is stored as follows in the tru_prediction attribute: """ self.true_prediction = {} zipdata = zipfile.ZipFile(self.true_desc_filename) for cell in self.valid_cellLines: self.true_prediction[cell] = {} filename = [x for x in zipdata.namelist() if "%s_main_Test.csv" % cell in x][0] data = zipdata.read(filename).decode().splitlines() #filename = "SC2A/%s_main_Test.csv" % cell #print("Reading true prediction from %s" % filename) #fh = open(filename, "r") #data = csv.reader(fh) # skip first line data = data[1:] if cell == "UACC812": # skip another line data = data[1:] names = data[0][:] hugoId = data[1][:] # skip another line data = data[3:] #dummy = str(next(data)) names = [x.strip() for x in names.split(",")] # now the data with col1 = cellline, co2=inhibito, col3=stimulus, # col4=timepoints indices_to_exclude = [0, 1, 2, 3] # create the phospho keys for this in self.phosphos_to_exclude[cell]: index = names.index(this) indices_to_exclude.append(index) # create the key to hold this cell line self.true_prediction[cell] = {} # create the key to hold all phosphos for i,name in enumerate(names): if i not in indices_to_exclude: self.true_prediction[cell][name] = {} # create the key to hold all ligans for ligand in self.valid_ligands: # build up the 7-length vector to hold the data # each element is a list so that we can append # replicates if any # note a python caveat [[]] * 7 creates 7 references # empty lists. don't use. self.true_prediction[cell][name][ligand] = [[],[],[],[],[],[],[]] else: pass # now scan the data times = ["0min", "5min", "15min", "30min", "60min", "2hr", "4hr"] for row in data: row = [x.strip() for x in str(row).split(",")] if row[1] == "AZD8055": # this is the inhibitor we are interested in ligand = row[2] if ligand not in self.valid_ligands and ligand!="": raise ValueError("Invalid ligand") # get time and correspoding index in the list of valid times thistime = row[3] time_index = times.index(thistime) # append the data to the relevant phospho index for i, phospho in enumerate(names): # ignore the indices related to FOX, TAX, mTor if i not in indices_to_exclude: datum = row[i] if ligand == "": # happends for time=0min only assert time_index==0 for this_ligand in self.valid_ligands: self.true_prediction[cell][phospho][this_ligand][0].append(float(datum)) else: assert time_index!=0 self.true_prediction[cell][phospho][ligand][time_index].append(float(datum)) # Finally, we need to get the average to take care of replicates for phospho in self.true_prediction[cell].keys(): for ligand in self.true_prediction[cell][phospho].keys(): for i,time in enumerate(times): vector = self.true_prediction[cell][phospho][ligand][i] if len(vector)==0: m = np.nan else: m = 2**(sum([np.log2(x) for x in vector])/float(len(vector))) self.true_prediction[cell][phospho][ligand][i] = m # some sanity checks for key in self.valid_length.keys(): assert self.valid_length_extended[key]-len(self.phosphos_to_exclude[key]) == len(self.true_prediction[key].keys())
[docs] def get_rmse(self, cellLine, phospho): """ .. warning:: x in converted into a log2 scale """ RMSE = 0 #T = 7. #S = 8. counter = 0 for s in self.valid_ligands: true = np.array(self.true_prediction[cellLine][phospho][s]) user = np.array(self.user_prediction[cellLine][phospho][s]) # some values provided may be set to 0 # so remove the infinite caused by log(0). #RMSE += nansum( [x for x in (log2(true)-log2(user))**2 if x!=inf] ) data = [x for x in (np.log2(true)-np.log2(user))**2 if x!=np.inf] N = len([x for x in data if np.isnan(x) == False]) try: import bottleneck as bn RMSE += bn.nansum(data) except: RMSE += np.nansum(data) #RMSE += np.nansum(data) counter += N # TODO: if nans are include, do we need to change T ? if counter != 0: RMSE /= float(counter) RMSE = np.sqrt(RMSE) else: RMSE = np.nan return RMSE
[docs] def compute_all_rmse(self): """Some species were removed on purpose during the analysis Those are hardcoded. To compute null distribution, we can keep all the species, in which case, _version parameter must be 0 must be set to False. """ self.rmse = {} for c in self.valid_cellLines: self.rmse[c] = {} for l in self.species[c]: if self._version == 1: if c == "MCF7" and l in [ 'c-Met_pY1235', "YB-1_PS102"]: continue if c == "BT20" and l in ["4EBP1_pS65", '4EBP1_pT37_pT46', "c-Met_pY1235"]: continue if c == "UACC812" and l in ["4EBP1_pT37_pT46", "4EBP1_pS65"]: continue elif self._version == 2: if c == "MCF7" and l in ['c-Met_pY1235', "YB-1_PS102", "PDK1_pS241"]: continue if c == "BT20" and l in ["4EBP1_pS65", '4EBP1_pT37_pT46', "c-Met_pY1235"]: continue if c == "UACC812" and l in ["4EBP1_pT37_pT46", "4EBP1_pS65", "CHK1_pS345", "NDRG1_pT346", 'PDK1_pS241', 'PKC-alpha_pS657', 'PKC-delta']: continue if c == 'BT549' and l in ['BAD_pS112', 'HER3_pY1298', 'PKC-delta']: continue else: # keep going don't skip anything (for null distro) pass if l not in self.phosphos_to_exclude[c]: rmse = self.get_rmse(c, l) self.rmse[c][l] = rmse else: if self.verbose:print("skip %s/%s" % (c,l))
[docs] def get_training_data(self): self.training = {} filenames = [ "MD-BT20_main_for_null.csv", "MD-BT549_main_for_null.csv", "MD-MCF7_main_for_null.csv", "MD-UACC812_main_for_null.csv"] for filename in filenames: dummy, cell = filename.split("-") cell, other = cell.split("_", 1) filename = self.getpath_data(filename) if self.verbose: print("Scanning %s" % filename) data_iter = csv.reader(open(filename, "r"), delimiter=",") # get the header header = data_iter.next() # There should be 56 lines exacly with the 8 stimuli times 7 times # for all phosphos if header[0] != "TR:%s:CellLine" % cell: self.error("Error in the header of %s (first row)" % filename ) stimuli = [x.split(":")[1] for x in header[1:9]] if sorted(stimuli) != sorted(self.valid_ligands): self.error("Error in the header of %s (invalid stimuli ?)" % filename ) N = 3 if cell == "BT549": N = 2 #inhibitors = [] if header[8+N+1] != "DA:ALL": self.error("Expected a DA:ALL column in the header at column 11 or 12, which was not found") phosphos = [x.split(":")[1] for x in header[N+8+2:]] if len(phosphos) != self.valid_length[cell] and len(phosphos) != self.valid_length_extended[cell]: print("WARNING: length don't seem correct") #if "PKC-delta_pS664" in phosphos: # index = phosphos.index("PKC-delta_pS664") # phosphos[index] = "PKC-delta" self.training[cell] = {} for phospho in phosphos: self.training[cell][phospho] = [] # TODO: check that phosphos agree with the true prediction ? for iter, row in enumerate(data_iter): # append all data for a given phosphos over all stims, # inhibitor+DMSO and times for ip, phospho in enumerate(phosphos): datum = row[N+8+2+ip] if datum == "NA": datum = np.nan else: datum = float(datum) self.training[cell][phospho].append(datum)
[docs] def get_null(self, N=100, tag="sc2a"): """ :: s = HPNScoringPrediction() nulls = s.get_null(1000) # the nulls contains the 4 cell lines # let us save the first one for name in ['UACC812', 'BT549', 'MCF7', 'BT20']: data = [x[name] for x in nulls] fh = open('%s.json' % name, 'w') import json json.dump(data, fh) fh.close() """ self.get_training_data() all_rmses = [] for i in range(0, N): print(tag + " " + str(i)) temp = self._filter_species self.user_prediction = self.create_random_data() self.compute_all_rmse() self._filter_species = temp all_rmses.append(copy.deepcopy(self.rmse)) return all_rmses
[docs] def create_random_data(self): """ Here, we don't want the true prediction that contains only what is requested (AZD8055) but the orignal training data with 2 or 3 inhibitors such as GSK and PD17 so that we can shuffle them. We want to select for a given cell line and phosphos a data set to fill at a given time. The datum is selected accross the 8 stimuli, inhibitors +DMSO, and time points. TAZ and FOX were asked to be excluded so this cause some trouble now but some user preidction still include them. Should add a if statement to ignore them. Does not matter to compute the null distribution """ import random #data = copy.deepcopy(self.user_prediction) data = {} self.compute_all_rmse() for c in self.rmse.keys(): data[c] = {} for p in self.rmse[c].keys(): data[c][p] = {} #for s in data[c][p].keys(): for s in self.ligands_names_steven: # select 7 random values for 7 time points for each stimuli data[c][p][s] = random.sample(self.training[c][p], 7)[:] return data
def _get_mean_and_sigma_null_parameters(self): """Retrieve mean and sigma for 4 celllines and phosphos""" filename = 'sc2a_null_mu_sigma_new.json' filename = os.sep.join(["data", filename]) with open(filename) as fh: mu_sigma = json.load(fh) mu = {} sigma = {} for c in self.valid_cellLines: mu[c] = {} sigma[c] = {} for p in self.valid_phosphos[c]: mu[c][p] = mu_sigma[c][p]['mu'] sigma[c][p] = mu_sigma[c][p]['sigma'] return mu, sigma
[docs] def get_mean_rmse(self): return np.nanmean([self.rmse[k1][k2] for k1 in self.rmse.keys() for k2 in self.rmse[k1].keys()])
[docs]class HPNScoringPredictionInsilico(HPNScoringPredictionBase): """ dimension1 :inhibitor dimenssion2: phosp dimensson3 stimulus dimnesion4 : time """ test_synapse_id = "syn2009175" true_synapse_id = "syn2143242" def __init__(self, filename=None, verbose=False, version=2): """SC2B sub challenge (prediction in silico) :param filename: file to score :param str version: default to 'official' (see note below). Set to anything else to use correct network .. note:: This code use the official gold standard used in https://www.synapse.org/#!Synapse:syn1720047/wiki/60532 . Note, however, that a new corrected version is now provided and may be used. Differences with the official version should be small and have no effect on the ranking shown in the synapse page. """ super(HPNScoringPredictionInsilico, self).__init__(filename) # WRONG NETWORK as used in the official LB before final corrections self.version = version if self.version == 1: c = Challenge('D8C1') fname = c.get_pathname("TruePredictionInsilico.zip") elif self.version == 2: # CORRECT NETWORK c = Challenge('D8C1') fname = c.get_pathname("TruePredictionInsilico2.zip") else: raise ValueError("version must be either 1 or 2") self.true_desc_filename = fname #self.loadZIPFile(self.filename) self.valid_cellLines = [""] self.times = [0, 1,2,4,6,10,15,30,60,120] self.stimuli = ["loLIG1", "hiLIG1", "loLIG2", "hiLIG2", "loLIG1_loLIG2", "loLIG1_hiLIG2", "hiLIG1_loLIG2", "hiLIG1_hiLIG2"] self.inhibitors = ["AB%s"%x for x in range(1,21)] self.phosphos = ["AB%s"%x for x in range(1,21)] if self.filename is not None: self.get_user_prediction() #except: # print("could not read user prediction") self.get_true_prediction()
[docs] def get_user_prediction(self): results = self.read_prediction_insilico(self.filename) self.user_prediction = copy.deepcopy(results)
[docs] def get_true_prediction(self): #results = self.read_prediction_insilico(self.true_desc_filename) results = self.read_true_prediction_michael(self.true_desc_filename) self.true_prediction = copy.deepcopy(results)
[docs] def read_true_prediction_michael(self, filename): # this is almost the same as in the read_prediction results = {} zipdata = zipfile.ZipFile(filename) for inhib in self.inhibitors: # AB19 is missing from Michael set. expected_ending = "-%s-Prediction-Insilico.csv" % inhib filename = [x for x in zipdata.namelist() if x.endswith(expected_ending)] if len(filename) != 1: self.error("Could not find file ending in %s" % expected_ending) break filename = filename[0] # initialise dictionary for the inhibitor results[inhib] = {} # read data data = zipdata.read(filename).decode() # from bytes to str data = data.splitlines() # get header header = data[0] header = [x.strip() for x in header.split(",")] stimuli = [x.split(":")[1] for x in header if x.endswith("Stimuli")] phosphos = [x.split(":")[1] for x in header if x.startswith("DV:")] assert len(phosphos)==20 assert len(stimuli) == 4 assert stimuli == self.stimuli[0:4], "incorrect stimuli %s " % stimuli # create the key to hold all phosphos for i,phospho in enumerate(phosphos): results[inhib][phospho] = {} # create the key to hold all ligans for stimulus in self.stimuli: results[inhib][phospho][stimulus] = [[] for i in range(10)] else: pass # now scan the data for row in data[1:]: row = [x.strip() for x in row.split(",")] these_stimuli = row[1:5] #we have only 4 stimuli im Michael file. # However, several may be st to 1 stim_index = [i for i,stim in enumerate(these_stimuli) if stim == '1' ] assert len(stim_index)==1 or len(stim_index)==2 if len(stim_index) == 1: stim_index = stim_index[0] else: # ["loLIG1", "hiLIG1", "loLIG2", "hiLIG2", "loLIG1_loLIG2", "loLIG1_hiLIG2", "hiLIG1_loLIG2", "hiLIG1_hiLIG2"] if sorted(stim_index) == [0,2]: #loLIG1_loLIG2 stim_index = 4 elif sorted(stim_index) == [0,3]: #loLIG1_hiLIG2 stim_index = 5 elif sorted(stim_index) == [1,2]: #hiLIG1_loLIG2 stim_index = 6 elif sorted(stim_index) == [1,3]: #hiLIG1_hiLIG2 stim_index = 7 else: raise ValueError # get time and correspoding index in the list of valid times thistime = float(row[5]) if thistime == 45: # to skip. THere is an extra time to be ignored since we # asked users not to provide the time 45 continue time_index = self.times.index(thistime) # append the data to the relevant phospho index for i, phospho in enumerate(phosphos): # ignore the indices related to FOX, TAX, mTor datum = row[i+1+4+1] # +cell line +1 time + 4 stimuli if datum == "NA": results[inhib][phospho][self.stimuli[stim_index]][time_index]= np.nan else: results[inhib][phospho][self.stimuli[stim_index]][time_index] = float(datum) return results
[docs] def read_prediction_insilico(self, filename): """Reads true predcition from the 20 CSV files""" results = {} zipdata = zipfile.ZipFile(filename) for inhib in self.inhibitors: expected_ending = "-%s-Prediction-Insilico.csv" % inhib filename = [x for x in zipdata.namelist() if x.endswith(expected_ending) and "MACOS" not in x] if len(filename) != 1: self.error("Could not find file ending in %s" % expected_ending) break filename = filename[0] # initialise dictionary for the inhibitor results[inhib] = {} # read data data = zipdata.read(filename).decode() data = data.splitlines() # get header header = data[0] header = [x.strip() for x in header.split(",")] stimuli = [x.split(":")[1] for x in header if x.endswith("Stimuli")] stimuli = [x.replace("+", "_") for x in stimuli] # some users put "+" instead of "_" ... phosphos = [x.split(":")[1] for x in header if x.startswith("DV:")] if len(phosphos) != 20: self.error("Number of expected phosphos (20 )is incorrect (Found %s in %s) " %(len(phosphos), filename)) continue if len(stimuli) != 8: self.error("Number of expected stimuli (8) is incorrect (Found %s in %s) " %(len(stimuli), filename)) continue for s in stimuli: if s not in self.stimuli: self.error("found incorrect stimuli (%s) in %s" % (s, filename)) continue # create the key to hold all phosphos for i,phospho in enumerate(phosphos): results[inhib][phospho] = {} # create the key to hold all ligans for stimulus in self.stimuli: results[inhib][phospho][stimulus] = [[] for i in range(10)] else: pass # now scan the data # skip header for i, row in enumerate(data[1:]): row = [x.strip() for x in row.split(",")] these_stimuli = [int(float(x)) for x in row[1:9]] stim_index = [i for i, stim in enumerate(these_stimuli) if stim == 1 ] #assert len(stim_index)==0, "no stimuli found on line %s in %s" % (i, filename) assert len(stim_index)==1, "more than 1 stimuli on line %s in %s" % (i, filename) stim_index = stim_index[0] # get time and correspoding index in the list of valid times thistime = float(row[9]) time_index = self.times.index(thistime) # append the data to the relevant phospho index for i, phospho in enumerate(phosphos): # ignore the indices related to FOX, TAX, mTor datum = row[i+1+8+1] # +cell line +1 time + 8 stimuli if datum == "NA" or datum =='?': results[inhib][phospho][stimuli[stim_index]][time_index]= np.nan else: results[inhib][phospho][stimuli[stim_index]][time_index] = float(datum) return results
[docs] def get_rmse(self, inhibitor, phospho): """ .. warning:: x in converted into a log2 scale """ RMSE = 0 #T = 7. #S = 8. counter = 0 for s in self.stimuli: true = np.array(self.true_prediction[inhibitor][phospho][s]) user = np.array(self.user_prediction[inhibitor][phospho][s]) data = [x for x in (np.log2(true)-np.log2(user))**2 if x!=np.inf] N = len([x for x in data if np.isnan(x) == False]) #N = len(data) try: import bottleneck as bn RMSE += bn.nansum(data) except: RMSE += np.nansum(data) counter += N # TODO: if nans are include, do we need to change T ? if counter != 0: RMSE /= float(counter) RMSE = np.sqrt(RMSE) else: RMSE = np.nan return RMSE
[docs] def compute_all_rmse(self, null=False): self.rmse = {} for c in self.inhibitors: if c == "AB9": continue self.rmse[c] = {} for l in self.phosphos: if l==c: continue if l in ["AB3", "AB13", "AB18"] or c in ["AB3", "AB13", "AB18"]: continue # those nodes have been ignored in the official leaderboards # based on the participants submissions. # To get the same results for a new participants, those nodes # should be keep the same. Although, we a entirely new set of # participants, those should be updated. if null is True: dummy_nodes = [] else: #computed using 100,000 null distributions dummy_nodes = [(2,1), (6,1),(7,1), (6,4),(2,5), (6,5),(16,6), (16,7), (6,8), (16,8),(6,9),(7,9),(16,9),(20,9), (6,10),(6,12),(2,14),(6,14), (7,14), (2,15), (16,15), (5,16), (5,17), (5,19), (16,19)] if self.version == 2: dummy_nodes = [(16,4), (16,14)] stop = False for node in dummy_nodes: n1 = "AB" + str(node[0]) n2 = "AB" + str(node[1]) if c == n1 and l == n2: stop = True if stop: continue if l != c: rmse = self.get_rmse(c, l) self.rmse[c][l] = rmse
[docs] def get_training_data(self): self.training = {} filenames = self.getpath_data("MD_insilico.csv") for filename in filenames: if self.verbose: print("Scanning %s" % filename) data_iter = csv.reader(open(filename, "r"), delimiter=",") # get the header header = data_iter.next() # There should be 56 lines exacly with the 8 stimuli times 7 times # for all phosphos if header[0] != "TR:inSilico:CellLine": self.error("Error in the header of %s (first row)" % filename ) stimuli = [x.split(":")[1] for x in header[1:8]] #inhibitors = [] if header[8] != "DA:ALL": self.error("Expected a DA:ALL column in the header at column 9, which was not found") phosphos = [x.split(":")[1] for x in header[9:]] self.training = {} for phospho in phosphos: self.training[phospho] = [] for iter, row in enumerate(data_iter): # append all data for a given phosphos over all stims, # inhibitor+DMSO and times for ip, phospho in enumerate(phosphos): datum = row[7+2+ip] if datum == "NA": datum = np.nan else: datum = float(datum) self.training[phospho].append(datum)
[docs] def get_null(self, N=100, tag="sc2b"): self.get_training_data() all_rmses = [] for i in range(0, N): print(tag + " " + str(i)) self.user_prediction = self.create_random_data() self.compute_all_rmse(null=True) all_rmses.append(copy.deepcopy(self.rmse)) return all_rmses
[docs] def create_random_data(self): """ Here, we don't want the true prediction that contains only what is requested (AZD8055) but the orignal training data with 2 or 3 inhibitors such as GSK and PD17 so that we can shuffle them. We want to select for a given cell line and phosphos a data set to fill at a given time. The datum is slected accross the 8 stimuli, inhibitors +DMSO, and time points. """ import random #data = copy.deepcopy(self.true_prediction) data = {} for c in self.training.keys(): data[c] = {} for p in self.training.keys(): data[c][p] = {} for s in self.true_prediction[c][p].keys(): # select randomly for 10 time points for each # inhibitor (8) (no 45 minute point). data[c][p][s] = random.sample(self.training[c], 10)[:] return data
def _get_mean_and_sigma_null_parameters(self): """Retrieve mean and sigma for 32 combi from a null AUC distribution""" filename = 'sc2b_null_mu_sigma.json' filename = self.getpath_data(filename) # a dict contain dict (phospho) of dict (phospho) of dict (mu,sigma) with open(filename) as fh: mu_sigma = json.load(fh) mu = {} sigma = {} for c in self.phosphos: mu[c] = {} sigma[c] = {} for p in self.phosphos: if p == c: mu[c][p] = np.nan sigma[c][p] = np.nan else: mu[c][p] = mu_sigma[c][p]['mu'] sigma[c][p] = mu_sigma[c][p]['sigma'] return mu, sigma
[docs] def get_mean_zscores(self): zscore = self.get_zscores(self.rmse) data = [zscore[k1][k2] for k1 in zscore.keys() for k2 in zscore[k1].keys()] mu = np.mean([x for x in data if np.isnan(x)==False]) return mu
[docs] def get_zscores(self, rmses=None): mu, sigma = self._get_mean_and_sigma_null_parameters() zscores = {} for c in self.inhibitors: zscores[c] = {} for p in self.phosphos: if c!=p: if rmses!=None: zscores[c][p] = (rmses[c][p] - mu[c][p])/sigma[c][p] else: zscores[c][p] = (self.rmse[c][p] - mu[c][p])/sigma[c][p] else: zscores[c][p] = np.nan return zscores
[docs] def get_mean_rmse(self): d = self.rmse return np.mean([d[k1][k2] for k1 in d.keys() for k2 in d[k1].keys() if np.isnan(d[k1][k2])==False])
[docs]class HPNScoringPrediction_ranking(HPNScoring): """This class is used to compute the ranks of the different participants based on an average rank over the 4 cell lines times phosphos :: s = HPNScoringPrediction(filename="file1zip") s.compute_all_rmes() r = HPNScoringPrediction_ranking() # s.aucs is a list where each element is a dictionary of r.add_rme(s1.rmse, "team1") rmse1 = r.get_randomised_rmse(r.rmse[0], sigma=1) rmse2 = r.get_randomised_rmse(r.rmse[0], sigma=2) rmse3 = r.get_randomised_rmse(r.rmse[0], sigma=3) r.add_rmse(rmse1, "team2") r.add_rmse(rmse2, "team3") r.add_rmse(rmse3, "team4") sall.add_rmse(s2.rmse, "team2") sall.get_ranking() {'team1': 1.96875, 'team2': 1.03125} This class is independent of HPNSCoringPrediction. However, it takes as input the returned values of HPNScoringPrediction.compute_all_rmse() """ def __init__(self): super(HPNScoringPrediction_ranking, self).__init__() self.valid_cellLines = ['UACC812', 'MCF7', 'BT549', 'BT20'] self.rmse = [] self.participants = [] def _get_empty_rmse(self): data = {} for c in self.valid_cellLines: data[c] = {} for s in self.species[c]: data[c][s] = np.inf return data
[docs] def add_rmse(self, data, participant_id): if participant_id in self.participants: raise ValueError("participant already added") if data == None: data = self._get_empty_rmse() self.rmse.append(copy.deepcopy(data)) self.participants.append(participant_id) self._compute_ranks()
def _get_species(self): if len(self.rmse)>=1: species = {} for c in self.valid_cellLines: species[c] = self.rmse[0][c].keys() return species else: print("you must provide a rmse dict at least to get the species") species = property(_get_species) def _rank_index(self, vector): sorted_vector = sorted(vector) return [sorted_vector.index(x) for x in vector] def _compute_ranks(self): ranks = {} for c in self.valid_cellLines: ranks[c] = {} for l in self.species[c]: # get this RMSE for all participant data = [x[c][l] for x in self.rmse] # but use a reverse order hence 1-x for the sorting indices = np.argsort([x for x in data]) ranks[c][l] = [list(indices).index(i)+1 for i in range(0,len(indices))] self.ranks = copy.deepcopy(ranks)
[docs] def get_ranking(self): ranking = {} for k in self.participants: ranks = self.get_rank_participant(k) mu = np.mean([ranks[k1][k2] for k1 in ranks.keys() for k2 in ranks[k1].keys()]) ranking[k] = mu return ranking
[docs] def get_mean_ranks(self): return self.get_ranking()
[docs] def get_integer_ranks(self): ranking = self.get_ranking() integer_ranks = {} # get teams and ranks teams_ranks = list(ranking.items()) teams = [x[0] for x in teams_ranks] mean_ranks = [x[1] for x in teams_ranks] for i, index in enumerate(np.argsort(mean_ranks)): integer_ranks[teams[int(index)]] = i+1 return integer_ranks
[docs] def get_rank_participant(self, participant): ranks = {} if participant in self.participants: index = self.participants.index(participant) for c in self.valid_cellLines: ranks[c] = {} for l in self.species[c]: ranks[c][l] = self.ranks[c][l][index] return ranks else: raise ValueError("unknown participant")
[docs] def get_randomised_rmse(self, rmse, sigma=1): """THis is useful for testing. See class documentaton""" new_rmse = copy.deepcopy(rmse) for c in self.valid_cellLines: for l in self.species[c]: x = new_rmse[c][l] new_rmse[c][l] = np.uniform(-1 * x, x*2) + x # new values is between 0 and sigma*x #new_rmse[c][l] = x + sigma + x # new values is between 0 and sigma*x return new_rmse
# Duplicated from HPNScoringPrediction def _get_mean_and_sigma_null_parameters(self): """Retrieve mean and sigma for 4 celllines and phosphos""" filename = 'sc2a_null_mu_sigma_new.json' filename = self.getpath_data(filename) with open(filename) as fh: mu_sigma = json.load(fh) mu = {} sigma = {} for c in self.valid_cellLines: mu[c] = {} sigma[c] = {} for p in self.valid_phosphos[c]: mu[c][p] = mu_sigma[c][p]['mu'] sigma[c][p] = mu_sigma[c][p]['sigma'] return mu, sigma # Duplicated from HPNScoringPrediction
[docs] def get_mean_zscores(self): zscores = {} for i,k in enumerate(self.participants): zscore = self._get_zscores(self.rmse[i]) data = [zscore[k1][k2] for k1 in zscore.keys() for k2 in zscore[k1].keys()] data = [x for x in data if np.isnan(x)==False] mu = np.mean(data) zscores[k] = mu return zscores
# Duplicated from HPNScoringPrediction def _get_zscores(self, rmses=None): mu, sigma = self._get_mean_and_sigma_null_parameters() zscores = {} for c in self.valid_cellLines: zscores[c] = {} for p in self.valid_phosphos[c]: if rmses!=None: zscores[c][p] = (rmses[c][p] - mu[c][p])/sigma[c][p] else: zscores[c][p] = (self.rmse[c][p] - mu[c][p])/sigma[c][p] return zscores def _get_valid_phosphos(self): rmse = None for x in self.rmse: if x != None: rmse = x break if rmse == None: raise ValueError("No RMSE found. please add one (not None) using add_rmse") valid_phosphos = {} for c in rmse.keys(): valid_phosphos[c] = rmse[c].keys() return valid_phosphos valid_phosphos = property(_get_valid_phosphos)
[docs]class HPNScoringPredictionInsilico_ranking(HPNScoring): """This class is used to compute the ranks of the different participants based on an average rank over the 4 cell lines times phosphos :: s = HPNScoringPredictionInsilico(filename="file1zip") s.compute_all_rmes() r = HPNScoringPrediction_ranking() # s.aucs is a list where each element is a dictionary of r.add_rme(s1.rmse, "team1") rmse1 = r.get_randomised_rmse(r.rmse[0], sigma=1) rmse2 = r.get_randomised_rmse(r.rmse[0], sigma=2) rmse3 = r.get_randomised_rmse(r.rmse[0], sigma=3) r.add_rmse(rmse1, "team2") r.add_rmse(rmse2, "team3") r.add_rmse(rmse3, "team4") sall.add_rmse(s2.rmse, "team2") sall.get_ranking() {'team1': 1.96875, 'team2': 1.03125} This class is independent of HPNSCoringPrediction. However, it takes as input the returned values of HPNScoringPrediction.compute_all_rmse() """ def __init__(self): super(HPNScoringPredictionInsilico_ranking, self).__init__() self.times = [0, 1,2,4,6,10,15,30,60,120] self.inhibitors = ["AB%s"%x for x in range(1,21)] self.phosphos = ["AB%s"%x for x in range(1,21)] self.ignore = ['AB3', 'AB13', 'AB18'] for this in self.ignore: self.inhibitors.remove(this) self.phosphos.remove(this) self.inhibitors.remove("AB9") self.rmse = [] self.participants = [] def _get_empty_rmse(self): data = {} for c in self.inhibitors: data[c] = {} for s in self.phosphos: data[c][s] = np.inf return data
[docs] def add_rmse(self, data, participant_id): if participant_id in self.participants: raise ValueError("participant already added") if data == None: data = self._get_empty_rmse() self.rmse.append(copy.deepcopy(data)) self.participants.append(participant_id) self._compute_ranks()
def _rank_index(self, vector): sorted_vector = sorted(vector) return [sorted_vector.index(x) for x in vector] def _compute_ranks(self): ranks = {} for c in self.inhibitors: ranks[c] = {} phosphos = self.rmse[0][c].keys() for l in phosphos: if l!=c: # get this RMSE for all participant data = [x[c][l] for x in self.rmse] # but use a reverse order hence 1-x for the sorting indices = np.argsort([x for x in data]) ranks[c][l] = [list(indices).index(i)+1 for i in range(0,len(indices))] self.ranks = copy.deepcopy(ranks)
[docs] def get_ranking(self): ranking = {} for k in self.participants: ranks = self.get_rank_participant(k) mu = np.mean([ranks[k1][k2] for k1 in ranks.keys() for k2 in ranks[k1].keys()]) ranking[k] = mu return ranking
[docs] def get_mean_ranks(self): return self.get_ranking()
[docs] def get_integer_ranks(self): ranking = self.get_ranking() integer_ranks = {} # get teams and ranks teams_ranks = list(ranking.items()) teams = [x[0] for x in teams_ranks] mean_ranks = [x[1] for x in teams_ranks] for i, index in enumerate(np.argsort(mean_ranks)): integer_ranks[teams[int(index)]] = i+1 return integer_ranks
[docs] def get_rank_participant(self, participant): ranks = {} if participant in self.participants: index = self.participants.index(participant) for c in self.inhibitors: ranks[c] = {} phosphos = self.rmse[0][c].keys() for l in phosphos: if c!=l: ranks[c][l] = self.ranks[c][l][index] return ranks else: raise ValueError("unknown participant")
[docs] def get_randomised_rmse(self, rmse, sigma=1): """THis is useful for testing. See class documentaton""" new_rmse = copy.deepcopy(rmse) for c in self.inhibitors: phosphos = self.rmse[0][c].keys() for l in phosphos: if c!=l: x = new_rmse[c][l] new_rmse[c][l] = np.uniform(-1 * x, x*sigma) + x # new values is between 0 and sigma*x #new_rmse[c][l] = x + sigma + x # new values is between 0 and sigma*x return new_rmse
[docs] def get_mean_zscores(self): zscores = {} for i,k in enumerate(self.participants): zscore = self._get_zscores(self.rmse[i]) data = [zscore[k1][k2] for k1 in zscore.keys() for k2 in zscore[k1].keys()] data = [x for x in data if np.isnan(x)==False] mu = np.mean(data) zscores[k] = mu return zscores
def _get_mean_and_sigma_null_parameters(self): """Retrieve mean and sigma for 32 combi from a null AUC distribution""" filename = 'sc2b_null_mu_sigma.json' filename = self.getpath_data(filename) # a dict contain dict (phospho) of dict (phospho) of dict (mu,sigma) with open(filename) as fh: mu_sigma = json.load(fh) mu = {} sigma = {} for c in self.inhibitors: mu[c] = {} sigma[c] = {} phosphos = self.rmse[0][c].keys() for p in phosphos: if p == c: mu[c][p] = np.nan sigma[c][p] = np.nan else: mu[c][p] = mu_sigma[c][p]['mu'] sigma[c][p] = mu_sigma[c][p]['sigma'] return mu, sigma def _get_zscores(self, rmses=None): mu, sigma = self._get_mean_and_sigma_null_parameters() zscores = {} for c in self.inhibitors: zscores[c] = {} phosphos = self.rmse[0][c].keys() for p in phosphos: if c!=p: if rmses!=None: zscores[c][p] = (rmses[c][p] - mu[c][p])/sigma[c][p] else: zscores[c][p] = (self.rmse[c][p] - mu[c][p])/sigma[c][p] else: zscores[c][p] = np.nan return zscores
def sc2a_null(N=100, tag=""): s = HPNScoringPrediction() res = s.get_null(N, tag=tag) return res def sc2b_null(N=100): s = HPNScoringPredictionInsilico() res = s.get_null(N) return res