Source code for dreamtools.dream8.D8C1.scoring

#-*- python -*-
# -*- coding: utf-8 -*-
#
#  This file is part of DREAMTools software
#
#  Copyright (c) 2013-2014 - EBI-EMBL
#
#  File author(s): Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: http://github.com/dreamtools
#
##############################################################################
"""This module provides utilities to compute scores related to HPN-Dream8

It can be used and should be used indepently of Synapse altough for testing,
data sets may be downloading from synapse if you don't have any local files to
play with.

Here is an example related to the Network subchallenge::

    >>> from dreamtools.dream8.D8C1 import scoring
    >>> s = scoring.HPNScoringNetwork()
    >>> s.compute_all_descendant_matrices()
    >>> s.compute_all_rocs()
    >>> s.compute_all_aucs()


https://www.synapse.org/#!Synapse:syn1720047/wiki/60530



"""
import csv
import os
import copy
import json
import zipfile

import pylab
import numpy as np

from dreamtools.core import rocs
from dreamtools.dream8.D8C1 import cython_scoring
from dreamtools.core.ziptools import ZIP
from dreamtools.core.rocs import ROC
from dreamtools.core.challenge import Challenge, LocalData
from dreamtools.dream8.D8C1 import commons


__all__ = ["HPNScoringNetwork", "HPNScoring", "HPNScoringNetworkInsilico",
    "HPNScoringNetwork_ranking", "HPNScoringPrediction",
    "HPNScoringPrediction_ranking", "ScoringError",
    "HPNScoringPredictionInsilico_ranking",
    "HPNScoringPredictionInsilico", "D8C1"]


[docs]class D8C1(Challenge):
    """Factory for the D8C1 (HPN-Breast challenge)"""

    def __init__(self, version=2, verbose=True, download=True, **kargs):
        super(D8C1, self).__init__('D8C1', verbose, download, **kargs)

        self.sub_challenges = ['SC1A', 'SC1B', 'SC2A', 'SC2B']
        self._init()
        self.version = version
        # download (_init) only on request so that --info and --onweb 
        # can run without connection to synapse

    def _init(self):
        if self._standalone is True:
            return
        self._download_data('experimental.zip', 'syn1920412')
        self._download_data('alphabeta-Network.zip', 'syn4939071')
        self._download_data('alphabeta-Network-Insilico.zip', 'syn4939076')
        self._download_data('alphabeta-Prediction.zip', 'syn4939077')
        self._download_data('alphabeta-Prediction-Insilico.zip', 'syn4939078')
        self._download_data('TruePrediction.zip', 'syn4939080')
        self._download_data('TruePredictionInsilico.zip', 'syn4939081')
        self._download_data('TruePredictionInsilico2.zip', 'syn4939079')
        self._download_data('TrueDescVectors.zip', 'syn4939082')

[docs]    def download_template(self, subname):
        if subname == 'SC1A':
            filename = self.get_pathname('alphabeta-Network.zip')
        elif subname == 'SC1B':
            filename = self.get_pathname('alphabeta-Network-Insilico.zip')
        elif subname == 'SC2A':
            filename = self.get_pathname('alphabeta-Prediction.zip')
        elif subname == 'SC2B':
            filename = self.get_pathname('alphabeta-Prediction-Insilico.zip')
        else:
            raise ValueError('Invalid name. Use one of %s' %
                    self.sub_challenges)
        return filename

[docs]    def download_goldstandard(self, subname):
        if subname == 'SC1A':
            filename = self.get_pathname('TrueDescVectors.zip')
        elif subname == 'SC1B':
            filename = self.getpath_gs("TrueGraph.csv")
        elif subname == 'SC2A':
            filename = self.get_pathname('TruePrediction.zip')
        elif subname == 'SC2B':
            filename = self.get_pathname('TruePredictionInsilico2.zip')
        else:
            raise ValueError('Invalid name. Use one of %s' %
                    self.sub_challenges)
        return filename

[docs]    def score(self, filename, subname=None):
        self._init()

        if subname == 'SC1A':
            s = HPNScoringNetwork(filename)
            s.compute_all_aucs()
            auc = s.get_auc_final_scoring()
            return {'meanAUROC': auc}
        elif subname == 'SC1B':
            s = HPNScoringNetworkInsilico(filename)
            s.compute_score()
            return {'meanAUROC': s.auc}
        elif subname == 'SC2A':
            s = HPNScoringPrediction(filename, version=self.version)
            s.compute_all_rmse()
            return {'meanRMSE': s.get_mean_rmse()}
        elif subname == 'SC2B':
            s = HPNScoringPredictionInsilico(filename, version=self.version)
            s.compute_all_rmse()
            return {'meanRMSE': s.get_mean_rmse()}
        else:
            raise ValueError('Invalid name. Use one of %s. You provided %s.'
                    % (self.sub_challenges, subname))


[docs]class ScoringError(Exception):
    """An exception class for scoring classes"""
    def __init__(self, value):
        self.value = value
    def __str__(self):
        return repr("ScoringError(HPN-DREAM8): %s " % self.value )


[docs]class HPNScoring( ZIP, LocalData):
    """Base class common to all scoring classes

    The HPN challenges use data from 32 types of combinaison of cell lines (4) 
    and ligands (8). This class provides aliases to:

     * valid cell lines (:attr:`~dreamtools.dream8.D8C1.scoring.HPNScoring.valid_cellLines`)
     * valid ligands (:attr:`~dreamtools.dream8.D8C1.scoring.HPNScoring.valid_ligands`)
     * expected length of the vectors for each cell line
       (:attr:`~dreamtools.dream8.D8C1.scoring.HPNScoring.valid_length`)
     * indices of the row vectors containing the mTOR species within the
       descendancy matrices
       (:attr:`~dreamtools.dream8.D8C1.scoring.HPNScoring.mTor_index`)

    .. note:: all matrices and vectors are sorted according to a hard-coded
        list of species for a combinaison of cell line and ligand. The species
        are indeed sorted alphabetically following hhe same order as in the
        original CSV files containing the data sets.

    In addition, it the :attr:`score` attributes can be used
    to store the score computed by :meth:`compute_score` .

    All classes that need to compute scores require a data file
    submitted by a participant. We enforce the usage of ZIP file, which can
    be loaded by using :meth:`loadZIPFile`.

    """

    def __init__(self, verbose=True):
        super(HPNScoring, self).__init__()
        LocalData.__init__(self)
        #: List of valid cell lines (e.g, BT20)
        self.valid_cellLines =  commons.cellLines
        #: List of valid ligands (e.g, EGF)
        self.valid_ligands = commons.stimuli
        #: length of the vectors to be found within each cell line
        # ignoring TAZ_p89 and FOXO3a_pS318
        # sometimes, like in the true descendants, the TAZ and FOX are inlcude
        # but set to None, which may be confusing
        self.valid_length = {'BT549': 44, 'MCF7': 39, 'UACC812': 44, 'BT20': 46}
        self.valid_length_extended = {'BT549': 45, 'MCF7': 41, 'UACC812': 45,
                                      'BT20': 48}

        #: indices of the mTOR species in the different cell lines within
        # the true descendants vectors assuming length are 44, 39, 44, 46 that
        # is excludnig  the taz and fox phosphos
        self.mTor_index = {'BT20': 23, 'UACC812': 21, 'MCF7':21, 'BT549':21}
        #indices are computed using:

        self.species_to_ignore = {
            'MCF7': ['TAZ_pS89', 'FOXO3a_pS318_S321'],
            'BT20': ['TAZ_pS89', 'FOXO3a_pS318_S321'],
            'BT549': ['TAZ_pS89'],
            'UACC812': ['TAZ_pS89']
            }

        # We need to retrieve list of phosphos as coded in the 
        # experimental data sets .
        self.cellLines_names_steven = ['UACC812', 'BT549', 'MCF7', 'BT20']
        self.ligands_names_steven = ['Serum', 'PBS', 'EGF', 'Insulin', 
                'FGF1', 'HGF', 'NRG1', 'IGF1']
        self._score = None
        self.verbose = verbose

    def _get_score(self):
        return self._score
    def _set_score(self, value):
        assert value>=0  and value<=1
        self._score = value
    score = property(_get_score, _set_score,
        doc="R/W attribute to store the score (in [0,1] only)")

[docs]    def error(self, message):
        """If you want to raise an error, use this method.

        It raises a ScoringException and set the :attr:`exception`
        attribute. The message is stored in exception.value
        If called, the :attr:`
        ` is set to "INVALID" and
        the score is set to 1 (worst score).

        """
        print("ERROR:"+ message)
        ScoringError(message)

[docs]    def load_species(self):
        """Loads names of the expected phospho names for each cell 
        line from the synapse files provided to the users"""
        # need to refactor that...
        c = Challenge('D8C1')
        filename = c.get_pathname('experimental.zip')

        z = zipfile.ZipFile(filename)
        self.species = {}
        for cell in self.valid_cellLines:
            #print("experimental/MIDAS/MD_%s_main.csv" % cell)
            ## in py3, z.read returns a bytes hence the str cast
            thisdata = z.read("experimental/MIDAS/MD_%s_main.csv" % cell).decode()
            header = thisdata.splitlines()[0]

            phosphos = [x.split(":")[1] for x in header.split(",") if x.startswith("DV")]

            phosphos = [x for x in phosphos if x not in self.species_to_ignore[cell]]
            assert len(phosphos) == self.valid_length[cell], "wrong length (%s) for cell %s " % (len(phosphos), cell)
            self.species[cell] = phosphos


class HPNScoringNetworkBase(HPNScoring):
    test_synapse_id = "syn1971273"
    true_synapse_id = "syn1971278"

    def __init__(self, filename=None, verbose=True):
        HPNScoring.__init__(self, verbose=verbose)
        self.filename = filename
        self.load_species()

    def _validate_eda(self, filename):

        if self.verbose: print("Validating %s" % filename),
        f = self.zip_data.open(filename)
        reader = csv.reader(f)
        reader.next() # skip the header
        f.close()
        if self.verbose: print("ok")

    def _str_cleanup(self, text):
        # required to cleanup some submissions
        text = text.replace("\r", "\n")
        text = text.replace("\n\n", "\n")
        text.strip()
        return text

    def get_eda_file(self, filename):
        if self.verbose:print("Loading EDA scores from %s" % filename)
        # read the data and stores it into vectors
        #index = self.zip_filenames.index(filename)
        #tail = os.path.split(self.zip_filenames[index])[-1]
        zipdata = self.zip_data.read(filename).decode()

        # prevent issue with NULL byte from Boston team
        data_iter = [x.replace("\x00", "") for x in zipdata.splitlines()]
        data_iter = [x.replace("\t", " ") for x in data_iter]
        data_iter = [self._str_cleanup(x) for x in data_iter if len(x)]

        if len(data_iter) == 1 and "\n" in data_iter[0]:
            data_iter = data_iter[0].split("\n")
        # skip header:
        if "EdgeScore" in data_iter[0]:
            del data_iter[0]

        data = []
        for i, datum in enumerate(data_iter):

            datum = datum.replace("=", " = ").replace("("," (").replace(")", ") ")

            datum = datum.split()
            if len(datum) == 5:
                data.append(datum)
            else:
                ValueError("EDA file (%s) should be made of 5 columns" % filename)
        node1 = [x[0] for x in data]
        node2 = [x[2] for x in data]
        edges = [x[1] for x in data]
        scores = [x[4] for x in data]
        return node1, node2, edges, scores


[docs]class HPNScoringNetwork(HPNScoringNetworkBase):
    """Class to compute the score of a Network submission

    A user will provide a ZIP file that contains 65 files: 32 EDA,
    The 32 files should be tagged with the 32 combos of cell
    lines and ligands. To create an instance of HPNScoringNetwork, type::

        s = HPNScoringNetwork("TeamName-Network.zip")
        # or later
        s = HPNScoringNetwork()
        s.load_submission("TeamName-Network.zip")

        s.get_auc_final_scoring() # as in the challenge ignoring some regimes


    You then need to specifically load the EDA files. This may be done
    with :meth:`load_all_eda_files_from_zip`::

        s.load_all_eda_files_from_zip()

    The content of the ZIP file can be validated using the :meth:`validation`
    method.::

        s.validation()

    Each EDA and SIF file must be a complete graph where all species correspond to
    the CSV files provided on the synapse web page. The size of the network
    varies depending on the cell line.

    Each EDA file that contains score on each edge and first needs to be transformed into
    a descendancy matrix. This is achieve via :meth:`compute_descendant_matrix`
    and/or :meth:`compute_descendant_matrix` methods.::

        s.compute_all_descendant_matrices()

    From each matrix, we'd like to compare a specific row (corresponding to
    mTOR) to the true scores that are expected. The true descendant for each
    combinaison of cell line and ligand are provided and loaded in the
    constructor via :meth:`load_true_descendants_from_zip`, which can be called at
    any time.

    """
    test_synapse_id = "syn1971273"
    true_synapse_id = "syn1971278"

    def __init__(self, filename=None, verbose=False, true_descendants=None):
        """

        :param str filename:

        """
        super(HPNScoringNetwork, self).__init__(filename, verbose=verbose)
        self._init()
        if filename:
            self.load_submission(self.filename)
        self.robustness_testing = False
        self.masking = 0.1
        if true_descendants is None:
            self._load_true_descendants_from_zip()
        else:
            self.true_descendants = copy.deepcopy(true_descendants)

[docs]    def load_submission(self, filename):
        self.loadZIPFile(filename)
        self.load_all_eda_files_from_zip()

    def _init(self):
        """Creates attributes to store edge scores from 32 EDA files"""
        self.edge_scores = dict([(x,{}) for x in self.valid_cellLines])
        #self.species = dict([(x,{}) for x in self.valid_cellLines])
        self.descendancy_matrices = dict([(x,{}) for x in self.valid_cellLines])
        self.roc = dict([(x,{}) for x in self.valid_cellLines])
        self.auc = dict([(x,{}) for x in self.valid_cellLines])
        self.aupr = dict([(x,{}) for x in self.valid_cellLines])

[docs]    def edge_score_to_eda_files(self, teamname):
        directory = teamname
        try:os.mkdir(directory)
        except:pass
        for k1 in self.edge_scores.keys():
            species = self.species[k1]
            N = len(species)
            for k2 in self.edge_scores[k1].keys():
                filename = teamname + "-" +  k1 + "-" + k2 + "-Network.eda"
                fh = open(directory + os.sep + filename, "w")
                assert self.edge_scores[k1][k2].shape == (N, N)
                for i in range(0, N):
                    for j in range(0, N):
                        name1 = species[i]
                        name2 = species[j]
                        value = self.edge_scores[k1][k2][i][j]
                        fh.write("{} (1) {} = {}\n".format(name1, name2,value))
                fh.close()

[docs]    def validation(self):
        """General validation

            * Check that there are 32 EDA files
            * For each EDA file, calls further check
                * format of the filename (correct cell line and ligand names)
                * format of the dataa = character with a RHS and LHS
                * LHS is made of 3 elements
                * skip the header

        """
        # check that the number of files are correct.

        if self.verbose:print("Checking number of files and extensions..."),
        extensions = [os.path.splitext(x)[1] for x in self.zip_filenames if "MACOSX" not in x]

        # check that the contents of the SIF are 3 columns
        if self.verbose:
            print("ok\nChecking filename and contents of SIF files"),

        # check that the contents of the EDA files are correct.
        if self.verbose:
            print("All SIF files seem ok\nChecking filename and contents of EDA files"),
        for filename in self.zip_filenames:
            if filename.endswith(".eda"):
                self._check_filename(filename)
                try:
                    self._validate_eda(filename)
                except:
                    pass
        if self.verbose:print("All EDA files seem ok")

    def _check_filename(self, filename):
        filename = os.path.split(filename)[1]
        if len(filename)==0:
            return
        try:
            team, cellLine, ligand, name = filename.split("-")
            if cellLine not in self.valid_cellLines:
                self.error(" * filename %s ill-formed. unrecognised Cell Line. must be in %s " % filename, self.valid_cellLines )
            if ligand not in self.valid_ligands:
                self.error(" * filename %s ill-formed. unrecognised ligand. must be in %s " % filename, self.valid_ligands)
        except ValueError:
            self.error(" * filename %s is ill formed. Expects TeamName-CellLine-Ligand-Network.eda" % filename)

[docs]    def compute_score(self, validation=True):
        """Computes the final score that is the average over the 32 AUCs

        This function compute the final score. First, il loads all EDA files
        provided by the  participany (from the ZIP file). Then, it computes
        the 32 descendant matrices. Finally, it computes the 32 ROCS and AUCS.
        The scores is for now based on the z-score. Since scores must be between
        0 and 1, where 0 is the best, we will need to normalise.

        :param bool validation: perform validation of the input ZIP file

        """
        # First let us load all EDA edge scores into 32 matrices in edge_scores
        # together with the correponsind species

        self.compute_all_aucs()
        if len(self.auc['BT20']):
            self.score = self.get_average_auc()

[docs]    def get_auc_final_scoring(self):
        """This function returns the mean AUC using only official ligands as
        used in final scoring and collaborative rounds.

        The individual AUCs must be computed first with
        :meth:`compute_all_aucs` or .::


            >>> s = scoring.HPNScoringNetwork(filename)
            >>> auc = s.get_auc_final_scoring()

        """
        self.compute_all_aucs()
        ligands = {'BT20': ['IGF1', 'PBS', 'Serum', 'NRG1', 'HGF', 'EGF',  'FGF1'],
            'BT549': ['IGF1', 'PBS', 'Serum', 'Insulin', 'HGF', 'EGF', 'FGF1'],
            'MCF7': ['IGF1', 'PBS', 'Serum', 'NRG1', 'Insulin', 'HGF', 'EGF', 'FGF1'],
            'UACC812': ['PBS', 'Serum', 'NRG1', 'Insulin', 'HGF', 'EGF', 'FGF1', 'IGF1']}
        auc = np.mean([self.auc[k1][k2] for k1 in self.auc.keys()
            for k2 in self.auc[k1].keys() if  k2 in ligands[k1]])
        return auc

    def _load_true_descendants_from_zip(self):
        """Reads true descendants provided by Steven 27 June 2013

        The ZIP file should contains 32 vectors that contains the score for the
        edge positive (1) or negative (0)

        This method stores the data in the :attr:`true_descendants`.

        """
        self.true_descendants = dict([(x,{}) for x in self.valid_cellLines])
        c = Challenge('D8C1')
        filename = c.get_pathname("TrueDescVectors.zip")

        zipdata = zipfile.ZipFile(filename)
        if self.verbose:
            print("Loading all true descendants data set (%s)" % filename)

        for filename in zipdata.namelist():
            try:
                teamName, cellLine, ligand = os.path.splitext(filename)[0].split("_")
                data = zipdata.read(filename).decode()
                data = data.splitlines()[0] # look at the  header
                data = [int(x) if x!="NaN" else None for x in data.strip().split(',')]
                self.true_descendants[cellLine][ligand] = data[:]
            except Exception as err:
                print("note: skipping %s in your zipped file" % filename)
                pass

[docs]    def load_all_eda_files_from_zip(self):
        """Loads all EDA file from a participant into :attr:`edge_scores`"""
        for filename in self.zip_filenames:
            if filename.endswith(".eda") and "__MACOS" not in filename:
                self.load_eda_file(filename)

        # Issue in github https://github.com/dreamtools/dreamtools/issues/16
        M = max([self.edge_scores[cell][l].max() for cell in
            self.cellLines_names_steven for l in self.ligands_names_steven])
        for cell in self.cellLines_names_steven:
            for lig in self.ligands_names_steven:
                self.edge_scores[cell][lig] /= float(M)

[docs]    def load_eda_file(self, filename, local=False):
        """Loads scores from one EDA file

        :param filename: here filename should be one of the filename to be
            found within the ZIP file! This is not a standard file system (See
            note).

        Input data is EDA format that is::

            A 1 B = 0.4
            A 1 C = 0.5

        It containts edges such that the final graph is complete and a matrix
        can be built with column1 as the rows and column2 as the columns.
        The values being tmade from the fifth column. Second and fourth are
        ignored.

        loaded data is stored in :attr:`data` as a numpy matrix.

        .. note:: to overwrite the input ZIP file, use :meth:`loadZIPFile`

        """
        if self.verbose:print("Loading EDA scores from %s" % filename),
        if local == True:
            tail = os.path.split(filename)[-1]
            data = open(filename)
            data_iter = csv.reader(data, delimiter=" ")
        else:
            # read the data and stores it into vectors
            index = self.zip_filenames.index(filename)
            tail = os.path.split(self.zip_filenames[index])[-1]
            zipdata = self.zip_data.read(filename).decode()
            data_iter = zipdata.splitlines()

        if len(data_iter) and "EdgeScore" in data_iter[0]:
            del data_iter[0]

        try:
            teamName, cellLine, ligand, tag = tail.split("-")
        except:
            self.error(" * filename %s is ill formed. Expects TeamName-CellLine-Ligand-Network.eda" % filename)
            return

        # skip first row
        #header = data_iter.next()
        data = []
        for i, datum in enumerate(data_iter):
            datum = datum.replace("=", " = ").replace("("," (").replace(")", ") ")

            datum = datum.split()
            if len(datum) == 5:
                data.append(datum)
            elif len(datum) == 0 or len(datum) == 1:
                pass
                #print("Found empty line in EDA file %s" %  filename)
            else:
                self.error("- Line %s in %s is ill formed (expected 3 or 5 columns): %s" % (i,filename, datum))
                break
        node1 = [x[0] for x in data]
        node2 = [x[2] for x in data]
        #edges = [x[1] for x in data]
        scores = [x[4] for x in data]

        N = self.valid_length[cellLine]
        species = self.species[cellLine]

        tempdata  = np.zeros((N, N))
        for x,y,z in zip(node1, node2, scores):
            if x in self.species_to_ignore[cellLine] or y in self.species_to_ignore[cellLine]:
                pass
            else:
                #update from july 16 . error in MIDAS and CSV PKC-delta should
                # have been PKC-delta_pS664 so some people use one or the other
                # similarly for PRKCD
                if x == "PKC-delta_pS664":
                    x = "PKC-delta"

                mapping = {'c-RAF_pS338': "c-Raf_pS338",
                            "YB-1_pS102": "YB-1_PS102",
                            "MEK1_pS217_pS221": "MEK1_pS217_S221",
                            "EGFR_pY922": "EGFR_pY992"
                        }
                for k,v in mapping.items():
                    if x == k:
                        print("Warning mapping required in the EDA")
                        x = v
                    if y == k:
                        y = v

                if "SRC_" in x:
                    x = x.replace("SRC_", "Src_")

                if "SRC_" in y:
                    y = y.replace("SRC_", "Src_")

                if y == "PKC-delta_pS664":
                    y = "PKC-delta"

                if "RB_" in x:
                    x = x.replace("RB_", "Rb_")

                if "RB_" in y:
                    y = y.replace("RB_", "Rb_")

                try:
                    i = species.index(x)
                except:
                    self.error("Invalid species(%s) found in the file %s " % (x, filename))
                    break
                try:
                    j = species.index(y)
                except:
                    self.error("Invalid species(%s) found in the file %s " % (y, filename))
                    break

                tempdata[i][j] = z
        self.edge_scores[cellLine][ligand] = abs(tempdata)
        M = self.edge_scores[cellLine][ligand].max()
        #if M>1:
        #    print("!!!!!!!!!!!!!!!!!!!!1, %s %s" % (cellLine, ligand))
        #    self.edge_scores[cellLine][ligand] /= float(M)

        # max is now computed across all networks
        # https://github.com/dreamtools/dreamtools/issues/16
        # self.edge_scores[cellLine][ligand] /= float(M)

[docs]    def compute_all_descendant_matrices(self):
        """Compute all descendancy matrices

        For each cell line and ligand, the matrix is stored
        in the :attr:`edge_scores` dictionary.

        .. seealso:: :meth:`compute_descendant_matrix`

        """
        for c in self.valid_cellLines:
            for l in self.valid_ligands:
                if self.verbose:print("Computing descendancy matrix for (%s, %s)" % (c,l))
                self.compute_descendant_matrix(c,l)

[docs]    def compute_descendant_matrix(self, cellLine, ligand):
        """Computes the descendancy matrix for a given cell line and ligand

        :param str cellLine: a valid cell line
        :param str ligand: a valid ligand


        .. note:: we use a cython module to conmpute the matrix. This function
            is the bottle neck of the entire procedure to compute the score.
            This is especially important to estimate te null distribution of
            the AUCs. Using Cython does not improve much the performance (80%)
            but it improves it...

        .. seealso:: :meth:`compute_all_descendant_matrices`
        """
        cython_scoring.compute_descendant_matrix(self, cellLine, ligand)

[docs]    def compute_metrics(self, cellLine, ligand):
        scores = self._get_scores(cellLine, ligand)
        classes = self._get_classes(cellLine, ligand)
        classes = [x for x in classes if x!=None]

        sorted_indices = np.argsort(1-scores)
        sorted_scores = [scores[i] for i in sorted_indices]
        sorted_classes = [classes[i] for i in sorted_indices]

        P = float(classes.count(1))
        N = float(classes.count(0))

        roc = {"fpr":[], "tpr":[], "precision":[], "FP":[], "TP":[],
            "recall":[], "Fmeasure":[], 'threshold':[]}

        for i in range(0, len(scores)):
            threshold = sorted_scores[i]
            TP = float(sorted_classes[0:i].count(1))
            FP = float(sorted_classes[0:i].count(0))
            TPR = TP/P
            FPR = FP/N
            precision = TP / (TP+FP)
            recall = TP / P
            if recall >0 and precision >0 :
                Fmeasure = 2./(1./precision+1./recall)
            else:
                Fmeasure = np.nan

            roc["threshold"].append(threshold)
            roc["FP"].append(FP)
            roc["TP"].append(TP)
            roc['tpr'].append(TPR)
            roc['fpr'].append(FPR)
            roc['precision'].append(precision)
            roc['recall'].append(recall)
            roc['Fmeasure'].append(Fmeasure)

        return roc

    def _compute_fpr_tpr(self, FP, TP, N, P):
        roc = {"fpr":[], "tpr":[], "FP":[], "TP":[]}
        roc['fpr'] = FP/float(N)
        roc['tpr'] = TP/float(P)
        roc['TP'] = TP
        roc['FP'] = FP
        return roc

[docs]    def compute_roc(self, cellLine, ligand):
        """Compute the ROC curve

        :param scores: list of scores (probabilities)
        :param classes: list of classes (true values)


        .. seealso:: :meth:`compute_roc`
        """
        scores = self.descendancy_matrices[cellLine][ligand][self.mTor_index[cellLine]]
        # remove the None otherwise classes and scores are different
        classes = [x for x in self.true_descendants[cellLine][ligand] if x!=None]

        if self.robustness_testing:
            N = len(scores)
            from random import sample
            indices = sample(range(0,N), int((1-self.masking)*N))
            scores = np.array([s for i,s in enumerate(scores) if i in
                indices])
            classes = [s for i,s in enumerate(classes) if i in indices]

        assert len(scores) ==  len(classes)
        sorted_indices = np.argsort(1-scores)
        sorted_scores = [scores[i] for i in sorted_indices]
        sorted_classes = [classes[i] for i in sorted_indices]
        P = float(classes.count(1))
        N = float(classes.count(0))
        FP = 0
        TP = 0
        #
        roc = {"fpr":[], "tpr":[], "precision":[], "FP":[], "TP":[],
            "recall":[], "Fmeasure":[], 'threshold':[]}
        fprev = 1.1 # scores are less than 1

        # an efficient algorithm to compute ROC based on T.Fawcett Pattern
        # Recognition Letters 27, 2006
        i = 0
        while i<len(scores):
            cls = sorted_classes[i]
            score = sorted_scores[i]

            if score != fprev and cls!=None:
                thisroc = self._compute_fpr_tpr(FP, TP, N, P)
                for key in ['fpr', 'tpr', "FP", "TP"]:
                    roc[key].append(thisroc[key])
                roc['threshold'].append(fprev)
                fprev = score

            if cls == 1:
                TP += 1
            elif cls == 0:
                FP += 1
            elif cls == None:
                pass
            else:
                raise ValueError("found a class value different from 0,1 or None")
            i += 1
        thisroc = self._compute_fpr_tpr(FP, TP, N, P)
        for key in ['fpr', 'tpr', "FP", "TP"]:
            roc[key].append(thisroc[key])
        roc['threshold'].append(sorted_scores[-1])

        roc = self.compute_other_metrics(roc)
        return roc

[docs]    def compute_other_metrics(self, roc):
        #Be aware that there is alway a first value TP=0,FP=0
        # this should be handled with care the recall/precision computatio
        # This first value is used to compute fpr and tpr but should not be used
        # here
        FP = np.array([float(x) for x in roc['FP']])
        TP = np.array([float(x) for x in roc['TP']])
        N = max(FP) + max(TP)

        # The first value of precision is undefined since FP+TP=0
        # It does not matter since it will be overwritten as explained here
        # below
        precision = TP / (FP+TP)
        recall = TP/float(max(TP))
        # The second value may also be be an issue. There are 2 cases.
        # 1. TP>0
        # 2. TP == 0
        # In the second case,  TP=0 and FP==0 should not happen. FP > 0 instead.
        # In which case, recall = 0 and precision=0. No ambiguity
        # In the first case, if TP>0 then precision >0 and recall > 0.
        # Therefore we end up without a defined value at recall==0
        # So we should set a recall=0 for which precision will be the same as
        # the one where recall >0
        # somehow, precision and recall
        if TP[1]>0:
            precision[0] = precision[1]
        else:
            precision[0] = 0

        roc['precision'] = [float(x) for x in precision]
        roc['recall'] = [float(x) for x in recall]
        return roc

[docs]    def compute_auc(self, roc):
        """Compute AUC given a ROC data set

        :param str roc: The roc data structure must be a dictionary with "tpr"
            key. Could be an variable returned by :meth:`compute_roc`.

        """
        import scipy.integrate
        value = scipy.integrate.trapz(roc['tpr'], roc['fpr'])
        return value

    def _get_scores(self, cellLine, ligand):
        """Returns scores of a given descendancy matrix taking care of mTor index"""
        scores = self.descendancy_matrices[cellLine][ligand][self.mTor_index[cellLine]]
        return scores

    def _get_classes(self, cellLine, ligand):
        classes = self.true_descendants[cellLine][ligand]
        return classes

[docs]    def compute_all_rocs(self):
        """Computes all ROC curves

        This function can be called once EDA files are loaded and all
        descendant matrices have been computed as well.

        .. seealso:: :meth:`load_all_eda_files_from_zip`,
            :meth:`compute_all_descendant_matrices`
        """
        for c in self.valid_cellLines:
            for l in self.valid_ligands:
                roc = self.compute_roc(c, l )
                self.roc[c][l] = roc.copy()

[docs]    def compute_all_aucs(self):
        """Computes all AUC

        This function can be called once EDA files are loaded and all
        descendant matrices have been computed as well.

        In theory, one should compute ROC and then AUC but this function
        recomputes ROC since it is fast to compute.

        .. seealso:: :meth:`load_all_eda_files_from_zip`,
            :meth:`compute_all_descendant_matrices`
        """
        self.validation()

        self.compute_all_descendant_matrices()

        for c in self.valid_cellLines:
            for l in self.valid_ligands:
                roc = self.compute_roc(c, l)
                self.roc[c][l] = roc.copy()
                auc = self.compute_auc(roc)
                self.auc[c][l] = auc

[docs]    def compute_all_auprs(self):
        for c in self.valid_cellLines:
            for l in self.valid_ligands:
                roc = self.compute_roc(c, l )
                self.roc[c][l] = roc.copy()
                aupr = self.compute_aupr(roc)
                self.aupr[c][l] = aupr

[docs]    def compute_all_metrics(self):
        for c in self.valid_cellLines:
            for l in self.valid_ligands:
                roc = self.compute_metrics(c, l)
                self.roc[c][l] = roc.copy()
                auc = self.compute_aupr(roc)
                self.aupr[c][l] = auc

[docs]    def compute_aupr(self, roc):
        import scipy.integrate
        value = scipy.integrate.trapz(roc['precision'], x=roc['recall'])
        return value

[docs]    def get_aucs(self):
        """Returns all AUCs"""
        values = [self.auc[k1][k2] for k1 in self.auc.keys() for k2 in self.auc[k1].keys()]
        return values

[docs]    def get_average_auc(self):
        """Returns mean of all AUCs"""
        values = self.get_aucs()
        return np.mean(values)

[docs]    def plot_roc(self, cellLine, ligand, hold=False):
        """Plots a psecific  ROC curve"""
        from pylab import plot, clf, grid, title

        x = self.roc[cellLine][ligand]['fpr']
        y = self.roc[cellLine][ligand]['tpr']
        auc = self.auc[cellLine][ligand]

        if hold == False:
            clf()
        else:
            from pylab import hold
            hold(True)

        plot(x, y, '-o')
        plot(x,x,'r')
        title("ROC for %s/%s (AUC=%s)" % (cellLine, ligand, auc))
        grid(True)

[docs]    def get_null_distribution(self, sample=100, cellLine="BT20", ligand="EGF",
             store_rocs=False, distr="uniform"):
        """Computes the null distribution for a given combinaison

          * Creates a uniformly distribution of a EDA file and stores
            it in the edge_score attribute.
          * recompute the corresponding descendancy matrix
          * Get the corresponding true prediction
          * compute the ROC and AUC

        :param int sample: number of distribution to compute
        :param cellLine:
        :param ligand:
        :param bool store_rocs: if set to True, save the rocs as well
        :return: rocs and aucs (rocs is set to [] for debugging)

        .. plot::
            :include-source:

            from dreamtools.dream8.D8C1 import scoring
            from pylab import clf, plot, hist, grid, pi, exp, sqrt, mean, std
            s = scoring.HPNScoringNetwork()
            rocs, aucs, auprs = s.get_null_distribution(100)
            mu = mean(aucs)
            sigma = std(aucs)
            clf()
            res = hist(aucs,50, normed=True)
            plot(res[1], 1/(sigma * sqrt(2 * pi)) * exp( - (res[1] - mu)**2 / (2 * sigma**2) ), linewidth=2, color='r')
            grid()

        """
        aucs = []
        #rocs = []
        auprs = []

        N = self.valid_length[cellLine]
        c = cellLine
        l = ligand
        for i in range(0, sample):
            if distr == "inverse":
                x = [float(x)/(N*N) for x in range(1, N*N + 1)]
                np.random.shuffle(x) # in line shuffling
                #reshape and create numpy array
                self.edge_scores[c][l] = np.array(x).reshape(N,N)
            elif distr == "uniform":
                self.edge_scores[c][l] = np.random.uniform(size=N*N).reshape(N,N)
            self.compute_descendant_matrix(c,l)
            roc = self.compute_roc(c, l)
            auc = self.compute_auc(roc)
            aupr = self.compute_aupr(roc)
            if store_rocs:
                rocs.append(roc)
            aucs.append(auc)
            auprs.append(aupr)
        return rocs, aucs, auprs

[docs]    def plot_all_rocs(self, cellLines=None):
        """Plots all 32 ROC once scores/rocs have been computed


        .. plot::
            :include-source:
            :width: 80%

            from pylab import clf, plot, hist, grid
            from dreamtools.dream8.D8C1 import scoring
            import os
            s = scoring.HPNScoringNetwork()
            from dreamtools import D8C1
            filename = D8C1().download_template('SC1A')
            s.load_submission(filename)
            s.compute_score()
            s.plot_all_rocs()

        """
        if cellLines is None:
            cellLines = self.valid_cellLines
        else:
            cellLines = [cellLines]

        pylab.clf()
        for c in cellLines:
            for l in self.valid_ligands:
                x = self.roc[c][l]['fpr']
                y = self.roc[c][l]['tpr']
                #auc = self.auc[c][l]
                pylab.plot(x, y, '-o', label="%s-%s" % (c,l))
                pylab.hold(True)

        pylab.title("32 ROCs")
        pylab.grid(True)
        pylab.plot(x, x, '--k', linewidth=2)

[docs]    def get_mean_zscores(self, aucs=None):
        zscores = self.get_zscores(aucs)
        mu = np.mean([zscores[k1][k2] for k1 in zscores.keys()
            for k2 in zscores[k1].keys()])
        return mu

[docs]    def get_zscores(self, aucs=None):
        """

        """
        mu, sigma = self.get_mean_and_sigma_null_parameters()
        zscores = {}
        for c in self.valid_cellLines:
            zscores[c] = {}
            for l in self.valid_ligands:
                if aucs:
                    zscores[c][l] = (aucs[c][l] - mu[c][l])/sigma[c][l]
                else:
                    zscores[c][l] = (self.auc[c][l] - mu[c][l])/sigma[c][l]
        return zscores

[docs]    def print_aucs(self):
        # the order here is defined to compare with steven results but could be
        # chnaged later on.
        self.compute_all_aucs()
        for c in self.cellLines_names_steven:
            #["UACC812","BT549","MCF7","BT20"]:
            print(c, "\t"),
            for l in self.ligands_names_steven:
                # ["Serum","PBS", "EGF", "Insulin", "FGF1", "HGF", "NRG1", "IGF1"]:
                roc = self.compute_roc(c, l)
                auc = self.compute_auc(roc)
                print('AUC',auc,"\t "),
            print()

[docs]    def get_mean_and_sigma_null_parameters(self):
        """Retrieve mean and sigma for 32 combi from a null AUC distribution"""
        import sc1a_tools
        null = sc1a_tools.AUCnull(self.valid_cellLines, self.valid_ligands, verbose=False)
        filename = 'sc1a_null_aucs_mean_sigma.json'
        null.loadaucs(filename)
        mean = null.get_mean_dict()
        sigma = null.get_sigma_dict()
        return mean, sigma


[docs]class HPNScoringNetwork_ranking(HPNScoring):
    """This class is used to compute the ranks of the different participants
    based on an average rank over the 32 combinaisons of cell line and ligands.

    ::

        s = HPNScoringNetwork(filename="file1zip")
        s.compute_all_aucs()

        sall = HPNScoringNetwork_all()
        # s.aucs is a list where each element is a dictionary of
        sall.add_auc(s1.auc, "team1")

        # let us build
        aucs2 = copy.deepcopy(s.auc)
        for c in s.valid_cellLines:
            for l in s.valid_ligands:
                auc2[c][l] = numpy.random.uniform(0.5,0.7)

        sall.add_auc(s2.auc, "team2")
        sall.get_ranking()
        {'team1': 1.96875, 'team2': 1.03125}

    This class is independent of HPNSCoringNetwork.
    However, it takes as input the returned values of
    HPNScoringNetwork.compute_all_auc()

    """
    def __init__(self):
        super(HPNScoringNetwork_ranking, self).__init__()
        self.aucs = []
        self.participants = []

        # we remove here the non-robust regime.
        # communication with steven the 4th Nov to inclde
        #  UACC812,IGF1 The 3 to be removed are:
        #  BT549,FGF1   BT549,NRG1   BT20,insulin

        # update from Nov 20th, BT549 FGF1 is now back into business

        self.valid_ligands_final = {
            'UACC812': ['PBS', 'Serum', 'NRG1', 'Insulin', 'HGF', 'EGF', 'FGF1',  'IGF1'],
            'BT20': ['IGF1', 'PBS', 'Serum', 'NRG1', 'HGF', 'EGF', 'FGF1'],
            'BT549': ['IGF1', 'PBS', 'Serum',  'Insulin', 'HGF', 'EGF', 'FGF1'],
            'MCF7': ['IGF1', 'PBS', 'Serum', 'NRG1', 'Insulin', 'HGF', 'EGF', 'FGF1']
            }

[docs]    def get_empty_auc(self):
        auc = {}
        for c in self.valid_cellLines:
            auc[c] = {}
            for l in self.valid_ligands_final[c]:
                auc[c][l] = 0
        return auc

[docs]    def add_auc(self, auc, participant_id):
        if participant_id in self.participants:
            raise ValueError("participant already added (%s)" % participant_id)
        if auc == None:
            auc = self.get_empty_auc()
        else:
            # copy only the combi for the final leaderboard.
            temp_auc = {}
            for c in self.valid_cellLines:
                temp_auc[c] = {}
                for l in self.valid_ligands_final[c]:
                    temp_auc[c][l] = auc[c][l]
        self.aucs.append(temp_auc)

        self.participants.append(participant_id)
        self._compute_ranks()

    def _rank_index(self, vector):
        sorted_vector = sorted(vector)
        return [sorted_vector.index(x) for x in vector]

    def _compute_ranks(self, invalid=False):
        """If there are some invalid submissions, this may not work anymore """
        N = len(self.aucs)
        ranks = {}
        for c in self.valid_cellLines:
            ranks[c] = {}
            for l in self.valid_ligands_final[c]:
                # for final scores, some combi are removed

                # get this AUC for all participant
                aucs = [x[c][l] for x in self.aucs]
                M = len([x[c][l] for x in self.aucs if x[c][l]==0])
                if M>0:
                    print("what is M: ",M)
                # but use a reverse order hence 1-x for the sorting
                if invalid:
                    indices = np.argsort([1-x for x in aucs])
                else:
                    indices = [x+1 for x in self._rank_index([1-x for x in aucs])]

                # need to append all participants that are invalid with same
                # rank that is maxRank
                if invalid:
                    ranks[c][l] = [list(indices).index(i)+1 for i in range(0,len(indices))]
                    # reset all final values to max rank
                    maxRank = N-M+1
                    ranks[c][l] = [x if x<maxRank else maxRank for x in ranks[c][l]]
                else:
                    ranks[c][l] = indices[:]

        self.ranks = copy.deepcopy(ranks)

[docs]    def get_ranking(self):
        ranking = {}
        for k in self.participants:
            ranks = self.get_rank_participant(k)
            mu = np.mean([ranks[k1][k2] for k1 in ranks.keys() for k2 in ranks[k1].keys()])
            ranking[k] = mu
        return ranking

[docs]    def get_mean_ranks(self):
        return self.get_ranking()

[docs]    def get_integer_ranks(self):
        ranking = self.get_ranking()
        integer_ranks = {}

        # get teams and ranks
        teams_ranks = list(ranking.items())
        teams = [x[0] for x in teams_ranks]
        mean_ranks = [x[1] for x in teams_ranks]

        for i, index in enumerate(np.argsort(mean_ranks)):
            integer_ranks[teams[int(index)]] = i+1
        return integer_ranks

[docs]    def get_rank_participant(self, participant):
        ranks = {}
        if participant in self.participants:
            index = self.participants.index(participant)
            for c in self.valid_cellLines:
                ranks[c] = {}
                for l in self.valid_ligands_final[c]:
                    ranks[c][l] = self.ranks[c][l][index]
            return ranks
        else:
            raise ValueError("unknown participant")
    # duplicated from HPNScoringNetwork to update leadeboard easily
[docs]    def get_mean_zscores(self):
        zscores = {}
        for i,k in enumerate(self.participants):
            zscore = self._get_zscores(self.aucs[i])
            mu = np.mean([zscore[k1][k2] for k1 in zscore.keys()
                for k2 in zscore[k1].keys()])
            zscores[k] = mu
        return zscores

    # duplicated from HPNScoringNetwork to update leadeboard easily
    def _get_zscores(self, aucs):
        """ """
        mean, sigma = self._get_mean_and_sigma_null_parameters()
        zscores = {}
        for c in self.valid_cellLines:
            zscores[c] = {}
            for l in self.valid_ligands_final[c]:
                zscores[c][l] = (aucs[c][l] - mean[c][l])/sigma[c][l]
        return zscores

    # duplicated from HPNScoringNetwork to update leadeboard easily
    def _get_mean_and_sigma_null_parameters(self):
        """Retrieve mean and sigma for 32 combi from a null AUC distribution"""
        from . import sc1a_tools
        null = sc1a_tools.AUCnull(self.valid_cellLines, self.valid_ligands, verbose=False)
        filename = 'sc1a_null_aucs_mean_sigma.json'
        filename = self.getpath_data(filename)
        null.loadaucs(filename)
        mean = null.get_mean_dict()
        sigma = null.get_sigma_dict()
        return mean, sigma


[docs]class HPNScoringNetworkInsilico(HPNScoringNetworkBase):
    """Scoring class for HPN DREAM8 Network Insilico sub challenge


    This class retrieves the true graph and a test example from synapse.

    ::

        from dreamtools.dream8.D8C1 import HPNScoringNetworkInsilico
        s = HPNScoringNetworkInsilico()
        import os
        filename = s.download_template("SC1B")
        s.read_file(filename)



    .. note:: If you want to test your own local file, provide a filename.

    """
    test_synapse_id = "syn1973430"
    true_synapse_id = "syn1976597"
    def __init__(self, filename=None, verbose=False):
        super(HPNScoringNetworkInsilico, self).__init__(filename,
                verbose=verbose)

        self.read_file(filename)
        self.true_graph = self._load_true_graph_from_zip()

[docs]    def read_file(self, filename):
        self.filename = filename
        try:
            self.loadZIPFile(self.filename)
        except Exception as err:
            print(err)
            # it not read permission loadZIP and get_eda will fail,
            self.error("Could not read the data (invalid ZIP ?)")

        try:
            self.user_graph = self._get_participant_eda()
        except Exception as err:
            print(err)
            # it not read permission loadZIP and get_eda will fail,
            self.error("Could not compute EDA")

    def _load_true_graph_from_zip(self):
        """Reads true graph  provided by Steven 10 July 2013

        The ZIP file should contain a 20 by 20 matrix

        """
        filename = self.getpath_gs("TrueGraph.csv")
        reader = csv.reader(open(filename, "r"))
        data = np.array(list(reader), dtype="float")
        return data

    def _get_participant_eda(self):
        """This function retrieves the EDA file from a participant


        :return: a numpy matrix with column and row names stored in :attr:`species`

        """
        filenames = [x for x in self.zip_filenames if x.endswith(".eda") and "__MACOSX" not in x]
        assert len(filenames) == 1
        filename = filenames[0]
        node1, node2, edges, scores = self.get_eda_file(filename)

        N = 20
        species =  ["AB"+str(i) for i in range(1,N+1)]

        tempdata  = np.zeros((N, N))
        for x,y,z in zip(node1, node2, scores):
            i = species.index(x.upper())
            j = species.index(y.upper())
            tempdata[i][j] = z

        tempdata = abs(tempdata)

        M = tempdata.max()
        if M>1:
            tempdata /= float(M)
        #self.species = species[:]

        return tempdata

[docs]    def to_eda(self, filename):
        """EXports the user EDA file"""
        fh = open(filename, "w")
        fh.write("EdgeScore\n")
        for i in range(0,20):
            for j in range(0,20):
                fh.write("AB%s (1) AB%s = %s\n" % (i+1, j+1, self.user_graph[i][j]))

        fh.close()

[docs]    def get_roc(self):
        """Gets a ROC instance using thegiven the user and true graphs as inputs"""
        roc = ROC()
        roc.scores = self.user_graph.flatten()
        roc.classes = self.true_graph.flatten()
        return roc

[docs]    def get_auc(self):
        roc = ROC()
        roc.scores = self.user_graph.flatten()
        roc.classes = self.true_graph.flatten()
        roc_data = roc.get_roc()
        auc = roc.compute_auc(roc_data)
        return auc

    def _get_auc(self):
        return self.get_auc()
    auc = property(_get_auc)

[docs]    def get_null_auc_aupr(self, N):
        """Get null distribution of the AUCs and AUPRs

        :param int N: number of samples
        :return: tuple made of 2 lists: the AUCc ad AUPRs

        """

        roc = rocs.ROC()
        roc.scores = self.user_graph.flatten()
        roc.classes = self.true_graph.flatten()

        aucs = []
        auprs = []
        for i in range(0, N):
            if divmod(i, 1000)[1] == 0 and i!=0:
                print("%s completed" % (i/float(N)*100))
            roc.scores = np.random.uniform(0,1,400)
            roc_data = roc.get_roc()
            auc = roc.compute_auc(roc_data)
            aupr = roc.compute_aupr(roc_data)
            aucs.append(auc)
            auprs.append(aupr)
        return aucs, auprs

[docs]    def plot_null_distribution(self, aucs=None, auprs=None, N=10000):
        """Plots the null distribution of the AUCs


        .. plot::
            :include-source:
            :width: 80%

            from dreamtools.dream8.D8C1 import HPNScoringNetworkInsilico
            from dreamtools import D8C1
            import os

            s = HPNScoringNetworkInsilico()
            filename = D8C1().download_template('SC1B')
            s.read_file(filename)
            aucs, auprs = s.get_null_auc_aupr(1000)
            s.plot_null_distribution(aucs)
            from pylab import xlim
            xlim([0.35,0.65])


        """
        if aucs == None and auprs == None:
            aucs, auprs = self.get_null_auc_aupr(N)

        if aucs:
            pylab.figure(1)
            pylab.clf()
            pylab.hist(aucs, bins=100, normed=True)
            pylab.grid(True)
            pylab.xlim([0,1])
            pylab.title("AUCs null distribution")

            import scipy.stats
            mu, sigma = scipy.stats.norm.fit(aucs)
            X = pylab.linspace(0,1,100)
            pylab.plot(X, scipy.stats.norm.pdf(X, loc=mu, scale=sigma), "r")
            #except Exception:
            #    raise Exception

            pylab.savefig("SC1B_aucs_null_distribution.png")

        if auprs:
            pylab.figure(2)
            pylab.clf()
            auprs2 = [x for x in auprs if np.isnan(x)==False]
            pylab.hist(auprs2, bins=100, normed=True)
            pylab.xlim([0,1])
            pylab.grid(True)
            pylab.title("AUCs null distribution")
            pylab.savefig("SC1B_auprs_null_distribution.png")

[docs]    def get_zscore(self):
        """Returns scores for the current submission

        :return: a single value based on the assumption that the distribution of
            the NULL AUC follows a gaussian distribution with parameters that
            are hardcoded as mu=0.497404 and std=0.037436.

            aucs2, auprs2 = s.get_null_auc_aupr(500000)
            scipy.stats.gamma.fit([x for x in auprs if numpy.isnan(x)==False])
            scipy.stats.norm.fit(aucs)

        """
        # based on 1,000,000 simu

        norm_params = {"mu": 0.4974677, "sigma": 0.0373782}
        #gamma_params = {
        #    "a":      7.23283255,
        #    "loc":    0.13995634,
        #    ":scale": 0.00838379}

        roc = self.get_roc()
        roc_data = roc.get_roc()
        auc = roc.compute_auc(roc_data)
        #aupr = roc.compute_aupr(roc_data)
        return (auc-norm_params['mu'])/ norm_params['sigma']

[docs]    def compute_score(self):
        """The official score for the SC1B challenge

        :return: zscore

        """
        try:
            zscore = self.get_zscore()
            return zscore
        except:
            pass


class HPNScoringPredictionBase(HPNScoring):
    def __init__(self, filename=None, verbose=False):
        super(HPNScoringPredictionBase, self).__init__(verbose=verbose)
        self.times = [0, 5, 15, 30, 60, 120,240]
        self.load_species()
        self.filename = filename


[docs]class HPNScoringPrediction(HPNScoringPredictionBase):
    test_synapse_id = "syn2000886"
    true_synapse_id = "syn2009136"   # for now it is local in the SC2A file. Could use
                           # theMIDAS as well
    def __init__(self, filename=None, version=2, verbose=False):
        super(HPNScoringPrediction, self).__init__(filename)
        self.loadZIPFile(self.filename)

        # version 1 is the first LB before publication
        # version 2 is the corrected and final LB after publication
        assert version in [1, 2]
        self._version = version

        c = Challenge('D8C1')
        filename = c.get_pathname("TruePrediction.zip")
        self.true_desc_filename = filename

        # same as species_to_ignore + mTOR + target of the inhibitors email
        # steven hill 10/7/2013
        self.phosphos_to_exclude = {
                'MCF7': ['TAZ_pS89', 'FOXO3a_pS318_S321', 'mTOR_pS2448'],
                'UACC812': ['TAZ_pS89','mTOR_pS2448'],
                'BT20': ['TAZ_pS89', 'FOXO3a_pS318_S321', 'mTOR_pS2448'],
                'BT549': ['TAZ_pS89','mTOR_pS2448']
        }

        self.rmse = {}
        self.get_true_prediction()
        self.get_user_prediction()

[docs]    def get_user_prediction(self):
        """

        should be MIDAS files as in https://www.synapse.org/#!Synapse:syn1973835
        """
        # look only at inhibitor 3
        self.user_prediction = {}

        filenames = [x for x in self.zip_filenames if x.endswith(".csv")
                if "TestInhib3" in x and "MACOS" not in x]

        if len(filenames) != 4:
            self.error("Got (%s) files with a correct pattern. Expected 4" % len(filenames))
            print(filenames)

        if self.verbose:
            print("Loading MIDAS files")

        for filename in filenames:
            tail = os.path.split(filename)[-1]
            team, cell, ligand, other = tail.split("-")
            if cell not in self.valid_cellLines:
                self.error("filename with invalid cellLine %s " % cell)

            # not easy to find universal way of reading zip file and decode it
            # under max/linux/windows across py2 and py3...
            zipdata = self.zip_data.read(filename).decode()
            zipdata = zipdata.splitlines()
            #data_iter = csv.reader(zipdata, delimiter=",")

            # get the header
            header = zipdata[0]
            header = [x.strip() for x in header.split(",")]

            # There should be 56 lines exacly with the 8 stimuli times 7 times
            # for all phosphos
            if len(header) == 0:
                self.error("Cannot read the header : %s in %s" % (header, filename ))
                return

            if header[0] != "TR:%s:CellLine" % cell:
                self.error("Error in the header of %s (first row)" % filename )

            stimuli = [x.split(":")[1] for x in header[1:9]]
            if sorted(stimuli) != sorted(self.valid_ligands):
                self.error("Error in the header of %s (invalid stimuli ?)" % filename )

            if header[9] not in ["DA:ALL", "DA::ALL"]:
                self.error("Expected a DA:ALL column in the header at column 10, which was not found")

            phosphos = [x.split(":")[1] for x in header[10:]]

            if len(phosphos) != self.valid_length[cell] and len(phosphos) != self.valid_length_extended[cell]:
                print("WARNING: length don't seem correct")

            if "PKC-delta_pS664" in phosphos:
                index = phosphos.index("PKC-delta_pS664")
                phosphos[index] = "PKC-delta"

            self.user_prediction[cell] = {}
            for phospho in phosphos:
                self.user_prediction[cell][phospho] = {}
                for ligand in self.valid_ligands:
                    self.user_prediction[cell][phospho][ligand] = [0,0,0,0,0,0,0] # 7 times

            # TODO: check that phosphos agree with the true prediction ?
            for iter, row in enumerate(zipdata[1:]):
                row = row.split(",")# get time index
                if len(row)==0:
                    continue
                time = int(row[9])
                itime = self.times.index(time)

                # get the stimuli name for that row
                this_stimuli = [int(x) for x in row[1:9]]
                if sum(this_stimuli) !=1:
                    self.error("Error: found a row with 0 or more than 1 stimuli on row %s (cell %s)" % (iter, cell))
                this_stimulus = self.valid_ligands[this_stimuli.index(1)] # get the unique ligand name

                for ip, phospho in enumerate(phosphos):
                    datum = row[10 + ip]
                    if datum == "NA":
                        datum = np.nan
                    else:
                        try:
                            datum = float(datum)
                        except Exception:
                            print(row)
                            raise Exception
                    self.user_prediction[cell][phospho][this_stimulus][itime] = datum

[docs]    def get_true_prediction(self):
        """Reads true predcition from the 4 CSV files that contain the true prediction

        data is stored as follows in the tru_prediction attribute:

        """
        self.true_prediction = {}

        zipdata = zipfile.ZipFile(self.true_desc_filename)

        for cell in self.valid_cellLines:
            self.true_prediction[cell] = {}

            filename = [x for x in zipdata.namelist() if "%s_main_Test.csv" % cell in x][0]
            data = zipdata.read(filename).decode().splitlines()

            #filename = "SC2A/%s_main_Test.csv" % cell
            #print("Reading true prediction from %s" % filename)
            #fh = open(filename, "r")
            #data = csv.reader(fh)
            # skip first line
            data = data[1:]

            if cell == "UACC812": # skip another line
                data = data[1:]

            names = data[0][:]
            hugoId = data[1][:]
            # skip another line
            data = data[3:]
            #dummy = str(next(data))

            names = [x.strip() for x in names.split(",")]
            # now the data with col1 = cellline, co2=inhibito, col3=stimulus,
            # col4=timepoints
            indices_to_exclude = [0, 1, 2, 3]
            # create the phospho keys
            for this in self.phosphos_to_exclude[cell]:
                index = names.index(this)
                indices_to_exclude.append(index)

            # create the key to hold this cell line
            self.true_prediction[cell] = {}

            # create the key to hold all phosphos
            for i,name in enumerate(names):
                if i not in indices_to_exclude:
                    self.true_prediction[cell][name] = {}
                    # create the key to hold all ligans
                    for ligand in self.valid_ligands:
                        # build up the 7-length vector to hold the data
                        # each element is a list so that we can append
                        # replicates if any
                        # note a python caveat [[]] * 7 creates 7 references
                        # empty lists. don't use.
                        self.true_prediction[cell][name][ligand] = [[],[],[],[],[],[],[]]
                else:
                    pass

            # now scan the data
            times = ["0min", "5min", "15min", "30min", "60min", "2hr", "4hr"]
            for row in data:
                row = [x.strip() for x in str(row).split(",")]
                if row[1] == "AZD8055": # this is the inhibitor we are interested in
                    ligand = row[2]
                    if ligand not in self.valid_ligands and ligand!="":
                        raise ValueError("Invalid ligand")

                    # get time and correspoding index in the list of valid times
                    thistime = row[3]
                    time_index = times.index(thistime)

                    # append the data to the relevant phospho index
                    for i, phospho in enumerate(names):
                        # ignore the indices related to FOX, TAX, mTor
                        if i not in indices_to_exclude:
                            datum = row[i]
                            if ligand == "": # happends for time=0min only
                                assert time_index==0
                                for this_ligand in self.valid_ligands:
                                    self.true_prediction[cell][phospho][this_ligand][0].append(float(datum))
                            else:
                                assert time_index!=0
                                self.true_prediction[cell][phospho][ligand][time_index].append(float(datum))

            # Finally, we need to get the average to take care of replicates
            for phospho in self.true_prediction[cell].keys():
                for ligand in self.true_prediction[cell][phospho].keys():
                    for i,time in enumerate(times):
                        vector = self.true_prediction[cell][phospho][ligand][i]

                        if len(vector)==0:
                            m = np.nan
                        else:
                            m = 2**(sum([np.log2(x) for x in vector])/float(len(vector)))
                        self.true_prediction[cell][phospho][ligand][i] = m

        # some sanity checks
        for key in self.valid_length.keys():
            assert self.valid_length_extended[key]-len(self.phosphos_to_exclude[key]) == len(self.true_prediction[key].keys())

[docs]    def get_rmse(self, cellLine, phospho):
        """

        .. warning:: x in converted into a log2 scale
        """

        RMSE = 0
        #T = 7.
        #S = 8.

        counter = 0
        for s in self.valid_ligands:
            true  = np.array(self.true_prediction[cellLine][phospho][s])
            user  = np.array(self.user_prediction[cellLine][phospho][s])
            # some values provided may be set to 0
            # so remove the infinite caused by log(0).
            #RMSE += nansum( [x for x in (log2(true)-log2(user))**2 if x!=inf]  )
            data = [x for x in (np.log2(true)-np.log2(user))**2 if x!=np.inf]
            N = len([x for x in data if np.isnan(x) == False])
            try:
                import bottleneck as bn
                RMSE += bn.nansum(data)
            except:
                RMSE += np.nansum(data)


            #RMSE += np.nansum(data)
            counter += N
        # TODO: if nans are include, do we need to change T ?
        if counter != 0:
            RMSE /= float(counter)
            RMSE = np.sqrt(RMSE)
        else:
            RMSE = np.nan
        return RMSE

[docs]    def compute_all_rmse(self):
        """Some species were removed on purpose during the analysis

        Those are hardcoded. To compute null distribution, we can keep
        all the species, in which case, _version parameter must be 0
        must be set to False.
        """
        self.rmse = {}
        for c in self.valid_cellLines:
            self.rmse[c] = {}
            for l in self.species[c]:

                if self._version == 1:
                    if c == "MCF7" and l in [ 'c-Met_pY1235', "YB-1_PS102"]:
                        continue
                    if c == "BT20" and l in ["4EBP1_pS65", '4EBP1_pT37_pT46', "c-Met_pY1235"]:
                        continue
                    if c == "UACC812" and l in ["4EBP1_pT37_pT46", "4EBP1_pS65"]:
                        continue
                elif self._version == 2:
                    if c == "MCF7" and l in ['c-Met_pY1235', "YB-1_PS102",
                            "PDK1_pS241"]:
                        continue
                    if c == "BT20" and l in ["4EBP1_pS65", '4EBP1_pT37_pT46', "c-Met_pY1235"]:
                        continue
                    if c == "UACC812" and l in ["4EBP1_pT37_pT46", "4EBP1_pS65",
                            "CHK1_pS345", "NDRG1_pT346", 'PDK1_pS241',
                            'PKC-alpha_pS657', 'PKC-delta']:
                        continue
                    if c == 'BT549' and l in ['BAD_pS112', 'HER3_pY1298',
                            'PKC-delta']:
                        continue

                else: # keep going don't skip anything (for null distro)
                    pass

                if l not in self.phosphos_to_exclude[c]:
                    rmse = self.get_rmse(c, l)
                    self.rmse[c][l] = rmse
                else:
                    if self.verbose:print("skip %s/%s" % (c,l))

[docs]    def get_training_data(self):
        self.training = {}
        filenames = [
                "MD-BT20_main_for_null.csv",
                "MD-BT549_main_for_null.csv",
                "MD-MCF7_main_for_null.csv",
                "MD-UACC812_main_for_null.csv"]

        for filename in filenames:
            dummy, cell = filename.split("-")
            cell, other = cell.split("_", 1)
            filename = self.getpath_data(filename)
            if self.verbose:
                print("Scanning %s" % filename)

            data_iter = csv.reader(open(filename, "r"), delimiter=",")

            # get the header
            header = data_iter.next()
            # There should be 56 lines exacly with the 8 stimuli times 7 times
            # for all phosphos
            if header[0] != "TR:%s:CellLine" % cell:
                self.error("Error in the header of %s (first row)" % filename )

            stimuli = [x.split(":")[1] for x in header[1:9]]
            if sorted(stimuli) != sorted(self.valid_ligands):
                self.error("Error in the header of %s (invalid stimuli ?)" % filename )

            N = 3
            if cell == "BT549":
                N = 2

            #inhibitors = []
            if header[8+N+1] != "DA:ALL":
                self.error("Expected a DA:ALL column in the header at column 11 or 12, which was not found")

            phosphos = [x.split(":")[1] for x in header[N+8+2:]]

            if len(phosphos) != self.valid_length[cell] and len(phosphos) != self.valid_length_extended[cell]:
                print("WARNING: length don't seem correct")

            #if "PKC-delta_pS664" in phosphos:
            #    index = phosphos.index("PKC-delta_pS664")
            #    phosphos[index] = "PKC-delta"

            self.training[cell] = {}
            for phospho in phosphos:
                self.training[cell][phospho] = []

            # TODO: check that phosphos agree with the true prediction ?
            for iter, row in enumerate(data_iter):
                # append all data for a given phosphos over all stims,
                # inhibitor+DMSO and times
                for ip, phospho in enumerate(phosphos):
                    datum = row[N+8+2+ip]
                    if datum == "NA":
                        datum = np.nan
                    else:
                        datum = float(datum)

                    self.training[cell][phospho].append(datum)

[docs]    def get_null(self, N=100, tag="sc2a"):
        """

        ::

            s = HPNScoringPrediction()
            nulls = s.get_null(1000)
            # the nulls contains the 4 cell lines
            # let us save the first one
            for name in ['UACC812', 'BT549', 'MCF7', 'BT20']:
                data = [x[name] for x in nulls]
                fh = open('%s.json' % name, 'w')
                import json
                json.dump(data, fh)
                fh.close()

        """
        self.get_training_data()

        all_rmses = []
        for i in range(0, N):
            print(tag + " " + str(i))
            temp = self._filter_species
            self.user_prediction = self.create_random_data()
            self.compute_all_rmse()
            self._filter_species = temp
            all_rmses.append(copy.deepcopy(self.rmse))
        return all_rmses

[docs]    def create_random_data(self):
        """ Here, we don't want the true prediction that contains only what is
        requested (AZD8055) but the orignal training data with 2 or 3
        inhibitors such as GSK and PD17 so that we can shuffle them.

        We want to select for a given cell line and phosphos a data set to fill
        at a given time. The datum is selected accross the 8 stimuli, inhibitors
        +DMSO, and time points.

        TAZ and FOX were asked to be excluded so this cause some trouble now but
        some user preidction still include them. Should add a if statement to
        ignore them. Does not matter to compute the null distribution
        """
        import random
        #data = copy.deepcopy(self.user_prediction)
        data = {}
        self.compute_all_rmse()

        for c in self.rmse.keys():
            data[c] = {}
            for p in self.rmse[c].keys():
                data[c][p] = {}
                #for s in data[c][p].keys():
                for s in self.ligands_names_steven:
                    # select 7 random values for 7 time points for each stimuli
                    data[c][p][s] = random.sample(self.training[c][p], 7)[:]
        return data

    def _get_mean_and_sigma_null_parameters(self):
        """Retrieve mean and sigma for 4 celllines and phosphos"""
        filename = 'sc2a_null_mu_sigma_new.json'
        filename = os.sep.join(["data", filename])
        with open(filename) as fh:
            mu_sigma = json.load(fh)
        mu = {}
        sigma = {}
        for c in self.valid_cellLines:
            mu[c] = {}
            sigma[c] = {}
            for p in self.valid_phosphos[c]:
                mu[c][p] = mu_sigma[c][p]['mu']
                sigma[c][p] = mu_sigma[c][p]['sigma']
        return mu, sigma

[docs]    def get_mean_rmse(self):
        return np.nanmean([self.rmse[k1][k2] for k1 in self.rmse.keys()
            for k2 in self.rmse[k1].keys()])


[docs]class HPNScoringPredictionInsilico(HPNScoringPredictionBase):
    """

    dimension1 :inhibitor
    dimenssion2: phosp
    dimensson3 stimulus
    dimnesion4 : time


    """
    test_synapse_id =  "syn2009175"
    true_synapse_id = "syn2143242"

    def __init__(self, filename=None, verbose=False, version=2):
        """SC2B sub challenge (prediction in silico)

        :param filename: file to score
        :param str version: default to 'official' (see note below). Set to
            anything else to use correct network

        .. note:: This code use the official gold standard used
            in https://www.synapse.org/#!Synapse:syn1720047/wiki/60532 . Note,
            however, that a new corrected version is now provided and may be used.
            Differences with the official version should be small and have no effect on
            the ranking shown in the synapse page.
        """
        super(HPNScoringPredictionInsilico, self).__init__(filename)
        # WRONG NETWORK as used in the official LB before final corrections
        self.version = version
        if self.version == 1:
            c = Challenge('D8C1')
            fname = c.get_pathname("TruePredictionInsilico.zip")
        elif self.version == 2:
            # CORRECT NETWORK
            c = Challenge('D8C1')
            fname = c.get_pathname("TruePredictionInsilico2.zip")
        else:
            raise ValueError("version must be either 1 or 2")
        self.true_desc_filename = fname

        #self.loadZIPFile(self.filename)
        self.valid_cellLines = [""]
        self.times = [0, 1,2,4,6,10,15,30,60,120]
        self.stimuli = ["loLIG1", "hiLIG1", "loLIG2", "hiLIG2",
                "loLIG1_loLIG2", "loLIG1_hiLIG2", "hiLIG1_loLIG2",
                "hiLIG1_hiLIG2"]
        self.inhibitors = ["AB%s"%x for x in range(1,21)]
        self.phosphos = ["AB%s"%x for x in range(1,21)]

        if self.filename is not None:
            self.get_user_prediction()
        #except:
        #    print("could not read user prediction")

        self.get_true_prediction()

[docs]    def get_user_prediction(self):
        results = self.read_prediction_insilico(self.filename)
        self.user_prediction = copy.deepcopy(results)

[docs]    def get_true_prediction(self):
        #results = self.read_prediction_insilico(self.true_desc_filename)
        results = self.read_true_prediction_michael(self.true_desc_filename)
        self.true_prediction = copy.deepcopy(results)

[docs]    def read_true_prediction_michael(self, filename):
        # this is almost the same as in the read_prediction
        results = {}
        zipdata = zipfile.ZipFile(filename)
        for inhib in self.inhibitors: # AB19 is missing from Michael set.
            expected_ending = "-%s-Prediction-Insilico.csv" % inhib
            filename = [x for x in zipdata.namelist() if x.endswith(expected_ending)]
            if len(filename) != 1:
                self.error("Could not find file ending in %s" % expected_ending)
                break
            filename = filename[0]
            # initialise dictionary for the inhibitor
            results[inhib] = {}

            # read data
            data = zipdata.read(filename).decode() # from bytes to str
            data = data.splitlines()

            # get header
            header = data[0]
            header = [x.strip() for x in header.split(",")]

            stimuli = [x.split(":")[1] for x in header if x.endswith("Stimuli")]
            phosphos = [x.split(":")[1] for x in header if x.startswith("DV:")]
            assert len(phosphos)==20

            assert len(stimuli) == 4
            assert stimuli == self.stimuli[0:4], "incorrect stimuli %s " % stimuli

            # create the key to hold all phosphos
            for i,phospho in enumerate(phosphos):
                results[inhib][phospho] = {}
                # create the key to hold all ligans
                for stimulus in self.stimuli:
                    results[inhib][phospho][stimulus] = [[] for i in range(10)]
                else:
                    pass

            # now scan the data
            for row in data[1:]:
                row = [x.strip() for x in row.split(",")]

                these_stimuli = row[1:5] #we have only 4 stimuli im Michael file.
                # However, several may be st to 1
                stim_index = [i for  i,stim in enumerate(these_stimuli) if stim == '1' ]

                assert len(stim_index)==1 or len(stim_index)==2
                if len(stim_index) == 1:
                    stim_index = stim_index[0]
                else:
                    # ["loLIG1", "hiLIG1", "loLIG2", "hiLIG2",  "loLIG1_loLIG2", "loLIG1_hiLIG2", "hiLIG1_loLIG2", "hiLIG1_hiLIG2"]
                    if sorted(stim_index) == [0,2]: #loLIG1_loLIG2
                        stim_index = 4
                    elif sorted(stim_index) == [0,3]: #loLIG1_hiLIG2
                        stim_index = 5
                    elif sorted(stim_index) == [1,2]: #hiLIG1_loLIG2
                        stim_index = 6
                    elif sorted(stim_index) == [1,3]: #hiLIG1_hiLIG2
                        stim_index = 7
                    else:
                        raise ValueError

                # get time and correspoding index in the list of valid times
                thistime = float(row[5])
                if thistime == 45:
                    # to skip. THere is an extra time to be ignored since we
                    # asked users not to provide the time 45
                    continue
                time_index = self.times.index(thistime)

                # append the data to the relevant phospho index
                for i, phospho in enumerate(phosphos):
                    # ignore the indices related to FOX, TAX, mTor
                    datum = row[i+1+4+1] # +cell line +1 time + 4 stimuli
                    if datum == "NA":
                        results[inhib][phospho][self.stimuli[stim_index]][time_index]= np.nan
                    else:
                        results[inhib][phospho][self.stimuli[stim_index]][time_index] = float(datum)

        return results

[docs]    def read_prediction_insilico(self, filename):
        """Reads true predcition from the 20 CSV files"""
        results = {}
        zipdata = zipfile.ZipFile(filename)

        for inhib in self.inhibitors:
            expected_ending = "-%s-Prediction-Insilico.csv" % inhib
            filename = [x for x in zipdata.namelist() if x.endswith(expected_ending) and "MACOS" not in x]

            if len(filename) != 1:
                self.error("Could not find file ending in %s" % expected_ending)
                break
            filename = filename[0]

            # initialise dictionary for the inhibitor
            results[inhib] = {}

            # read data
            data = zipdata.read(filename).decode()
            data = data.splitlines()


            # get header
            header = data[0]
            header = [x.strip() for x in header.split(",")]

            stimuli = [x.split(":")[1] for x in header if x.endswith("Stimuli")]
            stimuli = [x.replace("+", "_") for x in stimuli]  # some users put "+" instead of "_" ...
            phosphos = [x.split(":")[1] for x in header if x.startswith("DV:")]
            if len(phosphos) != 20:
                self.error("Number of expected phosphos (20 )is incorrect (Found %s in %s)  "
                        %(len(phosphos), filename))
                continue

            if len(stimuli) != 8:
                self.error("Number of expected stimuli (8) is incorrect (Found %s in %s)  "
                        %(len(stimuli), filename))
                continue

            for s in stimuli:
                if s not in self.stimuli:
                    self.error("found incorrect stimuli (%s) in %s" % (s, filename))
                    continue

            # create the key to hold all phosphos
            for i,phospho in enumerate(phosphos):
                results[inhib][phospho] = {}
                # create the key to hold all ligans
                for stimulus in self.stimuli:
                    results[inhib][phospho][stimulus] = [[] for i in range(10)]
                else:
                    pass

            # now scan the data
            # skip header
            for i, row in enumerate(data[1:]):
                row = [x.strip() for x in row.split(",")]

                these_stimuli = [int(float(x)) for x in row[1:9]]
                stim_index = [i for  i, stim in enumerate(these_stimuli) if stim == 1 ]
                #assert len(stim_index)==0, "no stimuli found on line %s in %s" % (i, filename)
                assert len(stim_index)==1, "more than 1 stimuli on line %s in %s" % (i, filename)
                stim_index = stim_index[0]

                # get time and correspoding index in the list of valid times
                thistime = float(row[9])
                time_index = self.times.index(thistime)

                # append the data to the relevant phospho index
                for i, phospho in enumerate(phosphos):
                    # ignore the indices related to FOX, TAX, mTor
                    datum = row[i+1+8+1] # +cell line +1 time + 8 stimuli
                    if datum == "NA" or datum =='?':
                        results[inhib][phospho][stimuli[stim_index]][time_index]= np.nan
                    else:
                        results[inhib][phospho][stimuli[stim_index]][time_index] = float(datum)

        return results

[docs]    def get_rmse(self, inhibitor, phospho):
        """

        .. warning:: x in converted into a log2 scale
        """
        RMSE = 0
        #T = 7.
        #S = 8.

        counter = 0
        for s in self.stimuli:
            true  = np.array(self.true_prediction[inhibitor][phospho][s])
            user  = np.array(self.user_prediction[inhibitor][phospho][s])
            data = [x for x in (np.log2(true)-np.log2(user))**2 if x!=np.inf]
            N = len([x for x in data if np.isnan(x) == False])
            #N = len(data)
            try:
                import bottleneck as bn
                RMSE += bn.nansum(data)
            except:
                RMSE += np.nansum(data)
            counter += N

        # TODO: if nans are include, do we need to change T ?
        if counter != 0:
            RMSE /= float(counter)
            RMSE = np.sqrt(RMSE)
        else:
            RMSE = np.nan

        return RMSE

[docs]    def compute_all_rmse(self, null=False):
        self.rmse = {}
        for c in self.inhibitors:
            if c == "AB9":
                continue
            self.rmse[c] = {}
            for l in self.phosphos:
                if l==c:
                    continue
                if l in ["AB3", "AB13", "AB18"] or c in ["AB3", "AB13", "AB18"]:
                    continue

                # those nodes have been ignored in the official leaderboards
                # based on the participants submissions.
                # To get the same results for a new participants, those nodes
                # should be keep the same. Although, we a entirely new set of
                # participants, those should be updated.
                if null is True:
                    dummy_nodes = []
                else:
                    #computed using 100,000 null distributions
                    dummy_nodes = [(2,1), (6,1),(7,1), (6,4),(2,5), (6,5),(16,6),
                        (16,7), (6,8),
                        (16,8),(6,9),(7,9),(16,9),(20,9), (6,10),(6,12),(2,14),(6,14),
                        (7,14), (2,15), (16,15), (5,16), (5,17), (5,19), (16,19)]

                    if self.version == 2:
                        dummy_nodes = [(16,4), (16,14)]
                stop = False
                for node in dummy_nodes:
                    n1 = "AB" + str(node[0])
                    n2 = "AB" + str(node[1])
                    if c == n1 and l == n2:
                        stop = True
                if stop:
                    continue
                if l != c:
                    rmse = self.get_rmse(c, l)
                    self.rmse[c][l] = rmse

[docs]    def get_training_data(self):
        self.training = {}
        filenames = self.getpath_data("MD_insilico.csv")

        for filename in filenames:
            if self.verbose:
                print("Scanning %s" % filename)

            data_iter = csv.reader(open(filename, "r"), delimiter=",")

            # get the header
            header = data_iter.next()
            # There should be 56 lines exacly with the 8 stimuli times 7 times
            # for all phosphos
            if header[0] != "TR:inSilico:CellLine":
                self.error("Error in the header of %s (first row)" % filename )

            stimuli = [x.split(":")[1] for x in header[1:8]]
            #inhibitors = []
            if header[8] != "DA:ALL":
                self.error("Expected a DA:ALL column in the header at column 9, which was not found")

            phosphos = [x.split(":")[1] for x in header[9:]]

            self.training = {}
            for phospho in phosphos:
                self.training[phospho] = []

            for iter, row in enumerate(data_iter):
                # append all data for a given phosphos over all stims,
                # inhibitor+DMSO and times
                for ip, phospho in enumerate(phosphos):
                    datum = row[7+2+ip]
                    if datum == "NA":
                        datum = np.nan
                    else:
                        datum = float(datum)
                    self.training[phospho].append(datum)

[docs]    def get_null(self, N=100, tag="sc2b"):
        self.get_training_data()

        all_rmses = []
        for i in range(0, N):
            print(tag + " " + str(i))
            self.user_prediction = self.create_random_data()
            self.compute_all_rmse(null=True)
            all_rmses.append(copy.deepcopy(self.rmse))
        return all_rmses

[docs]    def create_random_data(self):

        """ Here, we don't want the true prediction that contains only what is
        requested (AZD8055) but the orignal training data with 2 or 3 inhibitors such as
        GSK and PD17 so that we can shuffle them.

        We want to select for a given cell line and phosphos a data set to fill
        at a given time. The datum is slected accross the 8 stimuli, inhibitors
        +DMSO, and time points.
        """
        import random
        #data = copy.deepcopy(self.true_prediction)
        data = {}

        for c in self.training.keys():
            data[c] = {}
            for p in self.training.keys():
                data[c][p] = {}
                for s in self.true_prediction[c][p].keys():
                    # select randomly for 10 time points for each
                    # inhibitor (8) (no 45 minute point).
                    data[c][p][s] = random.sample(self.training[c], 10)[:]
        return data

    def _get_mean_and_sigma_null_parameters(self):
        """Retrieve mean and sigma for 32 combi from a null AUC distribution"""
        filename = 'sc2b_null_mu_sigma.json'
        filename = self.getpath_data(filename)
        # a dict contain dict (phospho) of dict (phospho) of dict (mu,sigma)
        with open(filename) as fh:
            mu_sigma = json.load(fh)
        mu = {}
        sigma = {}
        for c in self.phosphos:
            mu[c] = {}
            sigma[c] = {}
            for p in self.phosphos:
                if p == c:
                    mu[c][p] = np.nan
                    sigma[c][p] = np.nan
                else:
                    mu[c][p] = mu_sigma[c][p]['mu']
                    sigma[c][p] = mu_sigma[c][p]['sigma']
        return mu, sigma

[docs]    def get_mean_zscores(self):
        zscore = self.get_zscores(self.rmse)
        data = [zscore[k1][k2] for k1 in zscore.keys() for k2 in zscore[k1].keys()]
        mu = np.mean([x for x in data if np.isnan(x)==False])
        return mu

[docs]    def get_zscores(self, rmses=None):
        mu, sigma = self._get_mean_and_sigma_null_parameters()
        zscores = {}
        for c in self.inhibitors:
            zscores[c] = {}
            for p in self.phosphos:
                if c!=p:
                    if rmses!=None:
                        zscores[c][p] = (rmses[c][p] - mu[c][p])/sigma[c][p]
                    else:
                        zscores[c][p] = (self.rmse[c][p] - mu[c][p])/sigma[c][p]
                else:
                    zscores[c][p] = np.nan
        return zscores

[docs]    def get_mean_rmse(self):
        d = self.rmse
        return np.mean([d[k1][k2] for k1 in d.keys() for k2 in d[k1].keys() if
            np.isnan(d[k1][k2])==False])


[docs]class HPNScoringPrediction_ranking(HPNScoring):
    """This class is used to compute the ranks of the different participants
    based on an average rank over the 4 cell lines times phosphos

    ::

        s = HPNScoringPrediction(filename="file1zip")
        s.compute_all_rmes()

        r = HPNScoringPrediction_ranking()
        # s.aucs is a list where each element is a dictionary of
        r.add_rme(s1.rmse, "team1")

        rmse1 = r.get_randomised_rmse(r.rmse[0], sigma=1)
        rmse2 = r.get_randomised_rmse(r.rmse[0], sigma=2)
        rmse3 = r.get_randomised_rmse(r.rmse[0], sigma=3)

        r.add_rmse(rmse1, "team2")
        r.add_rmse(rmse2, "team3")
        r.add_rmse(rmse3, "team4")

        sall.add_rmse(s2.rmse, "team2")
        sall.get_ranking()
        {'team1': 1.96875, 'team2': 1.03125}


    This class is independent of HPNSCoringPrediction.
    However, it takes as input the returned values of
    HPNScoringPrediction.compute_all_rmse()

    """
    def __init__(self):
        super(HPNScoringPrediction_ranking, self).__init__()
        self.valid_cellLines = ['UACC812', 'MCF7', 'BT549', 'BT20']
        self.rmse = []
        self.participants = []

    def _get_empty_rmse(self):
        data = {}
        for c in self.valid_cellLines:
            data[c] = {}
            for s in self.species[c]:
                data[c][s] = np.inf
        return data

[docs]    def add_rmse(self, data, participant_id):
        if participant_id in self.participants:
            raise ValueError("participant already added")
        if data == None:
            data = self._get_empty_rmse()
        self.rmse.append(copy.deepcopy(data))
        self.participants.append(participant_id)
        self._compute_ranks()

    def _get_species(self):
        if len(self.rmse)>=1:
            species = {}
            for c in self.valid_cellLines:
                species[c] = self.rmse[0][c].keys()
            return species
        else:
            print("you must provide a rmse dict at least to get the species")
    species = property(_get_species)

    def _rank_index(self, vector):
        sorted_vector = sorted(vector)
        return [sorted_vector.index(x) for x in vector]

    def _compute_ranks(self):
        ranks = {}
        for c in self.valid_cellLines:
            ranks[c] = {}
            for l in self.species[c]:
                # get this RMSE for all participant
                data = [x[c][l] for x in self.rmse]
                # but use a reverse order hence 1-x for the sorting
                indices = np.argsort([x for x in data])
                ranks[c][l] = [list(indices).index(i)+1 for i in range(0,len(indices))]
        self.ranks = copy.deepcopy(ranks)

[docs]    def get_ranking(self):
        ranking = {}
        for k in self.participants:
            ranks = self.get_rank_participant(k)
            mu = np.mean([ranks[k1][k2] for k1 in ranks.keys() for k2 in ranks[k1].keys()])
            ranking[k] = mu
        return ranking

[docs]    def get_mean_ranks(self):
        return self.get_ranking()

[docs]    def get_integer_ranks(self):
        ranking = self.get_ranking()
        integer_ranks = {}

        # get teams and ranks
        teams_ranks = list(ranking.items())
        teams = [x[0] for x in teams_ranks]
        mean_ranks = [x[1] for x in teams_ranks]

        for i, index in enumerate(np.argsort(mean_ranks)):
            integer_ranks[teams[int(index)]] = i+1
        return integer_ranks

[docs]    def get_rank_participant(self, participant):
        ranks = {}
        if participant in self.participants:
            index = self.participants.index(participant)
            for c in self.valid_cellLines:
                ranks[c] = {}
                for l in self.species[c]:
                    ranks[c][l] = self.ranks[c][l][index]
            return ranks
        else:
            raise ValueError("unknown participant")

[docs]    def get_randomised_rmse(self, rmse, sigma=1):
        """THis is useful for testing. See class documentaton"""
        new_rmse = copy.deepcopy(rmse)
        for c in self.valid_cellLines:
            for l in self.species[c]:
                x = new_rmse[c][l]
                new_rmse[c][l] = np.uniform(-1 * x, x*2) + x # new values is between 0 and sigma*x
                #new_rmse[c][l] = x +  sigma + x # new values is between 0 and sigma*x
        return new_rmse

    # Duplicated from HPNScoringPrediction
    def _get_mean_and_sigma_null_parameters(self):
        """Retrieve mean and sigma for 4 celllines and phosphos"""
        filename = 'sc2a_null_mu_sigma_new.json'
        filename = self.getpath_data(filename)
        with open(filename) as fh:
            mu_sigma = json.load(fh)
        mu = {}
        sigma = {}
        for c in self.valid_cellLines:
            mu[c] = {}
            sigma[c] = {}
            for p in self.valid_phosphos[c]:
                mu[c][p] = mu_sigma[c][p]['mu']
                sigma[c][p] = mu_sigma[c][p]['sigma']
        return mu, sigma

    # Duplicated from HPNScoringPrediction
[docs]    def get_mean_zscores(self):
        zscores = {}
        for i,k in enumerate(self.participants):
            zscore = self._get_zscores(self.rmse[i])
            data = [zscore[k1][k2] for k1 in zscore.keys() for k2 in zscore[k1].keys()]
            data = [x for x in data if np.isnan(x)==False]
            mu = np.mean(data)
            zscores[k] = mu
        return zscores

    # Duplicated from HPNScoringPrediction
    def _get_zscores(self, rmses=None):
        mu, sigma = self._get_mean_and_sigma_null_parameters()
        zscores = {}
        for c in self.valid_cellLines:
            zscores[c] = {}
            for p in self.valid_phosphos[c]:
                if rmses!=None:
                    zscores[c][p] = (rmses[c][p] - mu[c][p])/sigma[c][p]
                else:
                    zscores[c][p] = (self.rmse[c][p] - mu[c][p])/sigma[c][p]
        return zscores

    def _get_valid_phosphos(self):
        rmse = None
        for x in self.rmse:
            if x != None:
                rmse = x
                break
        if rmse == None:
            raise ValueError("No RMSE found. please add one (not None) using add_rmse")
        valid_phosphos = {}
        for c in rmse.keys():
            valid_phosphos[c] = rmse[c].keys()
        return valid_phosphos
    valid_phosphos = property(_get_valid_phosphos)


[docs]class HPNScoringPredictionInsilico_ranking(HPNScoring):
    """This class is used to compute the ranks of the different participants
    based on an average rank over the 4 cell lines times phosphos

    ::

        s = HPNScoringPredictionInsilico(filename="file1zip")
        s.compute_all_rmes()

        r = HPNScoringPrediction_ranking()
        # s.aucs is a list where each element is a dictionary of
        r.add_rme(s1.rmse, "team1")

        rmse1 = r.get_randomised_rmse(r.rmse[0], sigma=1)
        rmse2 = r.get_randomised_rmse(r.rmse[0], sigma=2)
        rmse3 = r.get_randomised_rmse(r.rmse[0], sigma=3)

        r.add_rmse(rmse1, "team2")
        r.add_rmse(rmse2, "team3")
        r.add_rmse(rmse3, "team4")

        sall.add_rmse(s2.rmse, "team2")
        sall.get_ranking()
        {'team1': 1.96875, 'team2': 1.03125}


    This class is independent of HPNSCoringPrediction.
    However, it takes as input the returned values of
    HPNScoringPrediction.compute_all_rmse()

    """
    def __init__(self):
        super(HPNScoringPredictionInsilico_ranking, self).__init__()
        self.times = [0, 1,2,4,6,10,15,30,60,120]
        self.inhibitors = ["AB%s"%x for x in range(1,21)]
        self.phosphos = ["AB%s"%x for x in range(1,21)]

        self.ignore = ['AB3', 'AB13', 'AB18']
        for this in self.ignore:
            self.inhibitors.remove(this)
            self.phosphos.remove(this)
        self.inhibitors.remove("AB9")

        self.rmse = []
        self.participants = []

    def _get_empty_rmse(self):
        data = {}
        for c in self.inhibitors:
            data[c] = {}
            for s in self.phosphos:
                data[c][s] = np.inf
        return data

[docs]    def add_rmse(self, data, participant_id):
        if participant_id in self.participants:
            raise ValueError("participant already added")
        if data == None:
            data = self._get_empty_rmse()
        self.rmse.append(copy.deepcopy(data))
        self.participants.append(participant_id)
        self._compute_ranks()

    def _rank_index(self, vector):
        sorted_vector = sorted(vector)
        return [sorted_vector.index(x) for x in vector]

    def _compute_ranks(self):
        ranks = {}
        for c in self.inhibitors:
            ranks[c] = {}
            phosphos = self.rmse[0][c].keys()
            for l in phosphos:
                if l!=c:
                    # get this RMSE for all participant
                    data = [x[c][l] for x in self.rmse]

                    # but use a reverse order hence 1-x for the sorting
                    indices = np.argsort([x for x in data])
                    ranks[c][l] = [list(indices).index(i)+1 for i in range(0,len(indices))]

        self.ranks = copy.deepcopy(ranks)

[docs]    def get_ranking(self):
        ranking = {}
        for k in self.participants:
            ranks = self.get_rank_participant(k)
            mu = np.mean([ranks[k1][k2] for k1 in ranks.keys() for k2 in ranks[k1].keys()])
            ranking[k] = mu
        return ranking

[docs]    def get_mean_ranks(self):
        return self.get_ranking()

[docs]    def get_integer_ranks(self):
        ranking = self.get_ranking()
        integer_ranks = {}

        # get teams and ranks
        teams_ranks = list(ranking.items())
        teams = [x[0] for x in teams_ranks]
        mean_ranks = [x[1] for x in teams_ranks]

        for i, index in enumerate(np.argsort(mean_ranks)):
            integer_ranks[teams[int(index)]] = i+1
        return integer_ranks

[docs]    def get_rank_participant(self, participant):
        ranks = {}
        if participant in self.participants:
            index = self.participants.index(participant)
            for c in self.inhibitors:
                ranks[c] = {}
                phosphos = self.rmse[0][c].keys()
                for l in phosphos:
                    if c!=l:
                        ranks[c][l] = self.ranks[c][l][index]
            return ranks
        else:
            raise ValueError("unknown participant")

[docs]    def get_randomised_rmse(self, rmse, sigma=1):
        """THis is useful for testing. See class documentaton"""
        new_rmse = copy.deepcopy(rmse)
        for c in self.inhibitors:
            phosphos = self.rmse[0][c].keys()
            for l in phosphos:
                if c!=l:
                    x = new_rmse[c][l]
                    new_rmse[c][l] = np.uniform(-1 * x, x*sigma) + x # new values is between 0 and sigma*x
                    #new_rmse[c][l] = x +  sigma + x # new values is between 0 and sigma*x
        return new_rmse

[docs]    def get_mean_zscores(self):
        zscores = {}
        for i,k in enumerate(self.participants):
            zscore = self._get_zscores(self.rmse[i])
            data = [zscore[k1][k2] for k1 in zscore.keys() for k2 in zscore[k1].keys()]
            data = [x for x in data if np.isnan(x)==False]
            mu = np.mean(data)
            zscores[k] = mu
        return zscores

    def _get_mean_and_sigma_null_parameters(self):
        """Retrieve mean and sigma for 32 combi from a null AUC distribution"""
        filename = 'sc2b_null_mu_sigma.json'
        filename = self.getpath_data(filename)
        # a dict contain dict (phospho) of dict (phospho) of dict (mu,sigma)

        with open(filename) as fh:
            mu_sigma = json.load(fh)

        mu = {}
        sigma = {}
        for c in self.inhibitors:
            mu[c] = {}
            sigma[c] = {}
            phosphos = self.rmse[0][c].keys()
            for p in phosphos:
                if p == c:
                    mu[c][p] = np.nan
                    sigma[c][p] = np.nan
                else:
                    mu[c][p] = mu_sigma[c][p]['mu']
                    sigma[c][p] = mu_sigma[c][p]['sigma']
        return mu, sigma

    def _get_zscores(self, rmses=None):
        mu, sigma = self._get_mean_and_sigma_null_parameters()
        zscores = {}
        for c in self.inhibitors:
            zscores[c] = {}
            phosphos = self.rmse[0][c].keys()
            for p in phosphos:
                if c!=p:
                    if rmses!=None:
                        zscores[c][p] = (rmses[c][p] - mu[c][p])/sigma[c][p]
                    else:
                        zscores[c][p] = (self.rmse[c][p] - mu[c][p])/sigma[c][p]
                else:
                    zscores[c][p] = np.nan
        return zscores


def sc2a_null(N=100, tag=""):
    s = HPNScoringPrediction()
    res = s.get_null(N, tag=tag)
    return res


def sc2b_null(N=100):
    s = HPNScoringPredictionInsilico()
    res = s.get_null(N)
    return res