# -*- python -*-
#
#  This file is part of DREAMTools software
#
#  Copyright (c) 2014-2015 - EBI-EMBL
#
#  File author(s): Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: http://github.org/dreamtools
#
##############################################################################
"""D5C2 challenge scoring functions

Based on TF_web.pl (Perl version) provided by Raquel Norel (Columbia
University/IBM), which is also used by the web server
http://www.ebi.ac.uk/saezrodriguez-srv/d5c2/cgi-bin/TF_web.pl

This implementation is independent of the web server.
"""
import os
from os.path import join as pj
import zipfile
from io import BytesIO
import collections
import numpy as np
import pandas as pd
import tempfile

from easydev import progress_bar
from dreamtools.core.ziptools import ZIP
from dreamtools.core.challenge import Challenge
from dreamtools.core.rocs import ROCDiscovery


class D5C2(Challenge):
    """A class dedicated to the D5C2 challenge

    ::

        from dreamtools import D5C2
        s = D5C2()

        # You can get a template from the www.synapse.org page (you need
        # to register)
        filename = s.download_template()
        s.score(filename)   # takes about 5 minutes

        s.get_table()
        s.plot()

    Data and templates are downloaded from Synapse. You must have a login.
    """
    def __init__(self, verbose=True, download=True, tmpdir=None, Ntf=66,
                 **kargs):
        """.. rubric:: constructor

        :param Ntf: not to be used. Used for fast testing and debugging
        :param tmpdir: a local temporary directory, if provided
        """
        super(D5C2, self).__init__('D5C2', verbose, download, **kargs)
        self.Ntf = Ntf
        self.tmpdir = tmpdir  # directory where to save the results
        self._dvs = {}
        self._dvps = {}
        self._probes = {}

    def score(self, prediction_file):
        """Compute all results and compare the prediction with official participants

        This scoring function can take a long time (about 5-10 minutes).
        """
        self.init()  # this creates the temporary directories
        self.prediction_file = prediction_file

        print('Loading user data set (step 1 out of 5)')
        self._loading_user_data()

        print('Loading the gold standard and user prediction. '
              'Takes a few seconds (step 2 out of 5)')
        self.download_all_data()

        print('\nProcessing\nSplitting data sets (step 3 out of 5)')
        self._split_data()

        print('\nComputing probes (step 4 out of 5)')
        self._preprocessing()

        print('\nComputing performances (step 5 out of 5)')
        self._processing()

        # the user's entry is stored at index 20 in the full table
        results = self.get_table().ix[20]
        return results

    def init(self):
        """Create the temporary directory and its sub-directories.

        The behaviour differs depending on whether a directory was provided
        in the constructor or not.
        """
        if self.tmpdir is None:
            self.tmpdir = tempfile.mkdtemp()
        else:
            try:
                os.mkdir(self.tmpdir)
            except OSError:
                # the directory already exists
                pass

        for directory in ['Data', 'Out']:
            this = pj(self.tmpdir, directory)
            if os.path.exists(this) is False:
                os.mkdir(this)

    def download_goldstandard(self):
        filename = self._download_data('DREAM5_GoldStandard_probes.zip',
                                       'syn2898469')
        print("Note: to get a valid gold standard,\n")
        print(" 1 - Unzip the file\n")
        print(" 2 - Extract the first, second and fourth columns\n")
        print(" 3 - Replace spaces by tabs\n")
        print(" 4 - Zip the file\n")
        return filename

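    # The manual steps printed above could be scripted along the following
    # lines. This is only an illustrative sketch (not part of the original
    # module); it assumes the archive member is 'Answers.txt', as read in
    # download_all_data(), and that the downloaded zip sits in the current
    # directory:
    #
    #   import zipfile
    #   from io import BytesIO
    #   import pandas as pd
    #
    #   with zipfile.ZipFile('DREAM5_GoldStandard_probes.zip') as z:
    #       raw = z.read('Answers.txt')
    #   df = pd.read_csv(BytesIO(raw), sep=r'\s+', engine='python')
    #   # keep the first, second and fourth columns, tab-separated
    #   df.iloc[:, [0, 1, 3]].to_csv('Answers.txt', sep='\t', index=False)
    #   with zipfile.ZipFile('gold_standard.zip', 'w') as z:
    #       z.write('Answers.txt')
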
    def download_all_data(self):
        """Download all large data sets from Synapse"""
        pb = progress_bar(5)

        # load the large gold standard file from the D5C2 synapse main page
        filename = self._download_data('DREAM5_GoldStandard_probes.zip',
                                       'syn2898469')
        pb.animate(1)
        z = ZIP()
        z.loadZIPFile(filename)
        data = z.read('Answers.txt')
        self.gs = pd.read_csv(BytesIO(data), sep='\t')

        # download 4 other filenames from the dreamtools synapse project
        self._download_data('all_8mers.txt', 'syn4483185')
        pb.animate(2)
        self._download_data('8mers_gs.txt', 'syn4483187')
        pb.animate(3)
        self._download_data('probe35_gs.txt', 'syn4483184')
        pb.animate(4)
        self._download_data('probes35.txt', 'syn4483183')
        pb.animate(5)

    def download_template(self):
        """Download a template from synapse into ~/config/dreamtools/dream5/D5C2

        :return: filename and its full path
        """
        filename = self._download_data('templates.txt.gz', 'syn4483192')
        return filename

    def _split_data(self, precision=6):
        """precision is used to get the same results as in the original perl script"""
        # FIXME::
        mask = self.gs.Flag == 0
        self.user_data_clean = self.user_data[mask].copy()
        print('Splitting the user data set and removing flagged data (%s out of %s)'
              % (self.gs.shape[0] - mask.sum(), self.gs.shape[0]))
        self.gs_clean = self.gs[mask].copy()

        # local aliases
        gs = self.gs_clean
        user_data = self.user_data_clean

        pb = progress_bar(self.Ntf, interval=1)
        for tf_index in range(1, self.Ntf + 1):
            this_tf = 'TF_%s' % tf_index
            tf_gs = gs.query("Id == @this_tf").Answer
            tf_user = user_data.query("TF_Id == @this_tf").Signal_Mean
            df = pd.concat([tf_gs, tf_user], axis=1)
            df.to_csv(self._setfile(tf_index, 'Data'), index=False, sep='\t',
                      header=False, float_format="%f")
            pb.animate(tf_index)

    def _loading_user_data(self):
        """Read the user prediction file and decompress it (gz or zip)."""
        # could be either gz or zip
        import mimetypes
        itemtype = mimetypes.guess_type(self.prediction_file)[1]
        if itemtype == 'gzip':
            import gzip
            fh = gzip.open(self.prediction_file, 'rb')
            data = fh.read()
            fh.close()
        elif itemtype == 'zip':
            z = zipfile.ZipFile(self.prediction_file)
            assert len(z.filelist) == 1, \
                "zipped archive should contain only 1 file"
            # extract as bytes
            data = z.read(z.filelist[0])
            self.prediction_file_unzipped = 'tmp_' + z.filelist[0].filename
        else:
            raise IOError("input file must be gzipped or zipped")
        df = pd.read_csv(BytesIO(data), sep='\t')  # engine='python'
        self.user_data = df

    def _preprocessing(self):
        """Create temporary data structures before further processing

        :return: nothing
        """
        # Read the octomers gold standard file
        filename = self.directory + os.sep + '8mers_gs.txt'
        self.octomers_gs = pd.read_csv(filename, sep='\t', header=None)

        # Read the octomers file (second column contains the reverse complement)
        filename = self.directory + os.sep + 'all_8mers.txt'
        self.octomers = pd.read_csv(filename, sep='\t', header=None)
        self.octomers.columns = ['octomer', 'octomerRC']

        # Read the probes gold standard
        filename = self.directory + os.sep + 'probe35_gs.txt'
        self.probes_gs = pd.read_csv(filename, header=None, sep='\t')
        self.probes_gs.columns = ['Id', 'Sequence']

        # read the probes (sequences)
        print('Reading probes')
        filename = self.directory + os.sep + 'probes35.txt'
        # just one column so no need for a separator
        probes = pd.read_csv(filename)

        # Extract information (first and third column of pred.txt)
        df = self.user_data[['TF_Id', 'Signal_Mean']].copy()
        df['Signal_Mean'] = df['Signal_Mean'].map(lambda x: round(x, 6))

        # data.txt is the paste of probes35.txt and val.txt
        data = pd.concat([probes, df], axis=1)

        # For each TF, create a dataframe that contains the sequence from the
        # GS and the answer from the user (stored in self._probes)
        print('Creating probes/TF_1.csv + sorting')
        pb = progress_bar(self.Ntf, interval=1)
        for i in range(1, self.Ntf + 1):  # could use a groupby here? faster maybe
            tag = 'TF_%s' % i
            sequence = data[['Sequence']].ix[self.gs.Id == tag]
            answer = data.Signal_Mean[data.TF_Id == tag]
            df = pd.concat([sequence, answer], axis=1)
            try:
                df.sort_values(by=['Signal_Mean', 'Sequence'],
                               ascending=[False, False], inplace=True)
            except AttributeError:
                # older pandas versions
                df.sort(columns=['Signal_Mean', 'Sequence'],
                        ascending=[False, False], inplace=True)
            df['Signal_Mean'] = df['Signal_Mean'].map(lambda x: round(x, 6))
            self._probes[i] = df
            pb.animate(i)

    def _setfile(self, index, directory):
        return self.tmpdir + os.sep + directory + os.sep \
            + 'TF_%s' % index + '.csv'

    def _processing(self):
        """Build the 8-mer score files and the discovery vectors."""
        # ------------------------------ 1. Create the Out/TF_XX.csv files
        octomers = self.octomers.octomer
        octomersRC = self.octomers.octomerRC
        mapping1 = dict([(k, v) for k, v in zip(octomers.values, octomersRC.values)])
        mapping2 = dict([(k, v) for k, v in zip(octomersRC.values, octomers.values)])
        keys = tuple(sorted(octomers.values))
        lm = set(octomers.values)

        pb = progress_bar(self.Ntf, interval=1)
        pb.animate(0)
        for tf_index in range(1, self.Ntf + 1):
            tf = self._probes[tf_index]
            tf.columns = ['Sequence', 'Score']
            ids = collections.defaultdict(list)
            # TODO: most of the time is spent in the "for curR in generator" loop
            for seq, score in zip(tf.Sequence, tf.Score):
                # scan the sequence by chunks of octomers using a generator
                # for speed (although the gain is small)
                generator = (seq[i:i+8] for i in range(0, 28))
                #for curR in generator:
                #    if curR not in mapping1.keys():
                #        curR = mapping2[curR]
                #    ids[curR].append(score)
                # Using a set does help speeding up the code
                for curR in generator:
                    if curR not in lm:
                        curR = mapping2[curR]
                    ids[curR].append(score)

            # now let us build the new dataframe for the indices found
            df = pd.DataFrame({0: [k for k in ids.keys()],
                               1: [np.median(v) for v in ids.values()]})
            try:
                df.sort_values(by=[1, 0], ascending=[False, False],
                               inplace=True)
            except AttributeError:
                # older pandas versions
                df.sort(columns=[1, 0], ascending=[False, False], inplace=True)
            df[1] = df[1].map(lambda x: round(x, 6))
            df.to_csv(self._setfile(tf_index, 'Out'), sep=' ', index=False,
                      header=None, float_format="%.6f")
            pb.animate(tf_index)

        # ------------------------------ 2. Create the DVP (probe discovery vectors)
        pb = progress_bar(self.Ntf, interval=1)
        for tf_index in range(1, self.Ntf + 1):
            tag = 'TF_%s' % tf_index
            tf_probes = list(self.probes_gs.ix[
                self.probes_gs.groupby('Id').groups[tag]].Sequence)
            tf = self._probes[tf_index]
            dv = tf.Sequence.apply(lambda x: x in tf_probes).astype(int)
            self._dvps[tf_index] = dv
            pb.animate(tf_index)
        print("")

        # ------------------------------ 3. Create the DV (8-mer discovery vectors)
        gs_octomers = self.octomers_gs.copy()
        gs_octomers.columns = ['id', 'octomer']
        pb = progress_bar(self.Ntf, interval=1)
        for tf_index in range(1, self.Ntf + 1):
            tag = 'TF_%s' % tf_index
            tf_octomers = list(gs_octomers.ix[
                gs_octomers.groupby('id').groups[tag]].octomer)
            tf = pd.read_csv(self._setfile(tf_index, "Out"), sep=" ",
                             header=None)
            tf.columns = ['Octomer', 'Score']
            dv = tf.Octomer.apply(lambda x: x in tf_octomers).astype(int)
            # store the discovery vector
            self._dvs[tf_index] = dv
            pb.animate(tf_index)

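    # The core of _processing above is the 8-mer scan: each 35-mer probe is
    # decomposed into its 28 overlapping 8-mers, each 8-mer is mapped to a
    # canonical form (falling back to its reverse complement, as mapping2
    # does above), and the probe score is appended to that 8-mer's list,
    # whose median is then reported. A minimal self-contained sketch of the
    # same idea, using toy data and 4-mers instead of 8-mers for brevity
    # (illustrative only, not part of the original module):
    #
    #   import collections
    #   import numpy as np
    #
    #   canonical = {'AAAA', 'AAAC'}                  # canonical k-mers
    #   revcomp = {'TTTT': 'AAAA', 'GTTT': 'AAAC'}    # reverse complement -> canonical
    #   scores = collections.defaultdict(list)
    #   for seq, score in [('AAAAC', 1.0), ('GTTTT', 0.5)]:
    #       for i in range(len(seq) - 3):             # overlapping 4-mers
    #           kmer = seq[i:i + 4]
    #           if kmer not in canonical:
    #               kmer = revcomp[kmer]              # same role as mapping2
    #           scores[kmer].append(score)
    #   medians = {k: np.median(v) for k, v in scores.items()}
    #   # medians == {'AAAA': 0.75, 'AAAC': 0.75}
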
    def compute_statistics(self):
        """Return the final results of the user prediction

        :return: a dataframe with various metrics for each transcription
            factor. You must call :meth:`score` beforehand.
        """
        data = {'Pearson': [], 'Spearman': [], 'Pearson_Log': [],
                "AUROC_8mer": [], "AUPR_8mer": [],
                "AUROC_probe": [], "AUPR_probe": []}

        pb = progress_bar(self.Ntf, interval=1)
        for tf_index in range(1, self.Ntf + 1):
            dfdata = pd.read_csv(self._setfile(tf_index, "Data"), sep='\t',
                                 header=None)
            pearson = dfdata.corr('pearson').ix[0, 1]
            spearman = dfdata.corr('spearman').ix[0, 1]
            pearsonLog = np.log10(dfdata).corr('pearson').ix[0, 1]
            data['Pearson'].append(pearson)
            data['Pearson_Log'].append(pearsonLog)
            data['Spearman'].append(spearman)

            # 8-mer discovery vector
            dvdata = self._dvs[tf_index]
            r = ROCDiscovery(dvdata.values)
            rocdata = r.get_statistics()
            auroc = r.compute_auc(roc=rocdata)
            aupr = r.compute_aupr(roc=rocdata)
            data['AUROC_8mer'].append(auroc)
            data['AUPR_8mer'].append(aupr)

            # probe discovery vector
            dvdata = self._dvps[tf_index]
            r = ROCDiscovery(dvdata.values)
            rocdata = r.get_statistics()
            auroc = r.compute_auc(roc=rocdata)
            aupr = r.compute_aupr(roc=rocdata)
            data['AUROC_probe'].append(auroc)
            data['AUPR_probe'].append(aupr)
            pb.animate(tf_index)

        df = pd.DataFrame(data)
        df = df[['Pearson', u'Spearman', u'Pearson_Log', u'AUROC_8mer',
                 u'AUPR_8mer', u'AUROC_probe', u'AUPR_probe']]
        return df

    def get_table(self):
        """Return a table with the results of the user and of the participants

        There are 14 participants, as in the Leaderboard found here
        https://www.synapse.org/#!Synapse:syn2887863/wiki/72188

        :return: a dataframe with different metrics showing the performance
            of the submission with respect to the other participants.

        ::

            table = s.get_table()
            with open('test.html', 'w') as fh:
                fh.write(table.to_html(index=False))

        """
        userdf = self.compute_statistics()
        userdf = userdf.mean().to_frame().T
        # there are 20 participants in the data file; the user is added as
        # the 21st entry, with index 20
        userdf.index = [20]
        userdf = userdf[['Pearson', 'Pearson_Log', 'Spearman', 'AUROC_8mer',
                         'AUPR_8mer']]
        userdf['Team'] = 'Your team'
        userdf['Model type'] = 'Your model'

        filename = os.sep.join([self.classpath, "data", "d5c2_data_table.csv"])
        participants = pd.read_csv(filename, sep='\t', index_col=0)

        table = pd.concat([participants, userdf])

        # compute ranks based on those columns. Using method 'first' to
        # be in agreement with the server
        rank_columns = ['Pearson', 'Pearson_Log', 'Spearman', 'AUPR_8mer',
                        'AUROC_8mer']
        ranks = table[rank_columns]
        mean_ranks = ranks.rank(ascending=False, method='first').mean(axis=1)
        table['Final Rank (average)'] = mean_ranks.values
        table['Final Rank'] = table['Final Rank (average)'].rank(method='first')

        table = table[['Team', 'Model type', 'Final Rank',
                       'Final Rank (average)'] + rank_columns]
        try:
            table = table.sort_values(by=['Final Rank'])
        except AttributeError:
            # older pandas versions
            table = table.sort(columns=['Final Rank'])
        return table

    def _get_table(self):
        userdf = self.compute_statistics()
        userdf = userdf.mean().to_frame().T
        # there are 20 participants; the user is added as the 21st (index 20)
        userdf.index = [20]

        # load all data from participants for comparison
        filename = os.sep.join([self.classpath, "data", "d5c2_data_plot.csv"])
        participants = pd.read_csv(filename, sep='\t')

        df = pd.concat([participants, userdf])
        return (df, userdf)

    def plot(self, fontsize=16):
        """Show the user prediction compared to the 20 other participants"""
        df, userdf = self._get_table()
        import pylab
        pylab.clf()

        pylab.subplot(2, 2, 1)
        pylab.plot(df.AUROC_8mer, df.AUPR_8mer, marker='+', color='k',
                   lw=0, markersize=10)
        pylab.plot(userdf.AUROC_8mer, userdf.AUPR_8mer, marker='s', color='b',
                   lw=0, markersize=10)
        pylab.xlim([0, 1])
        pylab.ylim([0, 1])
        pylab.xlabel('AUROC octamers', fontsize=fontsize)
        pylab.ylabel('AUPR octamers', fontsize=fontsize)
        pylab.grid()

        pylab.subplot(2, 2, 2)
        pylab.plot(df.AUROC_probe, df.AUPR_probe, marker='+', color='k',
                   lw=0, markersize=10)
        pylab.plot(userdf.AUROC_probe, userdf.AUPR_probe, marker='s',
                   color='b', lw=0, markersize=10)
        pylab.xlim([0, 1])
        pylab.ylim([0, 1])
        pylab.xlabel('AUROC probes', fontsize=fontsize)
        pylab.ylabel('AUPR probes', fontsize=fontsize)
        pylab.grid()

        pylab.subplot(2, 2, 3)
        pylab.plot(df.Pearson, df.Spearman, marker='+', color='k',
                   lw=0, markersize=10)
        pylab.plot(userdf.Pearson, userdf.Spearman, marker='s', color='b',
                   lw=0, markersize=10)
        pylab.xlim([0, 1])
        pylab.ylim([0, 1])
        pylab.xlabel('Pearson probes', fontsize=fontsize)
        pylab.ylabel('Spearman probes', fontsize=fontsize)
        pylab.grid()

        pylab.subplot(2, 2, 4)
        pylab.plot(df.Pearson, df.Pearson_Log, marker='+', color='k',
                   lw=0, markersize=10)
        pylab.plot(userdf.Pearson, userdf.Pearson_Log, marker='s', color='b',
                   lw=0, markersize=10)
        pylab.xlim([0, 1])
        pylab.ylim([0, 1])
        pylab.xlabel('Pearson probes', fontsize=fontsize)
        pylab.ylabel('Log Pearson probes', fontsize=fontsize)
        pylab.grid()

    def cleanup(self):
        """Remove the temporary directory"""
        import shutil
        shutil.rmtree(self.tmpdir)
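

# A minimal end-to-end usage sketch (not part of the original module). It
# follows the example given in the class docstring and assumes a Synapse
# login is configured so that the data and the template can be downloaded.
if __name__ == "__main__":
    challenge = D5C2()
    template = challenge.download_template()
    results = challenge.score(template)   # takes about 5-10 minutes
    print(results)
    print(challenge.get_table())
    challenge.plot()
    challenge.cleanup()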