# Source code for dreamtools.dream9.D9C1.scoring


"""

Based on https://github.com/Sage-Bionetworks/DREAM9_Broad_Challenge_Scoring/
and instructions and communications from Mehmet Gonen.

Original code in R. Translated to Python by Thomas Cokelaer

"""
import os
from io import BytesIO

from dreamtools.core.challenge import Challenge
from dreamtools.core.ziptools import ZIP

import pandas as pd


class D9C1(Challenge):
    """A class dedicated to D9C1 challenge

    ::

        from dreamtools import D9C1
        s = D9C1()
        filename = s.download_template('sc1')
        s.score(filename, 'sc1')

    For consistency, all gene essentiality and genomic data files will
    be given in the same gct file format. Briefly, this means:

    The first and second lines contains the version string and numbers
    indicating the size of the data table that is contained in the
    remainder of the file::

        #1.2
        (# of data rows) (tab) (# of data columns)

    The third line contains a list of identifiers for the samples
    associated with each of the columns in the remainder of the file::

        Name (tab) Description (tab) (sample 1 name) (tab) (sample 2 name) (tab) ... (sample N name)

    And the remainder of the data file contains data for each of the genes.
    There is one line for each gene and one column for each of the samples.
    The first two fields in the line contain name and descriptions for the
    genes (names and descriptions can contain spaces since fields are
    separated by tabs). The number of lines should agree with the number
    of data rows specified on line 2::

        (gene name) (tab) (gene description) (tab) (col 1 data) (tab) (col 2 data) (tab) ... (col N data)

    """

    # Shape (rows, columns) that an SC2/SC3 prediction table must have.
    _SC2_SC3_SHAPE = (2647, 44)

    def __init__(self, verbose=True, download=True, **kargs):
        """.. rubric:: constructor

        :param verbose: forwarded to the :class:`Challenge` constructor.
        :param download: forwarded to the :class:`Challenge` constructor.
        :param kargs: extra keyword arguments forwarded to
            :class:`Challenge`.
        """
        super(D9C1, self).__init__('D9C1', verbose, download, **kargs)
        self._init()
        self.sub_challenges = ['sc1', 'sc3', 'sc2']

    def _init(self):
        """Fetch the challenge data and load the gold standards.

        Does nothing when ``self._standalone`` is True. Otherwise sets
        ``self.goldstandard`` (gene x sample table) and
        ``self.gs_priority`` (table read from the SC2 gold standard file).
        """
        if self._standalone is True:
            return

        # should download files from synapse if required.
        self._download_data('D9C1_goldstandard.gct.zip', 'syn4595275')

        # now unzip and read the gs on the go
        archive = ZIP()
        archive.loadZIPFile(self.get_pathname('D9C1_goldstandard.gct.zip'))
        raw = archive.read('D9C1_goldstandard.gct')
        self.goldstandard = self._parse_gct(BytesIO(raw))

        # get template for SC1, SC2, SC3
        self._download_data('D9C1_template_sc1.gct.zip', 'syn4595283')
        self.unzip('D9C1_template_sc1.gct.zip')
        self._download_data('D9C1_template_sc2.zip', 'syn4595587')
        self._download_data('D9C1_template_sc3.zip', 'syn4595588')

        # download gold standard for sc2
        gs_sc2_path = self.getpath_gs('D9C1_goldstandard_sc2.txt')
        self.gs_priority = pd.read_csv(gs_sc2_path, sep='\t', header=None)

    @staticmethod
    def _as_readable(source):
        """Return something :func:`pandas.read_csv` can read from *source*.

        :param source: either a path to an existing file or the raw
            bytes of a file (e.g. as read from a zip archive).
        :return: *source* itself if it is an existing path, otherwise a
            :class:`io.BytesIO` wrapping the bytes.
        :raises IOError: if *source* is neither an existing path nor bytes.
        """
        if os.path.exists(source):
            return source
        if isinstance(source, (bytes, bytearray)):
            return BytesIO(source)
        raise IOError("File not found: %s" % source)

    @staticmethod
    def _parse_gct(readable):
        """Parse a GCT table into a DataFrame indexed by gene name.

        The two GCT header lines are skipped, the ``Description`` column
        is dropped, ``Name`` becomes the index and the remaining column
        names are stripped of surrounding whitespace.

        :param readable: path or file-like object holding the GCT content.
        :return: :class:`pandas.DataFrame`.
        """
        # NOTE(review): fields are split on spaces as well as tabs, although
        # the class docstring says names/descriptions may contain spaces.
        # Kept as-is so existing files parse exactly as before.
        gct = pd.read_csv(readable, sep='[ \t]', skiprows=2, engine='python')
        gct = gct.drop(['Description'], axis=1).set_index('Name')
        gct.columns = [name.strip() for name in gct.columns]
        return gct

    @staticmethod
    def _rowwise_spearman(left, right, nrows):
        """Spearman correlation between same-position rows of two tables.

        Rows are paired by position (first *nrows* rows of each table),
        not by index label.

        :return: list of *nrows* correlation values.
        """
        return [left.iloc[i].corr(right.iloc[i], method='spearman')
                for i in range(nrows)]

    def _read_gct(self, filename):
        """Read a GCT prediction given as a file path or as raw bytes.

        :raises IOError: if *filename* is a path that does not exist.
        """
        return self._parse_gct(self._as_readable(filename))

    def _read_feature(self, filename):
        """Read a header-less, tab-separated feature file.

        :param filename: path to an existing file, or raw file bytes.
        :raises IOError: if *filename* is a path that does not exist.
        """
        return pd.read_csv(self._as_readable(filename), sep='\t', header=None)

    def score(self, filename, subname=None):
        """Score a submission for one sub-challenge.

        :param filename: submission file (a GCT file for ``sc1``; a zip
            archive for ``sc2`` and ``sc3``).
        :param subname: one of ``'sc1'``, ``'sc2'``, ``'sc3'``; checked
            with ``self._check_subname``.
        :return: dictionary ``{'score': value}``; None if *subname* is not
            one of the three names above and was not rejected by
            ``_check_subname``.
        """
        self._check_subname(subname)
        if subname == 'sc1':
            return self._score_sc1(filename)
        elif subname in ('sc2', 'sc3'):
            # SC2 and SC3 share exactly the same scoring function.
            return self._score_sc2_sc3(filename)

    def _score_sc1(self, filename):
        """Mean row-wise Spearman correlation against the gold standard.

        The prediction must have the same shape and column names as
        ``self.goldstandard``. The prediction is stored in
        ``self.prediction``.

        :return: dictionary ``{'score': mean correlation}``.
        """
        self.prediction = self._read_gct(filename)

        # two aliases
        gold = self.goldstandard
        pred = self.prediction

        # Shape first: comparing column Index objects of different lengths
        # would raise inside pandas instead of giving a clear message.
        assert gold.shape == pred.shape, \
            "prediction and gold standard must have the same shape"
        assert list(gold.columns) == list(pred.columns), \
            "prediction and gold standard must have the same columns"

        # a bit slow (10 seconds)
        scores = self._rowwise_spearman(gold, pred, len(gold))
        final_score = sum(scores) / float(len(scores))
        return {'score': final_score}

    def _score_sc2_sc3(self, filename):
        """Score an SC2/SC3 submission (zip with a .gct and a .txt file).

        Stores ``self.prediction``, ``self.feature``, ``self.df1`` (gold
        standard rows selected through ``self.gs_priority``) and
        ``self.scores`` (per-row correlations).

        :param filename: path to the zip archive.
        :return: dictionary ``{'score': mean correlation}``.
        :raises ValueError: if the archive does not hold one ``gct`` and
            one ``txt`` file.
        """
        # looks like exactly same function in sc2/sc3
        # feature file is not used either in original code ?!
        bad_content = ("there should be only 2 files. \n" +
                       "One ending with gct extension (prediction)\n" +
                       "One ending with txt extension (feature)")

        # this should be a zip file with 2 files.
        archive = ZIP()
        archive.loadZIPFile(filename)

        # there should be 2 files, one ending in gct one in txt
        assert len(archive.filenames) == 2, \
            "There should be 2 files in the zip archive"

        prediction = None
        feature = None
        for member in archive.filenames:
            if member.endswith('gct'):
                prediction = self._read_gct(archive.read(member))
            elif member.endswith('txt'):
                feature = self._read_feature(archive.read(member))
                # first column should be the names
                feature.set_index(0, inplace=True)
            else:
                raise ValueError(bad_content)

        # Two files of the same kind would leave one of these unset.
        if prediction is None or feature is None:
            raise ValueError(bad_content)

        #assert feature.shape == (2647,10)
        assert prediction.shape == self._SC2_SC3_SHAPE, \
            "prediction must have shape %s" % (self._SC2_SC3_SHAPE,)

        self.prediction = prediction
        self.feature = feature

        # in SC2, only a subset of predictive features (2647 out of 17.000) are used
        # Label-based selection of the gold standard rows by gene name.
        gold_subset = self.goldstandard.loc[self.gs_priority[0]]
        self.df1 = gold_subset

        scores = self._rowwise_spearman(self.prediction, gold_subset,
                                        len(gold_subset))
        self.scores = scores
        final_score = sum(scores) / float(len(scores))
        return {'score': final_score}

    def download_template(self, subname=None):
        """Return the full path to the template file of a sub-challenge.

        :param subname: one of ``'sc1'``, ``'sc2'``, ``'sc3'``; checked
            with ``self._check_subname``.
        :return: path to the template; None if *subname* is not one of the
            three names above and was not rejected by ``_check_subname``.
        """
        self._check_subname(subname)
        if subname == 'sc1':
            return self.get_pathname('D9C1_template_sc1.gct')
        elif subname == 'sc2':
            return self.get_pathname('D9C1_template_sc2.zip')
        elif subname == 'sc3':
            return self.get_pathname('D9C1_template_sc3.zip')

    def download_goldstandard(self, subname=None):
        """Return the full path to the zipped gold standard GCT file.

        :param subname: accepted for interface consistency; not used.
        """
        return self.get_pathname('D9C1_goldstandard.gct.zip')