# Source code for dreamtools.dream6.D6C3.scoring

"""D6C3 scoring function

Based on Pablo Meyer's Matlab code.
"""
import os
from dreamtools.core.challenge import Challenge
import pandas as pd
import numpy as np
import glob

class D6C3(Challenge):
    """A class dedicated to the D6C3 challenge.

    ::

        from dreamtools import D6C3
        s = D6C3()
        filename = s.download_template()
        s.score(filename)

    The Pearson coefficient is an absolute score, but the other scores
    (chi-square and the rank-based R-square) are computed relative to the
    21 participants' predictions.

    Pearson and Spearman give the same values as in the final leaderboard,
    but X2 and R2 are slightly different. The results here match the
    original Matlab scripts, so the difference with the leaderboard is
    probably coming from a different set of prediction files; the set used
    here is stored in ./data/predictions and was found in
    http://genome.cshlp.org/content/23/11/1928/suppl/DC1

    The final score in the official leaderboard computed the p-values for
    each score (chi-square, r-square, spearman and pearson correlation
    coefficient) and took -0.25 log(product of p-values) as the final
    score.
    """
    def __init__(self, verbose=True, download=True, **kargs):
        """.. rubric:: constructor"""
        super(D6C3, self).__init__('D6C3', verbose, download, **kargs)
        # D6C3 has no sub-challenges
        self.sub_challenges = []
[docs] def score(self, filename): self.read_all_participants() gs_filename = self.download_goldstandard() self.goldstandard = pd.read_csv(gs_filename, sep='\t', header=None) self.goldstandard.set_index(0, inplace=True) self.prediction = pd.read_csv(filename, sep='\t', header=None) self.prediction.set_index(0, inplace=True) # some aliaes pred = self.prediction gs = self.goldstandard Spred = self.prediction.sum() Sgs = self.goldstandard.sum() # Compute the Pearson coeff d1 = ((pred - Spred/53.)**2).sum() # variance participant d2 = ((gs - Sgs/53.)**2).sum() # variance GS Cp = (pred * gs).sum() - Spred * Sgs / 53. Cp /= np.sqrt(d1*d2) Cp = Cp.values[0] # using Scipy gives also the p-values # scipy.stats.pearsonr(s.prediction, s.goldstandard) import scipy.stats Sp = scipy.stats.spearmanr(pred, gs)[0] # could replace code above to compute pearson by this code: #Cp2 = scipy.stats.pearsonr(pred,gs)[0][0] # The ChiSqaure # Same results as Kelly's score but different from LB on synapse chi = 0 for j in range(0,53): num = (pred.ix[j] - gs.ix[j])**2 denom = np.mean((self.alldata.ix[j] - gs.ix[j].values)**2) chi += num / denom tiedrank = self.goldstandard.rank() # if tied, average ranks # R-square r2 = 0 pt = 21 allranks = self.alldata.rank() for i in range(0, 53): indx = pred.rank() rn1 = indx.ix[i] - tiedrank.ix[i] rs2 = 0 for j in range(0, pt): rd1 = allranks.ix[i,j] - tiedrank.ix[i] rs2 = rs2 + rd1*rd1 rs2 = rs2 / float(pt) r2 = r2 + (rn1*rn1) / rs2 results = pd.Series() results['chi2'] = chi.values[0] results['R-square'] = r2.values[0] results['Spearman(Sp)'] = Sp results['Pearson(Cp)'] = Cp return {'results':results}
[docs] def read_all_participants(self): path = self._pj([self.classpath, 'data', 'predictions']) filenames = glob.glob(path + os.sep + '*txt') assert len(filenames) == 21 data = [pd.read_csv(filenames[i], sep='\t', header=None)[1] for i in range(0,21)] self.alldata = pd.DataFrame(data).T self.alldata.columns = range(0,21)
[docs] def download_template(self): return self.getpath_template('D6C3_template.txt')
[docs] def download_goldstandard(self): return self.getpath_gs('D6C3_goldstandard.txt')