Source code for dreamtools.dream6.D6C4.scoring

"""


Original scoring function: Kelly Norel
"""
import pandas as pd

from dreamtools.core.challenge import Challenge


__all__ = ['D6C4']


class D6C4(Challenge):
    """A class dedicated to the D6C4 challenge::

        from dreamtools import D6C4
        s = D6C4()
        filename = s.download_template()
        s.score(filename)

    Data and templates are downloaded from Synapse. You must have a login.
    """
    def __init__(self, verbose=True, download=True, **kargs):
        """.. rubric:: constructor"""
        super(D6C4, self).__init__('D6C4', verbose, download, **kargs)
        self._init()
        self.sub_challenges = []
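    # A minimal usage sketch (assuming Synapse credentials are configured;
    # score() returns a dict with the keys computed below):
    #
    #     s = D6C4()
    #     filename = s.download_template()
    #     results = s.score(filename)
    #     print(results['precision'], results['MCC'])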
    def download_template(self):
        # should return the full path to a template file
        filename = self.getpath_template('D6C4_template.txt')
        return filename
    def download_goldstandard(self):
        # should return the full path to a gold-standard file
        filename = self.getpath_gs('D6C4_goldstandard.txt')
        return filename
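    # Sketch of a submission file compatible with _read_submission() below:
    # tab-separated, no header, one row per sample with its SampleNumber and
    # a prediction score. The sample numbers and scores here are hypothetical;
    # the file returned by download_template() is the authoritative format.
    #
    #     sub = pd.DataFrame({'SampleNumber': [180, 181, 182],
    #                         'Label': [0.9, 0.1, 0.4]})
    #     sub.to_csv('my_submission.txt', sep='\t', index=False, header=False,
    #                columns=['SampleNumber', 'Label'])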
    def _init(self):
        if self._standalone is True:
            return

        # Read the test file with missing values
        df = pd.read_csv(self._pj([self.classpath, 'data', 'AMLTraining.csv']),
                         index_col=0)
        df.fillna('none', inplace=True)
        testset = df.SampleNumber[df.Label == 'none'].unique()
        testsetIndex = [df.SampleNumber[df.SampleNumber == x].index[0]
                        for x in testset]

        # Read the test set with all values; let us set the index with the
        # first column (FCSFileName)
        filename = self._pj([self.classpath, 'goldstandard', 'AML.csv'])
        gs = pd.read_csv(filename, index_col=0, sep=',')

        # let us keep only the relevant data (test set)
        gs = gs.loc[testsetIndex]
        assert sum(gs.Label == 'aml') == 20

        # we can drop the tube number
        gs = gs[['SampleNumber', 'Label']]
        gs = gs.replace('normal', 0)
        gs = gs.replace('aml', 1)
        self.gs = gs

    def _read_submission(self, filename):
        sub = pd.read_csv(filename, sep='\t', header=None,
                          names=['SampleNumber', 'Label'])
        return sub
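    # Illustrative sketch of the preprocessing done in _init() above, on a
    # toy frame (the values are hypothetical; the real AMLTraining.csv has
    # one row per FCS file):
    #
    #     df = pd.DataFrame({'SampleNumber': [1, 1, 2],
    #                        'Label': ['aml', 'aml', None]})
    #     df.fillna('none', inplace=True)
    #     df.SampleNumber[df.Label == 'none'].unique()   # -> array([2])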
    def score(self, filename):
        sub = self._read_submission(filename)
        df = pd.merge(self.gs, sub, on='SampleNumber')
        df.drop(['SampleNumber'], axis=1, inplace=True)

        # Label_x is the class vector; Label_y is the score vector
        self.df = df

        results = {}
        results['pearson'] = df.corr()['Label_x']['Label_y']

        # The Precision of the predictions, defined as the fraction of
        # correct AML patients amongst the first 20 predictions.
        try:
            prec = df.sort_values(by='Label_y', ascending=False)['Label_x'][0:20]
        except AttributeError:
            # fallback for old pandas versions, where sort_values was sort
            prec = df.sort('Label_y', ascending=False)['Label_x'][0:20]
        results['precision'] = sum(prec) / 20.

        # The Recall of the predictions, defined as the proportion of AML
        # patients in the first 20 predictions out of all the AML patients
        # in the cohort.
        # TODO: seems to be identical to the precision, at least on the
        # official LB
        # https://www.synapse.org/#!Synapse:syn2887788/wiki/72181
        results['recall'] = sum(prec) / 20.

        # MCC: The Matthews Correlation Coefficient is a measure of the
        # quality of binary classifications. It takes into account true and
        # false positives and negatives and is generally regarded as a
        # balanced measure, which can be used even if the classes are of
        # very different sizes. For its mathematical definition see
        # http://en.wikipedia.org/wiki/Matthews_correlation_coefficient
        from dreamtools.core.rocs import MCC
        N = 180
        T = 20
        TP = sum(prec)
        FP = T - TP
        FN = FP  # symmetric
        TN = N - T - FP
        results['MCC'] = MCC(TP, TN, FP, FN)

        # Finally, the JSC (Jaccard similarity)
        try:
            from sklearn.metrics import jaccard_similarity_score as JSC
            try:
                true = df.sort_values(by='Label_y', ascending=False)['Label_x'].values
                pred = df.sort_values(by='Label_y', ascending=False)['Label_y'].values
            except AttributeError:
                true = df.sort('Label_y', ascending=False)['Label_x'].values
                pred = df.sort('Label_y', ascending=False)['Label_y'].values
            # TODO: this is not exactly the same as in the LB
            results['JSC'] = JSC(true[0:20], pred.round()[0:20])
        except ImportError:
            print('Install scikit-learn for the Jaccard similarity')

        return results
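# For reference, the MCC used in score() follows the standard definition
#
#     MCC = (TP*TN - FP*FN) / sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
#
# A minimal standalone sketch of that formula (an illustration only, not the
# actual dreamtools.core.rocs implementation) could be:
#
#     import math
#     def mcc(tp, tn, fp, fn):
#         den = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
#         return (tp * tn - fp * fn) / den if den else 0.0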