From 6cac6ebd5fd870257dd7f77b73af94d5a2b2a6d9 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Sun, 21 May 2017 21:16:30 -0400 Subject: [PATCH] updates --- .../class1_affinity_predictor.py | 96 ++++++++++--------- .../scoring.py | 2 +- test/test_class1_binding_predictor_A0205.py | 64 +++++++++---- test/test_ensemble.py | 35 ++++--- 4 files changed, 110 insertions(+), 87 deletions(-) rename mhcflurry/{class1_affinity_prediction => }/scoring.py (95%) diff --git a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py index 223ff08a..42305241 100644 --- a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py @@ -3,6 +3,7 @@ import time import hashlib import json from os.path import join, exists +from six import string_types import numpy import pandas @@ -80,7 +81,7 @@ class Class1AffinityPredictor(object): def weights_path(models_dir, model_name): return join( models_dir, - "%s.%s" % ( + "weights_%s.%s" % ( model_name, Class1NeuralNetwork.weights_filename_extension)) @@ -228,66 +229,67 @@ class Class1AffinityPredictor(object): verbose=verbose) yield model - def predict( + def predict(self, peptides, alleles=None, allele=None): + df = self.predict_to_dataframe( + peptides=peptides, + alleles=alleles, + allele=allele + ) + return df.prediction.values + + def predict_to_dataframe( self, peptides, - alleles, - include_mean=True, - include_peptides_and_alleles=True): - input_df = pandas.DataFrame({ + alleles=None, + allele=None, + include_individual_model_predictions=False): + if isinstance(peptides, string_types): + raise TypeError("peptides must be a list or array, not a string") + if isinstance(alleles, string_types): + raise TypeError("alleles must be a list or array, not a string") + if allele is not None: + if alleles is not None: + raise ValueError("Specify exactly one of allele or alleles") + alleles = [allele] * len(peptides) + + df = pandas.DataFrame({ 'peptide': peptides, 'allele': alleles, }) - input_df["allele"] = input_df.allele.map( + df["normalized_allele"] = input_df.allele.map( mhcnames.normalize_allele_name) - result_dataframes = [] - if self.class1_pan_allele_models: - allele_pseudosequences = input_df.allele.map( + allele_pseudosequences = df.normalized_allele.map( self.allele_to_pseudosequence) encodable_peptides = EncodableSequences.create( - input_df.peptide.values) - for model in self.class1_pan_allele_models: - result_df = pandas.DataFrame( - model.predict( - encodable_peptides, - allele_pseudosequences=allele_pseudosequences)) - result_dataframes.append(result_df) + df.peptide.values) + for (i, model) in enumerate(self.class1_pan_allele_models): + df["model_pan_%d" % i] = model.predict( + encodable_peptides, + allele_pseudosequences=allele_pseudosequences) if self.allele_to_allele_specific_models: - for allele in input_df.allele.unique(): - mask = (input_df.allele == allele).values + for allele in df.normalized_allele.unique(): + mask = (df.normalized_allele == allele).values allele_peptides = EncodableSequences.create( - input_df.ix[mask].peptide.values) + df.ix[mask].peptide.values) models = self.allele_to_allele_specific_models.get(allele, []) - for model in models: - result_df = pandas.DataFrame( - model.predict(allele_peptides), - index=input_df.index[mask].values) - result_dataframes.append(result_df) - - model_predictions = pandas.Panel( - dict(enumerate(result_dataframes)), - major_axis=input_df.index) + for (i, model) in enumerate(models): + df.loc[mask, "model_single_%d" % i] = model.predict( + allele_peptides) # Geometric mean - log_means = numpy.log(model_predictions).mean(0) - first_columns = [] - if include_mean: - log_means["mean"] = log_means.mean(1) - first_columns.append("mean") - - result = numpy.exp(log_means) - - if include_peptides_and_alleles: - result["peptide"] = input_df.peptide.values - result["allele"] = input_df.allele.values - first_columns.append("allele") - first_columns.append("peptide") - - assert len(result) == len(peptides), result.shape - return result[ - list(reversed(first_columns)) + - [c for c in result.columns if c not in first_columns] + df_predictions = df[ + [c for c in df.columns if c.startswith("model_")] ] + log_means = numpy.log(df_predictions).mean(1) + df["prediction"] = numpy.exp(log_means) + df["prediction_low"] = numpy.exp(log_means.quantile(q=.05, axis=1)) + df["prediction_high"] = numpy.exp(log_means.quantile(q=.05, axis=1)) + + if include_individual_model_predictions: + return df + return df[ + [c for c in df.columns if c not in df_predictions.columns] + ] \ No newline at end of file diff --git a/mhcflurry/class1_affinity_prediction/scoring.py b/mhcflurry/scoring.py similarity index 95% rename from mhcflurry/class1_affinity_prediction/scoring.py rename to mhcflurry/scoring.py index 12cccc7f..ad904689 100644 --- a/mhcflurry/class1_affinity_prediction/scoring.py +++ b/mhcflurry/scoring.py @@ -8,7 +8,7 @@ import sklearn import numpy import scipy -from ..regression_target import ic50_to_regression_target +from mhcflurry.regression_target import ic50_to_regression_target def make_scores( diff --git a/test/test_class1_binding_predictor_A0205.py b/test/test_class1_binding_predictor_A0205.py index 9010e99a..fc486469 100644 --- a/test/test_class1_binding_predictor_A0205.py +++ b/test/test_class1_binding_predictor_A0205.py @@ -2,35 +2,39 @@ import numpy import pandas numpy.random.seed(0) -from mhcflurry import Class1NeuralNetwork +from mhcflurry import Class1NeuralNetwork, Class1AffinityPredictor from nose.tools import eq_ from numpy import testing from mhcflurry.downloads import get_path +allele = "HLA-A*02:05" -def test_class1_binding_predictor_A0205_training_accuracy(): - df = pandas.read_csv( +df = pandas.read_csv( get_path( "data_curated", "curated_training_data.csv.bz2")) - df = df.ix[df.allele == "HLA-A*02:05"] - df = df.ix[ - df.peptide.str.len() == 9 - ] - df = df.ix[ - df.measurement_type == "quantitative" - ] - df = df.ix[ - df.measurement_source == "kim2014" - ] - - predictor = Class1NeuralNetwork( - activation="tanh", - layer_sizes=[64], - max_epochs=1000, # Memorize the dataset. - early_stopping=False, - dropout_probability=0.0) +df = df.ix[df.allele == allele] +df = df.ix[ + df.peptide.str.len() == 9 +] +df = df.ix[ + df.measurement_type == "quantitative" +] +df = df.ix[ + df.measurement_source == "kim2014" +] + +hyperparameters = dict( + activation="tanh", + layer_sizes=[64], + max_epochs=1000, # Memorize the dataset. + early_stopping=False, + dropout_probability=0.0) + + +def test_class1_neural_network_A0205_training_accuracy(): + predictor = Class1NeuralNetwork(**hyperparameters) predictor.fit(df.peptide.values, df.measurement_value.values) ic50_pred = predictor.predict(df.peptide.values) ic50_true = df.measurement_value.values @@ -40,3 +44,23 @@ def test_class1_binding_predictor_A0205_training_accuracy(): numpy.log(ic50_true), rtol=0.2, atol=0.2) + + +def test_class1_neural_network_A0205_training_accuracy(): + predictor = Class1AffinityPredictor() + predictor.fit_allele_specific_predictors( + n_models=1, + architecture_hyperparameters=hyperparameters, + allele=allele, + peptides=df.peptide.values, + affinities=df.measurement_value.values, + ) + ic50_pred = predictor.predict(df.peptide.values, allele=allele) + ic50_true = df.measurement_value.values + eq_(len(ic50_pred), len(ic50_true)) + testing.assert_allclose( + numpy.log(ic50_pred), + numpy.log(ic50_true), + rtol=0.2, + atol=0.2) + diff --git a/test/test_ensemble.py b/test/test_ensemble.py index 210989e3..127af7f1 100644 --- a/test/test_ensemble.py +++ b/test/test_ensemble.py @@ -1,31 +1,28 @@ -import tempfile -import shutil -import os -import time import cProfile - import json -from os.path import join +import os +import shutil +import tempfile +import time from os import mkdir +from os.path import join -from numpy.testing import assert_allclose, assert_equal import numpy -from nose.tools import eq_ - -from . import make_random_peptides - -from mhcflurry.class1_affinity_prediction import scoring -from mhcflurry.measurement_collection import MeasurementCollection -from mhcflurry.class1_allele_specific_ensemble import train_command +from mhcflurry import scoring from mhcflurry.affinity_measurement_dataset import AffinityMeasurementDataset -from mhcflurry.downloads import get_path -from mhcflurry.amino_acid import common_amino_acid_letters +from mhcflurry.class1_allele_specific_ensemble import train_command from mhcflurry \ .class1_allele_specific_ensemble \ .class1_ensemble_multi_allele_predictor import ( - Class1EnsembleMultiAllelePredictor, - get_downloaded_predictor, - HYPERPARAMETER_DEFAULTS) + Class1EnsembleMultiAllelePredictor, + get_downloaded_predictor, + HYPERPARAMETER_DEFAULTS) +from mhcflurry.downloads import get_path +from mhcflurry.measurement_collection import MeasurementCollection +from nose.tools import eq_ +from numpy.testing import assert_allclose, assert_equal + +from . import make_random_peptides def test_single_allele(): -- GitLab