diff --git a/downloads-generation/data_curated/curate.py b/downloads-generation/data_curated/curate.py index 49eb75c50cd1827c40ad49c2d38d960101c6c43c..2650a5bb67c4d19d11d506318c2afe1115f7fc27 100755 --- a/downloads-generation/data_curated/curate.py +++ b/downloads-generation/data_curated/curate.py @@ -4,9 +4,6 @@ Train single allele models """ import sys import argparse -import json -import os -import pickle import pandas @@ -37,13 +34,20 @@ parser.add_argument( required=True, help="Result file") -QUALITATIVE_TO_AFFINITY = { - "Negative": 50000.0, - "Positive": 100.0, - "Positive-High": 50.0, - "Positive-Intermediate": 500.0, - "Positive-Low": 5000.0, +QUALITATIVE_TO_AFFINITY_AND_INEQUALITY = { + "Negative": (20000.0, ">"), + "Positive": (500.0, "<"), + "Positive-High": (100.0, "<"), + "Positive-Intermediate": (1000.0, "<"), + "Positive-Low": (5000.0, "<"), } +QUALITATIVE_TO_AFFINITY = dict( + (key, value[0]) for (key, value) + in QUALITATIVE_TO_AFFINITY_AND_INEQUALITY.items()) +QUALITATIVE_TO_INEQUALITY = dict( + (key, value[1]) for (key, value) + in QUALITATIVE_TO_AFFINITY_AND_INEQUALITY.items()) + EXCLUDE_IEDB_ALLELES = [ "HLA class I", @@ -60,6 +64,7 @@ def load_data_kim2014(filename): True: "quantitative", False: "qualitative", }) + df["measurement_inequality"] = df.inequality df["original_allele"] = df.mhc df["peptide"] = df.sequence df["allele"] = df.mhc.map(normalize_allele_name) @@ -99,24 +104,28 @@ def load_data_iedb(iedb_csv, include_qualitative=True): quantitative = iedb_df.ix[iedb_df["Units"] == "nM"].copy() quantitative["measurement_type"] = "quantitative" + quantitative["measurement_inequality"] = "=" print("Quantitative measurements: %d" % len(quantitative)) qualitative = iedb_df.ix[iedb_df["Units"] != "nM"].copy() qualitative["measurement_type"] = "qualitative" print("Qualitative measurements: %d" % len(qualitative)) - non_mass_spec_qualitative = qualitative.ix[ - (~qualitative["Method/Technique"].str.contains("mass spec")) - ].copy() - non_mass_spec_qualitative["Quantitative measurement"] = ( - non_mass_spec_qualitative["Qualitative Measure"].map( - QUALITATIVE_TO_AFFINITY)) - print("Qualitative measurements after dropping MS: %d" % ( - len(non_mass_spec_qualitative))) + #qualitative = qualitative.ix[ + # (~qualitative["Method/Technique"].str.contains("mass spec")) + #].copy() + + qualitative["Quantitative measurement"] = ( + qualitative["Qualitative Measure"].map(QUALITATIVE_TO_AFFINITY)) + qualitative["measurement_inequality"] = ( + qualitative["Qualitative Measure"].map(QUALITATIVE_TO_INEQUALITY)) + + print("Qualitative measurements (possibly after dropping MS): %d" % ( + len(qualitative))) iedb_df = pandas.concat( ( ([quantitative]) + - ([non_mass_spec_qualitative] if include_qualitative else [])), + ([qualitative] if include_qualitative else [])), ignore_index=True) print("IEDB measurements per allele:\n%s" % iedb_df.allele.value_counts()) @@ -145,6 +154,7 @@ def load_data_iedb(iedb_csv, include_qualitative=True): "Quantitative measurement" ].values train_data["measurement_source"] = iedb_df.category.values + train_data["measurement_inequality"] = iedb_df.measurement_inequality.values train_data["allele"] = iedb_df["allele"].values train_data["original_allele"] = iedb_df["Allele Name"].values @@ -181,6 +191,7 @@ def run(): "allele", "peptide", "measurement_value", + "measurement_inequality", "measurement_type", "measurement_source", "original_allele", diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py index f56865ebf9efacb619791fde6b34db906199d045..ffd9744b1ba3c6d5077d12c0cc652936c118a412 100644 --- a/mhcflurry/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_predictor.py @@ -14,12 +14,12 @@ import pandas from numpy.testing import assert_equal from six import string_types -from mhcflurry.class1_neural_network import Class1NeuralNetwork -from mhcflurry.common import random_peptides -from mhcflurry.downloads import get_path -from mhcflurry.encodable_sequences import EncodableSequences -from mhcflurry.percent_rank_transform import PercentRankTransform -from mhcflurry.regression_target import to_ic50 +from .class1_neural_network import Class1NeuralNetwork +from .common import random_peptides +from .downloads import get_path +from .encodable_sequences import EncodableSequences +from .percent_rank_transform import PercentRankTransform +from .regression_target import to_ic50 class Class1AffinityPredictor(object): @@ -368,7 +368,7 @@ class Class1AffinityPredictor(object): affinities, inequalities=None, models_dir_for_save=None, - verbose=1, + verbose=0, progress_preamble=""): """ Fit one or more allele specific predictors for a single allele using a @@ -425,7 +425,7 @@ class Class1AffinityPredictor(object): if n_architectures > 1: pieces.append( "Architecture {architecture_num:2d} / {n_architectures:2d}" - " (best so far: {best_num:2d)") + " (best so far: {best_num})") progress_preamble_template = "[ %s ] {user_progress_preamble}" % ( ", ".join(pieces)) else: @@ -450,13 +450,12 @@ class Class1AffinityPredictor(object): verbose=verbose, progress_preamble=progress_preamble_template.format( user_progress_preamble=progress_preamble, - best_num=best_num, - model_num=model_num, + best_num="n/a" if best_num is None else best_num + 1, + model_num=model_num + 1, n_models=n_models, - architecture_num=architecture_num, + architecture_num=architecture_num + 1, n_architectures=n_architectures)) - if n_architectures > 1: # We require val_loss (i.e. a validation set) if we have # multiple architectures. @@ -464,11 +463,14 @@ class Class1AffinityPredictor(object): else: loss = None if loss is None or best_loss is None or best_loss > loss: - best_loss = best_loss + best_loss = loss best_num = architecture_num best_model = model del model + if n_architectures > 1: + print("Selected architecture %d." % (best_num + 1)) + model_name = self.model_name(allele, model_num) row = pandas.Series(collections.OrderedDict([ ("model_name", model_name), diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py index c8e674b2ab211df1417c76593f607bcd91190cbc..583499878c6adc152f6ce18171c911374ac342c5 100644 --- a/mhcflurry/class1_neural_network.py +++ b/mhcflurry/class1_neural_network.py @@ -444,9 +444,17 @@ class Class1NeuralNetwork(object): if sample_weights is not None: sample_weights = sample_weights[shuffle_permutation] - if self.hyperparameters['loss'] in LOSSES: + if self.hyperparameters['loss'].startswith("custom:"): # Using a custom loss that supports inequalities - loss_name_or_function = LOSSES[self.hyperparameters['loss']] + try: + loss_name_or_function = LOSSES[ + self.hyperparameters['loss'].replace("custom:", "") + ] + except KeyError: + raise ValueError( + "No such custom loss function: %s. Supported losses are: %s" % ( + self.hyperparameters['loss'], + ", ".join(["custom:" + loss_name for loss_name in LOSSES]))) loss_supports_inequalities = True else: # Using a regular keras loss. No inequalities supported. diff --git a/mhcflurry/loss_with_inequalities.py b/mhcflurry/loss_with_inequalities.py index a1efe0682b28b7c62028a5f4bdaf0d581ef7cb39..98a0383e36a75a820efef0bf3924e7ecabb6bc5a 100644 --- a/mhcflurry/loss_with_inequalities.py +++ b/mhcflurry/loss_with_inequalities.py @@ -25,14 +25,14 @@ between 4 - 5: from keras import backend as K import pandas -import numpy +from numpy import isnan, array LOSSES = {} def encode_y(y, inequalities=None): - y = numpy.array(y, dtype="float32") - if y.isnan().any(): + y = array(y, dtype="float32") + if isnan(y).any(): raise ValueError("y contains NaN") if (y > 1.0).any(): raise ValueError("y contains values > 1.0") @@ -47,14 +47,14 @@ def encode_y(y, inequalities=None): '<': 2, '>': 4, }).values - if offsets.isnan().any(): + if isnan(offsets).any(): raise ValueError("Invalid inequality. Must be =, <, or >") encoded = y + offsets - assert not encoded.isnan().any() + assert not isnan(encoded).any() return encoded -def mse_with_ineqalities(y_true, y_pred): +def mse_with_inequalities(y_true, y_pred): # Handle (=) inequalities diff1 = y_pred - y_true diff1 *= K.cast(y_true >= 0.0, "float32") @@ -75,4 +75,4 @@ def mse_with_ineqalities(y_true, y_pred): K.sum(K.square(diff1), axis=-1) + K.sum(K.square(diff2), axis=-1) + K.sum(K.square(diff3), axis=-1)) -LOSSES["mse_with_ineqalities"] = mse_with_ineqalities \ No newline at end of file +LOSSES["mse_with_inequalities"] = mse_with_inequalities \ No newline at end of file