Skip to content
Snippets Groups Projects
Commit dfa76ecc authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

fixes

parent f935fde8
No related merge requests found
......@@ -4,9 +4,6 @@ Train single allele models
"""
import sys
import argparse
import json
import os
import pickle
import pandas
......@@ -37,13 +34,20 @@ parser.add_argument(
required=True,
help="Result file")
QUALITATIVE_TO_AFFINITY = {
"Negative": 50000.0,
"Positive": 100.0,
"Positive-High": 50.0,
"Positive-Intermediate": 500.0,
"Positive-Low": 5000.0,
# Map IEDB qualitative measurement labels to (affinity in nM, inequality).
# "Negative" results are treated as lower bounds (affinity > 20000 nM);
# positive results are upper bounds (affinity < threshold), with tighter
# thresholds for stronger qualitative calls.
QUALITATIVE_TO_AFFINITY_AND_INEQUALITY = {
    "Negative": (20000.0, ">"),
    "Positive": (500.0, "<"),
    "Positive-High": (100.0, "<"),
    "Positive-Intermediate": (1000.0, "<"),
    "Positive-Low": (5000.0, "<"),
}

# Derived lookup tables: label -> affinity (nM) and label -> inequality.
QUALITATIVE_TO_AFFINITY = {
    label: affinity
    for (label, (affinity, _))
    in QUALITATIVE_TO_AFFINITY_AND_INEQUALITY.items()
}
QUALITATIVE_TO_INEQUALITY = {
    label: inequality
    for (label, (_, inequality))
    in QUALITATIVE_TO_AFFINITY_AND_INEQUALITY.items()
}
EXCLUDE_IEDB_ALLELES = [
"HLA class I",
......@@ -60,6 +64,7 @@ def load_data_kim2014(filename):
True: "quantitative",
False: "qualitative",
})
df["measurement_inequality"] = df.inequality
df["original_allele"] = df.mhc
df["peptide"] = df.sequence
df["allele"] = df.mhc.map(normalize_allele_name)
......@@ -99,24 +104,28 @@ def load_data_iedb(iedb_csv, include_qualitative=True):
quantitative = iedb_df.ix[iedb_df["Units"] == "nM"].copy()
quantitative["measurement_type"] = "quantitative"
quantitative["measurement_inequality"] = "="
print("Quantitative measurements: %d" % len(quantitative))
qualitative = iedb_df.ix[iedb_df["Units"] != "nM"].copy()
qualitative["measurement_type"] = "qualitative"
print("Qualitative measurements: %d" % len(qualitative))
non_mass_spec_qualitative = qualitative.ix[
(~qualitative["Method/Technique"].str.contains("mass spec"))
].copy()
non_mass_spec_qualitative["Quantitative measurement"] = (
non_mass_spec_qualitative["Qualitative Measure"].map(
QUALITATIVE_TO_AFFINITY))
print("Qualitative measurements after dropping MS: %d" % (
len(non_mass_spec_qualitative)))
#qualitative = qualitative.ix[
# (~qualitative["Method/Technique"].str.contains("mass spec"))
#].copy()
qualitative["Quantitative measurement"] = (
qualitative["Qualitative Measure"].map(QUALITATIVE_TO_AFFINITY))
qualitative["measurement_inequality"] = (
qualitative["Qualitative Measure"].map(QUALITATIVE_TO_INEQUALITY))
print("Qualitative measurements (possibly after dropping MS): %d" % (
len(qualitative)))
iedb_df = pandas.concat(
(
([quantitative]) +
([non_mass_spec_qualitative] if include_qualitative else [])),
([qualitative] if include_qualitative else [])),
ignore_index=True)
print("IEDB measurements per allele:\n%s" % iedb_df.allele.value_counts())
......@@ -145,6 +154,7 @@ def load_data_iedb(iedb_csv, include_qualitative=True):
"Quantitative measurement"
].values
train_data["measurement_source"] = iedb_df.category.values
train_data["measurement_inequality"] = iedb_df.measurement_inequality.values
train_data["allele"] = iedb_df["allele"].values
train_data["original_allele"] = iedb_df["Allele Name"].values
......@@ -181,6 +191,7 @@ def run():
"allele",
"peptide",
"measurement_value",
"measurement_inequality",
"measurement_type",
"measurement_source",
"original_allele",
......
......@@ -14,12 +14,12 @@ import pandas
from numpy.testing import assert_equal
from six import string_types
from mhcflurry.class1_neural_network import Class1NeuralNetwork
from mhcflurry.common import random_peptides
from mhcflurry.downloads import get_path
from mhcflurry.encodable_sequences import EncodableSequences
from mhcflurry.percent_rank_transform import PercentRankTransform
from mhcflurry.regression_target import to_ic50
from .class1_neural_network import Class1NeuralNetwork
from .common import random_peptides
from .downloads import get_path
from .encodable_sequences import EncodableSequences
from .percent_rank_transform import PercentRankTransform
from .regression_target import to_ic50
class Class1AffinityPredictor(object):
......@@ -368,7 +368,7 @@ class Class1AffinityPredictor(object):
affinities,
inequalities=None,
models_dir_for_save=None,
verbose=1,
verbose=0,
progress_preamble=""):
"""
Fit one or more allele specific predictors for a single allele using a
......@@ -425,7 +425,7 @@ class Class1AffinityPredictor(object):
if n_architectures > 1:
pieces.append(
"Architecture {architecture_num:2d} / {n_architectures:2d}"
" (best so far: {best_num:2d)")
" (best so far: {best_num})")
progress_preamble_template = "[ %s ] {user_progress_preamble}" % (
", ".join(pieces))
else:
......@@ -450,13 +450,12 @@ class Class1AffinityPredictor(object):
verbose=verbose,
progress_preamble=progress_preamble_template.format(
user_progress_preamble=progress_preamble,
best_num=best_num,
model_num=model_num,
best_num="n/a" if best_num is None else best_num + 1,
model_num=model_num + 1,
n_models=n_models,
architecture_num=architecture_num,
architecture_num=architecture_num + 1,
n_architectures=n_architectures))
if n_architectures > 1:
# We require val_loss (i.e. a validation set) if we have
# multiple architectures.
......@@ -464,11 +463,14 @@ class Class1AffinityPredictor(object):
else:
loss = None
if loss is None or best_loss is None or best_loss > loss:
best_loss = best_loss
best_loss = loss
best_num = architecture_num
best_model = model
del model
if n_architectures > 1:
print("Selected architecture %d." % (best_num + 1))
model_name = self.model_name(allele, model_num)
row = pandas.Series(collections.OrderedDict([
("model_name", model_name),
......
......@@ -444,9 +444,17 @@ class Class1NeuralNetwork(object):
if sample_weights is not None:
sample_weights = sample_weights[shuffle_permutation]
if self.hyperparameters['loss'] in LOSSES:
if self.hyperparameters['loss'].startswith("custom:"):
# Using a custom loss that supports inequalities
loss_name_or_function = LOSSES[self.hyperparameters['loss']]
try:
loss_name_or_function = LOSSES[
self.hyperparameters['loss'].replace("custom:", "")
]
except KeyError:
raise ValueError(
"No such custom loss function: %s. Supported losses are: %s" % (
self.hyperparameters['loss'],
", ".join(["custom:" + loss_name for loss_name in LOSSES])))
loss_supports_inequalities = True
else:
# Using a regular keras loss. No inequalities supported.
......
......@@ -25,14 +25,14 @@ between 4 - 5:
from keras import backend as K
import pandas
import numpy
from numpy import isnan, array
LOSSES = {}
def encode_y(y, inequalities=None):
y = numpy.array(y, dtype="float32")
if y.isnan().any():
y = array(y, dtype="float32")
if isnan(y).any():
raise ValueError("y contains NaN")
if (y > 1.0).any():
raise ValueError("y contains values > 1.0")
......@@ -47,14 +47,14 @@ def encode_y(y, inequalities=None):
'<': 2,
'>': 4,
}).values
if offsets.isnan().any():
if isnan(offsets).any():
raise ValueError("Invalid inequality. Must be =, <, or >")
encoded = y + offsets
assert not encoded.isnan().any()
assert not isnan(encoded).any()
return encoded
def mse_with_ineqalities(y_true, y_pred):
def mse_with_inequalities(y_true, y_pred):
# Handle (=) inequalities
diff1 = y_pred - y_true
diff1 *= K.cast(y_true >= 0.0, "float32")
......@@ -75,4 +75,4 @@ def mse_with_ineqalities(y_true, y_pred):
K.sum(K.square(diff1), axis=-1) +
K.sum(K.square(diff2), axis=-1) +
K.sum(K.square(diff3), axis=-1))
LOSSES["mse_with_ineqalities"] = mse_with_ineqalities
\ No newline at end of file
LOSSES["mse_with_inequalities"] = mse_with_inequalities
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment