Commit 289dbb09 authored by Tim O'Donnell

Fix bug in inequality support, bump to version 1.1.0, add test

parent 73ddb9d2
from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor
from mhcflurry.class1_neural_network import Class1NeuralNetwork
-__version__ = "1.0.0"
+__version__ = "1.1.0"
__all__ = [
    "__version__",
@@ -11,7 +11,7 @@ from .encodable_sequences import EncodableSequences
from .amino_acid import available_vector_encodings, vector_encoding_length
from .regression_target import to_ic50, from_ic50
from .common import random_peptides, amino_acid_distribution
-from .loss_with_inequalities import encode_y, LOSSES
+from .custom_loss import CUSTOM_LOSSES
class Class1NeuralNetwork(object):
@@ -447,23 +447,29 @@ class Class1NeuralNetwork(object):
        if self.hyperparameters['loss'].startswith("custom:"):
            # Using a custom loss that supports inequalities
            try:
-               loss_name_or_function = LOSSES[
+               custom_loss = CUSTOM_LOSSES[
                    self.hyperparameters['loss'].replace("custom:", "")
                ]
            except KeyError:
                raise ValueError(
                    "No such custom loss function: %s. Supported losses are: %s" % (
                        self.hyperparameters['loss'],
-                       ", ".join(["custom:" + loss_name for loss_name in LOSSES])))
-           loss_supports_inequalities = True
+                       ", ".join([
+                           "custom:" + loss_name for loss_name in CUSTOM_LOSSES
+                       ])))
+           loss_name_or_function = custom_loss.loss
+           loss_supports_inequalities = custom_loss.supports_inequalities
+           loss_encode_y_function = custom_loss.encode_y
        else:
            # Using a regular keras loss. No inequalities supported.
            loss_name_or_function = self.hyperparameters['loss']
            loss_supports_inequalities = False
+           loss_encode_y_function = None

-       if any(inequality != "=" for inequality in adjusted_inequalities):
-           raise ValueError("Loss %s does not support inequalities" % (
-               loss_name_or_function))
+       if not loss_supports_inequalities and (
+               any(inequality != "=" for inequality in adjusted_inequalities)):
+           raise ValueError("Loss %s does not support inequalities" % (
+               loss_name_or_function))

        if self.network() is None:
            self._network = self.make_network(
@@ -512,8 +518,8 @@ class Class1NeuralNetwork(object):
        else:
            sample_weights_with_random_negatives = None
-       if loss_supports_inequalities:
-           y_dict_with_random_negatives['output'] = encode_y(
+       if loss_encode_y_function is not None:
+           y_dict_with_random_negatives['output'] = loss_encode_y_function(
                y_dict_with_random_negatives['output'],
                adjusted_inequalities_with_random_negatives)
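Taken together, the hunks above (presumably in mhcflurry's class1_neural_network.py) replace the old LOSSES/encode_y pair with a CUSTOM_LOSSES registry whose entries carry the loss callable, a supports_inequalities flag, and their own target encoder. Below is a minimal sketch of how a "custom:" loss string resolves against that registry; the registry and attribute names come from this diff, while the surrounding script is an editor's illustration, not part of the commit.

# Editor's sketch: resolving a "custom:" loss hyperparameter against CUSTOM_LOSSES.
from mhcflurry.custom_loss import CUSTOM_LOSSES

loss_spec = "custom:mse_with_inequalities"
custom_loss = CUSTOM_LOSSES[loss_spec.replace("custom:", "")]

loss_function = custom_loss.loss                 # callable handed to Keras
supports = custom_loss.supports_inequalities     # True for this loss
encode_targets = custom_loss.encode_y            # applied to y values before fitting
print(loss_function, supports, encode_targets)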
"""
Custom loss functions.

For losses supporting inequalities, each training data point is associated with
one of (=), (<), or (>). For (>) inequalities, for example, penalization is
applied only if the prediction is less than the given value.
"""
import pandas
from numpy import isnan, array

CUSTOM_LOSSES = {}
class MSEWithInequalities(object):
    """
    Supports training a regressor on data that includes inequalities
    (e.g. x < 100). Mean square error is used as the loss for elements with
    an (=) inequality. For elements with e.g. a (> 0.5) inequality, the loss
    for that element is (y - 0.5)^2 (standard MSE) if y < 0.5 and 0 otherwise.

    This loss assumes that the normal range for y_true and y_pred is 0 - 1. As a
    hack, the implementation uses other intervals for y_true to encode the
    inequality information.

    y_true is interpreted as follows:

    between 0 - 1:
        Regular MSE loss is used. Penalty (y_pred - y_true)**2 is applied
        whether y_pred is greater or less than y_true.

    between 2 - 3:
        Treated as a ">" inequality. Penalty (y_pred - (y_true - 2))**2 is
        applied only if y_pred is less than y_true - 2.

    between 4 - 5:
        Treated as a "<" inequality. Penalty (y_pred - (y_true - 4))**2 is
        applied only if y_pred is greater than y_true - 4.
    """
    name = "mse_with_inequalities"
    supports_inequalities = True
    @staticmethod
    def encode_y(y, inequalities=None):
        y = array(y, dtype="float32")
        if isnan(y).any():
            raise ValueError("y contains NaN")
        if (y > 1.0).any():
            raise ValueError("y contains values > 1.0")
        if (y < 0.0).any():
            raise ValueError("y contains values < 0.0")

        if inequalities is None:
            encoded = y
        else:
            offsets = pandas.Series(inequalities).map({
                '=': 0,
                '>': 2,
                '<': 4,
            }).values
            if isnan(offsets).any():
                raise ValueError("Invalid inequality. Must be =, <, or >")
            encoded = y + offsets
        assert not isnan(encoded).any()
        return encoded
    @staticmethod
    def loss(y_true, y_pred):
        # We always delay import of Keras so that mhcflurry can be imported
        # initially without tensorflow debug output, etc.
        from keras import backend as K

        # Handle (=) inequalities
        diff1 = y_pred - y_true
        diff1 *= K.cast(y_true >= 0.0, "float32")
        diff1 *= K.cast(y_true <= 1.0, "float32")

        # Handle (>) inequalities
        diff2 = y_pred - (y_true - 2.0)
        diff2 *= K.cast(y_true >= 2.0, "float32")
        diff2 *= K.cast(y_true <= 3.0, "float32")
        diff2 *= K.cast(diff2 < 0.0, "float32")

        # Handle (<) inequalities
        diff3 = y_pred - (y_true - 4.0)
        diff3 *= K.cast(y_true >= 4.0, "float32")
        diff3 *= K.cast(diff3 > 0.0, "float32")

        return (
            K.sum(K.square(diff1), axis=-1) +
            K.sum(K.square(diff2), axis=-1) +
            K.sum(K.square(diff3), axis=-1))


# Register custom losses.
for cls in [MSEWithInequalities]:
    CUSTOM_LOSSES[cls.name] = cls()
\ No newline at end of file
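The MSEWithInequalities docstring above describes the interval trick used to smuggle inequality information through y_true. A small worked example (an editor's addition, not part of the commit) shows what encode_y produces under the new mapping and how each encoded value is treated by the loss:

# Editor's example: encoding three measurements with (=), (>), and (<) constraints.
from mhcflurry.custom_loss import MSEWithInequalities

encoded = MSEWithInequalities.encode_y(
    [0.9, 0.9, 0.1],
    inequalities=["=", ">", "<"])
print(encoded)  # approximately [0.9, 2.9, 4.1]
# 0.9 (offset 0): ordinary MSE against 0.9
# 2.9 (offset 2): ">" constraint, penalized only if the prediction falls below 0.9
# 4.1 (offset 4): "<" constraint, penalized only if the prediction rises above 0.1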
@@ -8,7 +8,7 @@
# by name, the downloads with "default=true" are downloaded.
# This should usually be the latest release.
-current-release: 1.0.0
+current-release: 1.1.0

# An integer indicating what models the current MHCflurry code base is compatible
# with. Increment this integer when changes are made to MHCflurry that would break
@@ -17,6 +17,33 @@ current-compatibility-version: 2
# Add new releases here as they are made.
releases:
    1.1.0:
        compatibility-version: 2

        downloads:
            - name: models_class1
              url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.1/models_class1.20180116.tar.bz2
              default: true

            - name: models_class1_experiments1
              url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/models_class1_experiments1.tar.bz2
              default: false

            - name: cross_validation_class1
              url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/cross_validation_class1.tar.bz2
              default: false

            - name: data_iedb
              url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_iedb.tar.bz2
              default: false

            - name: data_kim2014
              url: http://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_kim2014.tar.bz2
              default: false

            - name: data_curated
              url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_curated.tar.bz2
              default: true

    1.0.0:
        compatibility-version: 2

        downloads:
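The downloads.yml hunk above registers the 1.1.0 release and its named downloads. For orientation, here is a hedged sketch of how these entries are typically consumed in code via get_path (which the test below imports); the exact argument form is an assumption, not something shown in this diff.

# Editor's sketch (argument form assumed): resolve a named download from the
# active release to a local path, after it has been fetched.
from mhcflurry.downloads import get_path

models_dir = get_path("models_class1", "models")
print(models_dir)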
"""
Custom loss functions.

Supports training a regressor on data that includes inequalities (e.g. x < 100).
This loss assumes that the normal range for y_true and y_pred is 0 - 1. As a
hack, the implementation uses other intervals for y_pred to encode the
inequality information.

y_true is interpreted as follows:

between 0 - 1:
    Regular MSE loss is used. Penalty (y_pred - y_true)**2 is applied if
    y_pred is greater or less than y_true.

between 2 - 3:
    Treated as a "<" inequality. Penalty (y_pred - (y_true - 2))**2 is
    applied only if y_pred is greater than y_true - 2.

between 4 - 5:
    Treated as a ">" inequality. Penalty (y_pred - (y_true - 4))**2 is
    applied only if y_pred is less than y_true - 4.
"""
from keras import backend as K

import pandas
from numpy import isnan, array

LOSSES = {}
def encode_y(y, inequalities=None):
    y = array(y, dtype="float32")
    if isnan(y).any():
        raise ValueError("y contains NaN")
    if (y > 1.0).any():
        raise ValueError("y contains values > 1.0")
    if (y < 0.0).any():
        raise ValueError("y contains values < 0.0")

    if inequalities is None:
        encoded = y
    else:
        offsets = pandas.Series(inequalities).map({
            '=': 0,
            '<': 2,
            '>': 4,
        }).values
        if isnan(offsets).any():
            raise ValueError("Invalid inequality. Must be =, <, or >")
        encoded = y + offsets
    assert not isnan(encoded).any()
    return encoded
def mse_with_inequalities(y_true, y_pred):
    # Handle (=) inequalities
    diff1 = y_pred - y_true
    diff1 *= K.cast(y_true >= 0.0, "float32")
    diff1 *= K.cast(y_true <= 1.0, "float32")

    # Handle (>) inequalities
    diff2 = y_pred - (y_true - 2.0)
    diff2 *= K.cast(y_true >= 2.0, "float32")
    diff2 *= K.cast(y_true <= 3.0, "float32")
    diff2 *= K.cast(diff2 < 0.0, "float32")

    # Handle (<) inequalities
    diff3 = y_pred - (y_true - 4.0)
    diff3 *= K.cast(y_true >= 4.0, "float32")
    diff3 *= K.cast(diff3 > 0.0, "float32")

    return (
        K.sum(K.square(diff1), axis=-1) +
        K.sum(K.square(diff2), axis=-1) +
        K.sum(K.square(diff3), axis=-1))


LOSSES["mse_with_inequalities"] = mse_with_inequalities
\ No newline at end of file
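The module above is the pre-1.1.0 implementation that this commit replaces. Note that its encode_y offsets ('<' -> 2, '>' -> 4) disagree with the loss function, which treats the 2-3 interval as ">" (penalize only a too-low prediction) and the 4-5 interval as "<" (penalize only a too-high prediction) -- one inconsistency this commit removes. A plain-numpy re-implementation of the same masked squared error (an editor's sketch, not library code) makes the directional problem concrete:

# Editor's sketch: plain-numpy version of the masked squared error above.
import numpy as np

def masked_mse(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true, float), np.asarray(y_pred, float)
    d1 = (y_pred - y_true) * (y_true >= 0.0) * (y_true <= 1.0)
    d2 = (y_pred - (y_true - 2.0)) * (y_true >= 2.0) * (y_true <= 3.0)
    d2 = d2 * (d2 < 0.0)   # 2-3 interval: penalize only predictions below the target
    d3 = (y_pred - (y_true - 4.0)) * (y_true >= 4.0)
    d3 = d3 * (d3 > 0.0)   # 4-5 interval: penalize only predictions above the target
    return float((d1 ** 2 + d2 ** 2 + d3 ** 2).sum())

# A "> 0.9" measurement encoded with the OLD mapping ('>' -> +4) gives y_true = 4.9:
print(masked_mse([4.9], [1.0]))  # nonzero: a prediction satisfying the constraint is punished
print(masked_mse([4.9], [0.5]))  # zero: a violating prediction goes unpunished
# With the NEW mapping ('>' -> +2) the same measurement is encoded as 2.9:
print(masked_mse([2.9], [1.0]))  # zero
print(masked_mse([2.9], [0.5]))  # nonzero penalty, as intended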
from nose.tools import eq_, assert_less, assert_greater, assert_almost_equal
import numpy
import pandas
numpy.random.seed(0)
from numpy import testing
from mhcflurry.class1_neural_network import Class1NeuralNetwork
numpy.random.seed(0)
from nose.tools import eq_
from numpy import testing
import logging
logging.getLogger('tensorflow').disabled = True
from mhcflurry.class1_neural_network import Class1NeuralNetwork
from mhcflurry.downloads import get_path
from mhcflurry.common import random_peptides
-def test_class1_affinity_predictor_a0205_training_accuracy():
+def test_class1_neural_network_a0205_training_accuracy():
    # Memorize the dataset.
    hyperparameters = dict(
        activation="tanh",
@@ -80,3 +83,106 @@ def test_class1_affinity_predictor_a0205_training_accuracy():
    predictor2.fit(df.peptide.values, df.measurement_value.values)
    eq_(predictor.network().to_json(), predictor2.network().to_json())
def test_inequalities():
    # Memorize the dataset.
    hyperparameters = dict(
        loss="custom:mse_with_inequalities",
        activation="tanh",
        layer_sizes=[16],
        max_epochs=50,
        early_stopping=False,
        validation_split=0.0,
        locally_connected_layers=[
            {
                "filters": 8,
                "activation": "tanh",
                "kernel_size": 3
            }
        ],
        dense_layer_l1_regularization=0.0,
        dropout_probability=0.0)

    df = pandas.DataFrame()
    df["peptide"] = random_peptides(1000, length=9)

    # First half are binders
    df["binder"] = df.index < len(df) / 2
    df["value"] = df.binder.map({True: 100, False: 5000})
    df.loc[:10, "value"] = 1.0  # some strong binders

    df["inequality1"] = "="
    df["inequality2"] = df.binder.map({True: "<", False: "="})
    df["inequality3"] = df.binder.map({True: "=", False: ">"})

    # "A" at start of peptide indicates strong binder
    df["peptide"] = [
        ("C" if not row.binder else "A") + row.peptide[1:]
        for _, row in df.iterrows()
    ]

    fit_kwargs = {'verbose': 0}

    # Prediction1 uses no inequalities (i.e. all are (=))
    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(
        df.peptide.values,
        df.value.values,
        inequalities=df.inequality1.values,
        **fit_kwargs)
    df["prediction1"] = predictor.predict(df.peptide.values)

    # Prediction2 has a (<) inequality on binders and an (=) on non-binders
    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(
        df.peptide.values,
        df.value.values,
        inequalities=df.inequality2.values,
        **fit_kwargs)
    df["prediction2"] = predictor.predict(df.peptide.values)

    # Prediction3 has a (=) inequality on binders and a (>) on non-binders
    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(
        df.peptide.values,
        df.value.values,
        inequalities=df.inequality3.values,
        **fit_kwargs)
    df["prediction3"] = predictor.predict(df.peptide.values)

    df_binders = df.loc[df.binder]
    df_nonbinders = df.loc[~df.binder]

    print("***** Binders: *****")
    print(df_binders.head(5))

    print("***** Non-binders: *****")
    print(df_nonbinders.head(5))

    # Binders should always be given tighter predicted affinity than non-binders
    assert_less(df_binders.prediction1.mean(), df_nonbinders.prediction1.mean())
    assert_less(df_binders.prediction2.mean(), df_nonbinders.prediction2.mean())
    assert_less(df_binders.prediction3.mean(), df_nonbinders.prediction3.mean())

    # prediction2 binders should be tighter on average than prediction1
    # binders, since prediction2 has a (<) inequality for binders.
    # Non-binders should be about the same between prediction2 and prediction1
    assert_less(df_binders.prediction2.mean(), df_binders.prediction1.mean())
    assert_almost_equal(
        df_nonbinders.prediction2.mean(),
        df_nonbinders.prediction1.mean(),
        delta=2000)

    # prediction3 non-binders should be weaker on average than prediction2 (or 1)
    # non-binders, since prediction3 has a (>) inequality for these peptides.
    # Binders should be about the same.
    assert_greater(
        df_nonbinders.prediction3.mean(),
        df_nonbinders.prediction2.mean())
    assert_greater(
        df_nonbinders.prediction3.mean(),
        df_nonbinders.prediction1.mean())
    assert_almost_equal(
        df_binders.prediction3.mean(),
        df_binders.prediction1.mean(),
        delta=2000)
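The new test drives fit() with ic50-scale measurement values and per-row inequalities. To connect this to the 0-1 loss above: mhcflurry regresses on a decreasing transform of ic50 (class1_neural_network.py imports to_ic50/from_ic50), so a measurement like "ic50 < 100 nM" corresponds to a transformed target that is greater than the transformed 100 nM value. The sketch below is an editor's illustration of that flip under the usual 1 - log(ic50)/log(50000) transform, not an excerpt of the library.

# Editor's sketch: why an ic50 "<" measurement acts as a ">" constraint on the
# 0-1 regression target (the transform and flip shown here are illustrative).
import math

def from_ic50_approx(ic50, max_ic50=50000.0):
    # decreasing transform: tighter binding (small ic50) maps toward 1.0
    return min(1.0, max(0.0, 1.0 - math.log(ic50) / math.log(max_ic50)))

measurement, inequality = 100.0, "<"            # ic50 < 100 nM
target = from_ic50_approx(measurement)          # about 0.57
flipped = {"<": ">", ">": "<", "=": "="}[inequality]
print(target, flipped)                          # the loss then sees a (>) constraint near 0.57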