updates

a2cebd98 · Tim O'Donnell · 2e8dc426 · a2cebd98 · a2cebd98 · a2cebd98
Commit a2cebd98 authored 5 years ago by Tim O'Donnell
--- a/mhcflurry/allele_encoding.py
+++ b/mhcflurry/allele_encoding.py
@@ -16,7 +16,7 @@ class AlleleEncoding(object):
        integer indices in a consistent way by keeping track of the universe
        of alleles under use, i.e. a distinction is made between the universe
        of supported alleles (what's in `allele_to_sequence`) and the actual
-        set of alleles used (what's in `alleles`).
+        set of alleles used for some task (what's in `alleles`).

        Parameters
        ----------
@@ -45,11 +45,12 @@ class AlleleEncoding(object):
                sorted(allele_to_sequence))
            self.allele_to_index = dict(
                (allele, i)
-                for (i, allele) in enumerate(all_alleles))
-            self.allele_to_index[None] = -1  # special mask value
-            unpadded = pandas.Series(
-                [allele_to_sequence[a] for a in all_alleles],
-                index=all_alleles)
+                for (i, allele) in enumerate([None] + all_alleles))
+            unpadded = pandas.Series([
+                    allele_to_sequence[a] if a is not None else ""
+                    for a in [None] + all_alleles
+                ],
+                index=[None] + all_alleles)
            self.sequences = unpadded.str.pad(
                unpadded.str.len().max(), fillchar="X")
        else:
@@ -85,7 +86,8 @@ class AlleleEncoding(object):
            alleles=self.alleles,
            allele_to_sequence=dict(
                (allele, self.allele_to_sequence[allele])
-                for allele in self.alleles.unique()))
+                for allele in self.alleles.unique()
+                if allele is not None))

    def allele_representations(self, encoding_name):
        """
@@ -173,6 +175,20 @@ class MultipleAlleleEncoding(object):
        )
        self.max_alleles_per_experiment = max_alleles_per_experiment

+    def append_alleles(self, alleles):
+        extended_alleles = list(self.allele_encoding.alleles)
+        for allele in alleles:
+            extended_alleles.append(allele)
+            extended_alleles.extend(
+                [None] * (self.max_alleles_per_experiment - 1))
+
+        assert len(extended_alleles) % self.max_alleles_per_experiment == 0, (
+            len(extended_alleles))
+
+        self.allele_encoding = AlleleEncoding(
+            alleles=extended_alleles,
+            borrow_from=self.allele_encoding)
+
    @property
    def indices(self):
        return self.allele_encoding.indices.values.reshape(

--- a/mhcflurry/class1_ligandome_predictor.py
+++ b/mhcflurry/class1_ligandome_predictor.py
@@ -109,6 +109,7 @@ class Class1LigandomePredictor(object):
            RepeatVector,
            concatenate,
            Reshape,
+            Lambda,
            Embedding)
        from keras.models import Model

@@ -134,9 +135,11 @@ class Class1LigandomePredictor(object):
            input_dim=64,  # arbitrary, how many alleles to have room for
            output_dim=1029,
            input_length=6,
-            trainable=False)(input_alleles)
+            trainable=False,
+            mask_zero=True)(input_alleles)

-        allele_flat = Reshape((6, -1), name="allele_flat")(allele_representation)
+        #allele_flat = Reshape((6, -1), name="allele_flat")(allele_representation)
+        allele_flat = allele_representation

        allele_peptide_merged = concatenate(
            [peptides_repeated, allele_flat], name="allele_peptide_merged")
@@ -184,7 +187,7 @@ class Class1LigandomePredictor(object):

            layer_name_to_new_node[layer.name] = node

-        affinity_predictor_output = node
+        affinity_predictor_output = Lambda(lambda x: x[:, 0])(node)

        for (i, layer_size) in enumerate(
                hyperparameters['additional_dense_layers']):
@@ -198,6 +201,10 @@ class Class1LigandomePredictor(object):
        lifted = TimeDistributed(layer)
        ligandome_output = lifted(node)

+        #output_node = concatenate([
+        #    affinity_predictor_output, ligandome_output
+        #], name="combined_output")
+
        network = Model(
            inputs=[input_peptides, input_alleles],
            outputs=[affinity_predictor_output, ligandome_output],
@@ -281,31 +288,47 @@ class Class1LigandomePredictor(object):
        peptide_encoding = peptide_encoding[shuffle_permutation]
        allele_encoding_input = allele_encoding_input[shuffle_permutation]
        labels = labels[shuffle_permutation]
+        inequalities = inequalities[shuffle_permutation]
+        affinities_mask = affinities_mask[shuffle_permutation]

        x_dict = {
            'peptide': peptide_encoding,
            'allele': allele_encoding_input,
        }

-        loss = MSEWithInequalitiesAndMultipleOutputs(losses=[
-            MSEWithInequalities(),
-            MultiallelicMassSpecLoss(
-                delta=self.hyperparameters['loss_delta']),
-        ])
-
-        y_values_pre_encoding = labels.copy()
+        y1 = numpy.zeros(shape=len(labels))
        if affinities_mask is not None:
-            y_values_pre_encoding[affinities_mask] = from_ic50(labels)
-        y_values = loss.encode_y(
-            y_values_pre_encoding,
-            inequalities=inequalities[affinities_mask] if inequalities is not None else None,
-            output_indices=(~affinities_mask).astype(int) if affinities_mask is not None else numpy.ones(len(y_values_pre_encoding), dtype=int))
+            y1[affinities_mask] = from_ic50(labels[affinities_mask])
+
+        if inequalities is not None:
+            # Reverse inequalities because from_ic50() flips the direction
+            # (i.e. lower affinity results in higher y values).
+            adjusted_inequalities = pandas.Series(inequalities).map({
+                "=": "=",
+                ">": "<",
+                "<": ">",
+            }).values
+        else:
+            adjusted_inequalities = numpy.tile("=", len(y1))
+
+        adjusted_inequalities[~affinities_mask] = ">"
+
+        affinities_loss = MSEWithInequalities()
+        encoded_y1 = affinities_loss.encode_y(
+            y1, inequalities=adjusted_inequalities)
+
+        mms_loss = MultiallelicMassSpecLoss(
+            delta=self.hyperparameters['loss_delta'])
+
+        y2 = labels.copy()
+        y2[affinities_mask] = -1
+        encoded_y2 = mms_loss.encode_y(y2)

        fit_info = collections.defaultdict(list)

        self.set_allele_representations(allele_representations)
        self.network.compile(
-            loss=loss.loss,
+            loss=[affinities_loss.loss, mms_loss.loss],
            optimizer=self.hyperparameters['optimizer'])
        if self.hyperparameters['learning_rate'] is not None:
            K.set_value(
@@ -328,7 +351,7 @@ class Class1LigandomePredictor(object):
            # to a single experiment
            fit_history = self.network.fit(
                x_dict,
-                labels,
+                [encoded_y1, encoded_y2],
                shuffle=True,
                batch_size=self.hyperparameters['minibatch_size'],
                verbose=verbose,

--- a/mhcflurry/custom_loss.py
+++ b/mhcflurry/custom_loss.py
@@ -182,10 +182,8 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
    supports_inequalities = True
    supports_multiple_outputs = True

-    def __init__(self, losses=MSEWithInequalities):
-        self.losses = losses
-
-    def encode_y(self, y, inequalities=None, output_indices=None):
+    @staticmethod
+    def encode_y(y, inequalities=None, output_indices=None):
        y = array(y, dtype="float32")
        if isnan(y).any():
            raise ValueError("y contains NaN", y)
@@ -194,25 +192,8 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
        if (y < 0.0).any():
            raise ValueError("y contains values < 0.0", y)

-        if isinstance(self.losses, Loss):
-            # Single loss applied to all outputs
-            encoded = MSEWithInequalities.encode_y(y, inequalities=inequalities)
-        else:
-            assert output_indices is not None
-            df = pandas.DataFrame({
-                "y": y,
-                "inequality": inequalities,
-                "output_index": output_indices,
-            })
-            encoded = y.copy()
-            encoded[:] = numpy.nan
-            for (output_index, sub_df) in df.groupby("output_index"):
-                loss = self.losses[output_index]
-                loss_kwargs = {}
-                if not sub_df.inequality.isnull().all():
-                    loss_kwargs['inequalities'] = sub_df.inequality.values
-                encoded[sub_df.index.values] = loss.encode_y(
-                    sub_df.y.values, **loss_kwargs)
+        encoded = MSEWithInequalities.encode_y(
+            y, inequalities=inequalities)

        if output_indices is not None:
            output_indices = numpy.array(output_indices)
@@ -224,7 +205,8 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):

        return encoded

-    def loss(self, y_true, y_pred):
+    @staticmethod
+    def loss(y_true, y_pred):
        from keras import backend as K

        y_true = K.flatten(y_true)
@@ -248,25 +230,17 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
        # ], axis=-1)
        #updated_y_pred = tf.gather_nd(y_pred, indexer)

-        if isinstance(self.losses, Loss):
-            # Single loss for all outputs.
-            return self.losses.loss(updated_y_true, updated_y_pred)
-        else:
-            # TODO: make this more efficient?
-            result = None
-            for (i, loss) in enumerate(self.losses):
-                values = (
-                    loss.loss(updated_y_true, updated_y_pred) *
-                    K.cast(K.equal(output_indices, i), "float32"))
-                if result is None:
-                    result = values
-                else:
-                    result += values
-            return result
+        return MSEWithInequalities.loss(updated_y_true, updated_y_pred)


 class MultiallelicMassSpecLoss(Loss):
    """
+    Affiniities are mapped according to MSEWithInequalities, then by:
+        x -> x + 10.
+
+    Mass spec hit vs. decoy are kept at [0, 1].
+
+
    """
    name = "multiallelic_mass_spec_loss"
    supports_inequalities = True
@@ -275,24 +249,38 @@ class MultiallelicMassSpecLoss(Loss):
    def __init__(self, delta=0.2):
        self.delta = delta

-    def encode_y(self, y):
-        return y
+    @staticmethod
+    def encode_y(y, affinities_mask=None, inequalities=None):
+        y = array(y, dtype="float32")
+        if isnan(y).any():
+            raise ValueError("y contains NaN", y)
+        if (y > 1.0).any():
+            raise ValueError("y contains values > 1.0", y)
+        if (y < 0.0).any():
+            raise ValueError("y contains values < 0.0", y)
+
+        encoded = y.copy()
+        if affinities_mask is not None:
+            encoded[affinities_mask] = MSEWithInequalities.encode_y(
+                encoded[affinities_mask], inequalities=inequalities) + 10.0
+
+        return encoded

    def loss(self, y_true, y_pred):
        import tensorflow as tf

-        #y_pred = tf.squeeze(y_pred, axis=-1)
-        y_true = tf.reshape(tf.cast(y_true, tf.bool), (-1,))
+        y_pred = tf.squeeze(y_pred, axis=-1)
+        y_true = tf.reshape(y_true, (-1,))

-        pos = tf.boolean_mask(y_pred, y_true)
+        pos = tf.boolean_mask(y_pred, tf.math.equal(y_true, 1.0))
        pos_max = tf.reduce_max(pos, axis=1)
-        neg = tf.boolean_mask(y_pred, tf.logical_not(y_true))
-        result = tf.reduce_sum(
-            tf.maximum(
-                0.0, tf.reshape(neg, (-1, 1)) - pos_max + self.delta) ** 2)
+        neg = tf.boolean_mask(y_pred, tf.math.equal(y_true, 0.0))
+        term = tf.reshape(neg, (-1, 1)) - pos_max + self.delta
+        result = tf.reduce_sum(tf.maximum(0.0, term) ** 2)
        return result


+
 def check_shape(name, arr, expected_shape):
    """
    Raise ValueError if arr.shape != expected_shape.
@@ -310,5 +298,5 @@ def check_shape(name, arr, expected_shape):


 # Register custom losses.
-for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs, MultiallelicMassSpecLoss]:
+for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
    CUSTOM_LOSSES[cls.name] = cls()
--- a/test/test_class1_ligandome_predictor.py
+++ b/test/test_class1_ligandome_predictor.py
@@ -117,6 +117,7 @@ def test_loss():
            1.0,
            0.0,
            1.0,
+            -1.0,  # ignored
            1.0,
            0.0
        ]
@@ -125,8 +126,10 @@ def test_loss():
            [0.3, 0.7, 0.5],
            [0.2, 0.4, 0.6],
            [0.1, 0.5, 0.3],
+            [0.9, 0.1, 0.2],
            [0.1, 0.7, 0.1],
            [0.8, 0.2, 0.4],
+
        ]
        y_pred = numpy.array(y_pred)

@@ -159,7 +162,7 @@ def test_loss():
            if y_true[i] == 1.0
        ])

-        neg = y_pred[~y_true.astype(bool)]
+        neg = y_pred[(y_true == 0.0).astype(bool)]
        expected2 = (
                numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum()

@@ -223,7 +226,112 @@ def make_motif(allele, peptides, frac=0.01):
    return matrix


-def test_synthetic_allele_refinement(max_epochs=10):
+def test_real_data_multiallelic_refinement(max_epochs=10):
+    ms_df = pandas.read_csv(
+        get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2"))
+    ms_df = ms_df.loc[
+        (ms_df.mhc_class == "I") & (~ms_df.protein_ensembl.isnull())].copy()
+
+    sample_table = ms_df.drop_duplicates(
+        "sample_id").set_index("sample_id").loc[ms_df.sample_id.unique()]
+    grouped = ms_df.groupby("sample_id").nunique()
+    for col in sample_table.columns:
+        if (grouped[col] > 1).any():
+            del sample_table[col]
+    sample_table["alleles"] = sample_table.hla.str.split()
+
+    multi_train_df = ms_df.loc[
+        ms_df.sample_id  == "RA957"
+    ].drop_duplicates("peptide")[["peptide", "sample_id"]].reset_index(drop=True)
+    multi_train_df["label"] = 1.0
+    multi_train_df["is_affinity"] = False
+
+    multi_train_alleles = set()
+    for alleles in sample_table.loc[multi_train_df.sample_id.unique()].alleles:
+        multi_train_alleles.update(alleles)
+    multi_train_alleles = sorted(multi_train_alleles)
+
+    pan_train_df = pandas.read_csv(
+        get_path(
+            "models_class1_pan", "models.with_mass_spec/train_data.csv.bz2"))
+
+    pan_sub_train_df = pan_train_df.loc[
+        pan_train_df.allele.isin(multi_train_alleles),
+        ["peptide", "allele", "measurement_inequality", "measurement_value"]
+    ]
+    pan_sub_train_df["label"] = pan_sub_train_df["measurement_value"]
+    del pan_sub_train_df["measurement_value"]
+    pan_sub_train_df["is_affinity"] = True
+
+    pan_predictor = Class1AffinityPredictor.load(
+        get_path("models_class1_pan", "models.with_mass_spec"),
+        optimization_level=0,
+        max_models=1)
+
+    allele_encoding = MultipleAlleleEncoding(
+        experiment_names=multi_train_df.sample_id.values,
+        experiment_to_allele_list=sample_table.alleles.to_dict(),
+        max_alleles_per_experiment=sample_table.alleles.str.len().max(),
+        allele_to_sequence=pan_predictor.allele_to_sequence,
+    )
+    allele_encoding.append_alleles(pan_sub_train_df.allele.values)
+    allele_encoding =  allele_encoding.compact()
+
+    combined_train_df = pandas.concat([multi_train_df, pan_sub_train_df])
+
+    ligandome_predictor = Class1LigandomePredictor(
+        pan_predictor,
+        max_ensemble_size=1,
+        max_epochs=500,
+        learning_rate=0.0001,
+        patience=5,
+        min_delta=0.0)
+
+    pre_predictions = from_ic50(
+        ligandome_predictor.predict(
+            output="affinities",
+            peptides=combined_train_df.peptide.values,
+            allele_encoding=allele_encoding))
+
+    (model,) = pan_predictor.class1_pan_allele_models
+    expected_pre_predictions = from_ic50(
+        model.predict(
+            peptides=numpy.repeat(combined_train_df.peptide.values, len(alleles)),
+            allele_encoding=allele_encoding.allele_encoding,
+    )).reshape((-1, len(alleles)))[:,0]
+
+    assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)
+
+    motifs_history = []
+    random_peptides_encodable = make_random_peptides(10000, [9])
+
+
+    def update_motifs():
+        for allele in multi_train_alleles:
+            motif = make_motif(allele, random_peptides_encodable)
+            motifs_history.append((allele, motif))
+
+    print("Pre fitting:")
+    update_motifs()
+    print("Fitting...")
+
+    ligandome_predictor.fit(
+        peptides=combined_train_df.peptide.values,
+        labels=combined_train_df.label.values,
+        allele_encoding=allele_encoding,
+        affinities_mask=combined_train_df.is_affinity.values,
+        inequalities=combined_train_df.measurement_inequality.values,
+        progress_callback=update_motifs,
+    )
+
+    #import ipdb ; ipdb.set_trace()
+
+
+
+
+
+
+def Xtest_synthetic_allele_refinement(max_epochs=10):
    refine_allele = "HLA-C*01:02"
    alleles = [
        "HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01",