From e13848536566e8a0bd7ed1f154762dc0c299165e Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sun, 12 Jan 2020 16:48:16 -0500
Subject: [PATCH] First cut on cleavage predictor

---
 mhcflurry/class1_cleavage_neural_network.py | 443 ++++++++++++++++++++
 mhcflurry/encodable_sequences.py            |  41 ++
 test/test_class1_cleavage_neural_network.py |  65 +++
 3 files changed, 549 insertions(+)
 create mode 100644 mhcflurry/class1_cleavage_neural_network.py
 create mode 100644 test/test_class1_cleavage_neural_network.py

diff --git a/mhcflurry/class1_cleavage_neural_network.py b/mhcflurry/class1_cleavage_neural_network.py
new file mode 100644
index 00000000..a6f2077d
--- /dev/null
+++ b/mhcflurry/class1_cleavage_neural_network.py
@@ -0,0 +1,443 @@
+"""
+Idea:
+
+Fully convolutional network with softmax output. Let it take a 35mer:
+- N flank [10 aa]
+- peptide [7-15 aa]
+- C flank [10 aa]
+
+Train on monoallelic mass spec. Match positive examples (hits) to negatives
+from the same sample by finding unobserved peptides with predicted binding
+affinity as close as possible to that of the hit.
+
+In the final layer, take the max cleavage score over the peptide and the
+individual score for the position right before the peptide terminus, and
+compute the ratio of these (or its reciprocal). Hits get label 1, decoys
+get 0.
+
+For a hit with sequence
+
+NNNNNNNNNNPPPPPPPPPCCCCCCCCCC
+
+penalize on:
+
+[----------100000000----------]
+
+For a decoy with the same sequence, penalize it on:
+
+[----------0------------------]
+
+Train separate models for N- and C-terminal cleavage.
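+
+A rough numpy sketch of the hit/decoy penalty masks above (illustrative only;
+the implementation below instead uses a single sigmoid output):
+
+    import numpy
+
+    def cleavage_target_and_mask(n_flank_len, peptide_len, c_flank_len, is_hit):
+        # Positions are [n_flank | peptide | c_flank]. mask == 1 means the
+        # per-position cleavage score contributes to the loss at that position.
+        total = n_flank_len + peptide_len + c_flank_len
+        target = numpy.zeros(total)
+        mask = numpy.zeros(total)
+        mask[n_flank_len] = 1.0  # position at the peptide N terminus
+        if is_hit:
+            # Penalize across the whole peptide: cleavage at the terminus (1),
+            # no cleavage at interior positions (0).
+            mask[n_flank_len:n_flank_len + peptide_len] = 1.0
+            target[n_flank_len] = 1.0
+        return target, mask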
+
+Issue:
+- it'll learn mass spec biases in the peptide
+
+Possible fix:
+- Also provide the amino acid counts of the peptide as auxiliary inputs. After
+training, set the cysteine value to 0.
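+
+For example (a sketch; the names here are illustrative, not part of this
+module):
+
+    import collections
+    import numpy
+
+    AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"
+
+    def peptide_aa_counts(peptide):
+        # 20-vector of per-amino-acid counts, fed in as an auxiliary input.
+        counts = collections.Counter(peptide)
+        return numpy.array(
+            [counts.get(aa, 0) for aa in AMINO_ACIDS], dtype=float)
+
+    # After training, zero the cysteine entry so the mass spec bias against
+    # cysteine-containing peptides does not carry into predictions:
+    #     aa_counts[AMINO_ACIDS.index("C")] = 0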
+
+Architecture (for N-terminal cleavage; for C-terminal, reverse the sequences):
+
+input of length S=25 [flank + left-aligned peptide]
+*** conv [vector of length S] ***
+*** [more convs and local pools] ***
+*** output [vector of length S] ***
+*** extract: position 10 and max of peptide positions [2-vector] ***
+*** concat: [position 10, max of peptide positions, number of A (alanine),
+    ..., number of Y (tyrosine) in peptide] ***
+*** single dense node, sigmoid activation [1-vector] ***
+
+Train on monoallelic.
+
+Decoys are length-matched to hits and sampled from the same transcripts,
+selecting unobserved peptides with predicted affinity as close as possible to
+that of the hits.
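+
+A rough sketch of the matching step (illustrative; assumes a candidates
+DataFrame of unobserved peptides with a "predicted_affinity" column):
+
+    def pick_decoy(hit_affinity, candidates_df):
+        # Choose the candidate whose predicted affinity is closest to the hit's.
+        deltas = (candidates_df.predicted_affinity - hit_affinity).abs()
+        return candidates_df.loc[deltas.idxmin()]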
+
+
+Alternative sketch (full-length 35-mer input):
+
+*** conv ***
+*** conv ***
+*** ... conv n ***
+*** repeat vector for each position ***
+*** dense per-position ***
+*** output [35-vector] ***
+*** extract: position 10 and max of peptide positions [2-vector] ***
+*** dense ***
+*** output ***
+
+
+IDEA 2:
+
+- Two inputs: N-flank + peptide (left aligned), peptide (right aligned) + C-flank
+- Bunch of convolutions
+- Global max pooling
+- Dense
+
+
+"""
+
+from __future__ import print_function
+
+import time
+import collections
+from six import string_types
+
+import numpy
+import pandas
+import mhcnames
+import hashlib
+
+from .hyperparameters import HyperparameterDefaults
+from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SIZE
+from .encodable_sequences import EncodableSequences
+from .regression_target import from_ic50, to_ic50
+from .random_negative_peptides import RandomNegativePeptides
+from .allele_encoding import MultipleAlleleEncoding, AlleleEncoding
+from .auxiliary_input import AuxiliaryInputEncoder
+from .batch_generator import BatchGenerator
+from .custom_loss import (
+    MSEWithInequalities,
+    TransformPredictionsLossWrapper,
+    MultiallelicMassSpecLoss)
+
+
+class Class1CleavageNeuralNetwork(object):
+    network_hyperparameter_defaults = HyperparameterDefaults(
+        amino_acid_encoding="BLOSUM62",
+        peptide_max_length=15,
+        n_flank_length=10,
+        c_flank_length=10,
+        vector_encoding_name="BLOSUM62",
+    )
+    """
+    Hyperparameters (and their default values) that affect the neural network
+    architecture.
+    """
+
+    fit_hyperparameter_defaults = HyperparameterDefaults(
+        max_epochs=500,
+        validation_split=0.1,
+        early_stopping=True
+    )
+    """
+    Hyperparameters for neural network training.
+    """
+
+    early_stopping_hyperparameter_defaults = HyperparameterDefaults(
+        patience=5,
+        min_delta=0.0,
+    )
+    """
+    Hyperparameters for early stopping.
+    """
+
+    compile_hyperparameter_defaults = HyperparameterDefaults(
+        optimizer="rmsprop",
+        learning_rate=None,
+    )
+    """
+    Loss and optimizer hyperparameters. Any values supported by keras may be
+    used.
+    """
+
+    auxiliary_input_hyperparameter_defaults = HyperparameterDefaults(
+    )
+    """
+    Allele feature hyperparameters.
+    """
+
+    hyperparameter_defaults = network_hyperparameter_defaults.extend(
+        fit_hyperparameter_defaults).extend(
+        early_stopping_hyperparameter_defaults).extend(
+        compile_hyperparameter_defaults).extend(
+        auxiliary_input_hyperparameter_defaults)
+
+    def __init__(self, **hyperparameters):
+        self.hyperparameters = self.hyperparameter_defaults.with_defaults(
+            hyperparameters)
+        self.network = None
+        self.fit_info = []
+
+    def fit(
+            self,
+            peptides,
+            n_flanks,
+            c_flanks,
+            targets,
+            sample_weights=None,
+            shuffle_permutation=None,
+            verbose=1,
+            progress_callback=None,
+            progress_preamble="",
+            progress_print_interval=5.0):
+        """
+
+
+        Parameters
+        ----------
+        peptides
+        n_flanks
+        c_flanks
+        targets : array of {0, 1} indicating hits (1) or decoys (0)
+
+        Returns
+        -------
+
+        """
+        import keras.backend as K
+
+        peptides = EncodableSequences.create(peptides)
+        n_flanks = EncodableSequences.create(n_flanks)
+        c_flanks = EncodableSequences.create(c_flanks)
+
+        x_list = self.peptides_and_flanking_to_network_input(
+            peptides, n_flanks, c_flanks)
+
+        # Shuffle
+        if shuffle_permutation is None:
+            shuffle_permutation = numpy.random.permutation(len(targets))
+        targets = numpy.asarray(targets, dtype="float64")[shuffle_permutation]
+        assert numpy.isnan(targets).sum() == 0, targets
+        if sample_weights is not None:
+            sample_weights = numpy.array(sample_weights)[shuffle_permutation]
+        x_list = [x[shuffle_permutation] for x in x_list]
+
+        fit_info = collections.defaultdict(list)
+
+        if self.network is None:
+            self.network = self.make_network(
+                **self.network_hyperparameter_defaults.subselect(
+                    self.hyperparameters))
+            if verbose > 0:
+                self.network.summary()
+
+        self.network.compile(
+            loss="binary_crossentropy",
+            optimizer=self.hyperparameters['optimizer'])
+
+        last_progress_print = None
+        min_val_loss_iteration = None
+        min_val_loss = None
+        start = time.time()
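+        # Train one epoch at a time (epochs=i + 1, initial_epoch=i) so that
+        # progress printing and early stopping on val_loss can be handled here.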
+        for i in range(self.hyperparameters['max_epochs']):
+            epoch_start = time.time()
+            fit_history = self.network.fit(
+                x_list,
+                targets,
+                validation_split=self.hyperparameters['validation_split'],
+                shuffle=True,
+                epochs=i + 1,
+                sample_weight=sample_weights,
+                initial_epoch=i,
+                verbose=verbose)
+            epoch_time = time.time() - epoch_start
+
+            for (key, value) in fit_history.history.items():
+                fit_info[key].extend(value)
+
+            # Print progress no more often than once every few seconds.
+            if progress_print_interval is not None and (
+                    not last_progress_print or (
+                        time.time() - last_progress_print
+                        > progress_print_interval)):
+                print((progress_preamble + " " +
+                       "Epoch %3d / %3d [%0.2f sec]: loss=%g. "
+                       "Min val loss (%s) at epoch %s" % (
+                           i,
+                           self.hyperparameters['max_epochs'],
+                           epoch_time,
+                           fit_info['loss'][-1],
+                           str(min_val_loss),
+                           min_val_loss_iteration)).strip())
+                last_progress_print = time.time()
+
+            if self.hyperparameters['validation_split']:
+                val_loss = fit_info['val_loss'][-1]
+
+                if min_val_loss is None or (
+                        val_loss < min_val_loss - self.hyperparameters['min_delta']):
+                    min_val_loss = val_loss
+                    min_val_loss_iteration = i
+
+                if self.hyperparameters['early_stopping']:
+                    threshold = (
+                        min_val_loss_iteration +
+                        self.hyperparameters['patience'])
+                    if i > threshold:
+                        if progress_print_interval is not None:
+                            print((progress_preamble + " " +
+                                "Stopping at epoch %3d / %3d: loss=%g. "
+                                "Min val loss (%g) at epoch %s" % (
+                                    i,
+                                    self.hyperparameters['max_epochs'],
+                                    fit_info['loss'][-1],
+                                    (
+                                        min_val_loss if min_val_loss is not None
+                                        else numpy.nan),
+                                    min_val_loss_iteration)).strip())
+                        break
+
+            if progress_callback:
+                progress_callback()
+
+        fit_info["time"] = time.time() - start
+        fit_info["num_points"] = len(peptides)
+        self.fit_info.append(dict(fit_info))
+
+    def predict(
+            self,
+            peptides,
+            n_flanks,
+            c_flanks,
+            batch_size=DEFAULT_PREDICT_BATCH_SIZE):
+        """
+        """
+        x_list = self.peptides_and_flanking_to_network_input(
+            peptides, n_flanks, c_flanks)
+        raw_predictions = self.network.predict(
+            x_list, batch_size=batch_size)
+        predictions = numpy.array(raw_predictions, dtype="float64")
+        return predictions
+
+    def peptides_and_flanking_to_network_input(self, peptides, n_flanks, c_flanks):
+        """
+        Encode peptides and flanking sequences to the fixed-length encodings
+        expected by the neural network (which depend on the architecture).
+
+        Parameters
+        ----------
+        peptides : EncodableSequences or list of string
+        n_flanks : EncodableSequences or list of string
+        c_flanks : EncodableSequences or list of string
+
+        Returns
+        -------
+        list of numpy.array
+        """
+        peptides = EncodableSequences.create(peptides)
+        n_flanks = EncodableSequences.create(n_flanks)
+        c_flanks = EncodableSequences.create(c_flanks)
+
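+        # The peptide is encoded twice: left-aligned (right-padded), to be
+        # paired with the N-flank, and right-aligned (left-padded), to be
+        # paired with the C-flank, so each sub-network sees the relevant
+        # peptide terminus at a fixed offset.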
+        peptide_encoded1 = peptides.variable_length_to_fixed_length_vector_encoding(
+            vector_encoding_name=self.hyperparameters['vector_encoding_name'],
+            max_length=self.hyperparameters['peptide_max_length'],
+            alignment_method='right_pad')
+        peptide_encoded2 = peptides.variable_length_to_fixed_length_vector_encoding(
+            vector_encoding_name=self.hyperparameters['vector_encoding_name'],
+            max_length=self.hyperparameters['peptide_max_length'],
+            alignment_method='left_pad')
+        n_flanks_encoded = n_flanks.variable_length_to_fixed_length_vector_encoding(
+            vector_encoding_name=self.hyperparameters['vector_encoding_name'],
+            max_length=self.hyperparameters['n_flank_length'],
+            alignment_method='right_pad')
+        c_flanks_encoded = c_flanks.variable_length_to_fixed_length_vector_encoding(
+            vector_encoding_name=self.hyperparameters['vector_encoding_name'],
+            max_length=self.hyperparameters['c_flank_length'],
+            alignment_method='left_pad')
+
+        return [
+            peptide_encoded1,
+            peptide_encoded2,
+            n_flanks_encoded,
+            c_flanks_encoded
+        ]
+
+    def make_network(
+            self,
+            amino_acid_encoding,
+            peptide_max_length,
+            n_flank_length,
+            c_flank_length,
+            vector_encoding_name):
+        """
+        Helper function to make a keras network
+        """
+
+        # We import keras here to avoid tensorflow debug output, etc. unless we
+        # are actually about to use Keras.
+
+        from keras.layers import Input
+        import keras.layers
+        import keras.layers.pooling
+        from keras.layers.core import Dense, Flatten, Dropout
+        from keras.layers.embeddings import Embedding
+        from keras.layers.normalization import BatchNormalization
+        from keras.layers.merge import Concatenate
+        import keras.backend as K
+
+        (peptides_empty, _, n_flanks_empty, c_flanks_empty) = (
+            self.peptides_and_flanking_to_network_input(
+                peptides=[],
+                n_flanks=[],
+                c_flanks=[]))
+
+        peptide_input1 = Input(
+            shape=peptides_empty.shape[1:],
+            dtype='float32',
+            name='peptide1')
+        peptide_input2 = Input(
+            shape=peptides_empty.shape[1:],
+            dtype='float32',
+            name='peptide2')
+        n_flank_input = Input(
+            shape=n_flanks_empty.shape[1:],
+            dtype='float32',
+            name='n_flank')
+        c_flank_input = Input(
+            shape=c_flanks_empty.shape[1:],
+            dtype='float32',
+            name='c_flank')
+
+        inputs = [peptide_input1, peptide_input2, n_flank_input, c_flank_input]
+
+        sub_networks = []
+        for input_pair in [(n_flank_input, peptide_input1), (peptide_input2, c_flank_input)]:
+            # Concatenate the flank and peptide encodings along the sequence
+            # (length) dimension.
+            current_layer = Concatenate(axis=1)(list(input_pair))
+            for i in range(1):
+                current_layer = keras.layers.Conv1D(
+                    filters=int(16 / 2**i),
+                    kernel_size=8,
+                    activation="tanh")(current_layer)
+                #current_layer = keras.layers.pooling.MaxPooling1D(
+                #    pool_size=4)(current_layer)
+            #current_layer = Flatten()(current_layer)
+            sub_networks.append(current_layer)
+
+        extracted_layers = []
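+        # N-terminal sub-network: take the conv output at index n_flank_length
+        # (the window starting at the peptide's first residue) plus the max
+        # over all downstream positions within the peptide region.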
+        extracted_layers.append(
+            keras.layers.Lambda(lambda x: x[:, n_flank_length])(sub_networks[0]))
+        peptide_n_cleavage = keras.layers.Lambda(
+            lambda x: x[
+                :, (n_flank_length + 1) :
+            ])(sub_networks[0])
+        extracted_layers.append(
+            keras.layers.pooling.GlobalMaxPooling1D()(peptide_n_cleavage))
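+        # C-terminal sub-network: take the conv output at index
+        # peptide_max_length (the window starting at the first C-flank
+        # residue) plus the max over windows starting within the peptide.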
+        extracted_layers.append(
+            keras.layers.Lambda(
+                lambda x: x[:, peptide_max_length])(sub_networks[1]))
+        peptide_c_cleavage = keras.layers.Lambda(
+            lambda x: x[
+                :, 0 : peptide_max_length
+            ])(sub_networks[1])
+        extracted_layers.append(
+            keras.layers.pooling.GlobalMaxPooling1D()(peptide_c_cleavage))
+
+        current_layer = Concatenate()(extracted_layers)
+
+        output = Dense(
+            1,
+            activation="sigmoid",
+            name="output")(current_layer)
+        model = keras.models.Model(
+            inputs=inputs,
+            outputs=[output],
+            name="predictor")
+
+        return model
+
diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py
index 19696e23..9cd04e0d 100644
--- a/mhcflurry/encodable_sequences.py
+++ b/mhcflurry/encodable_sequences.py
@@ -382,7 +382,48 @@ class EncodableSequences(object):
                     sub_df.index,
                     center_left_offset : center_left_offset + length
                 ] = fixed_length_sequences
+        elif alignment_method in ("right_pad", "left_pad"):
+            # We arbitrarily set a minimum length of 5, although this encoding
+            # could handle smaller peptides.
+            min_length = 5
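+
+            # For example, with max_length=8 ("X" is the null / padding
+            # character):
+            #     right_pad: "SIINF" -> "SIINFXXX"  (left aligned)
+            #     left_pad:  "SIINF" -> "XXXSIINF"  (right aligned)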
+
+            # Result array is int32, filled with X (null amino acid) value.
+            result = numpy.full(
+                fill_value=amino_acid.AMINO_ACID_INDEX['X'],
+                shape=(len(sequences), max_length),
+                dtype="int32")
+
+            df = pandas.DataFrame({"peptide": sequences}, dtype=numpy.object_)
+
+            # For efficiency we handle each supported peptide length using bulk
+            # array operations.
+            for (length, sub_df) in df.groupby(df.peptide.str.len()):
+                if length < min_length or length > max_length:
+                    raise EncodingError(
+                        "Sequence '%s' (length %d) unsupported. There are %d "
+                        "total peptides with this length." % (
+                            sub_df.iloc[0].peptide,
+                            length,
+                            len(sub_df)), supported_peptide_lengths=(
+                                min_length, max_length))
+
+                # Array of shape (num peptides, length) giving fixed-length
+                # amino acid encoding each peptide of the current length.
+                fixed_length_sequences = numpy.stack(sub_df.peptide.map(
+                    lambda s: numpy.array([
+                        amino_acid.AMINO_ACID_INDEX[char] for char in s
+                    ])).values)
+
+                if alignment_method == "right_pad":
+                    # Left align (i.e. pad right): set left edge
+                    result[sub_df.index, :length] = fixed_length_sequences
+                else:
+                    # Right align: set right edge.
+                    result[sub_df.index, -length:] = fixed_length_sequences
+
         else:
             raise NotImplementedError(
                 "Unsupported alignment method: %s" % alignment_method)
+
+
         return result
diff --git a/test/test_class1_cleavage_neural_network.py b/test/test_class1_cleavage_neural_network.py
new file mode 100644
index 00000000..051a48c3
--- /dev/null
+++ b/test/test_class1_cleavage_neural_network.py
@@ -0,0 +1,65 @@
+import logging
+logging.getLogger('tensorflow').disabled = True
+logging.getLogger('matplotlib').disabled = True
+
+import numpy
+from numpy import testing
+numpy.random.seed(0)
+from tensorflow import set_random_seed
+set_random_seed(2)
+
+from sklearn.metrics import roc_auc_score
+
+from nose.tools import eq_, assert_less, assert_greater, assert_almost_equal
+
+import pandas
+
+from mhcflurry.class1_cleavage_neural_network import Class1CleavageNeuralNetwork
+from mhcflurry.common import random_peptides
+
+from mhcflurry.testing_utils import cleanup, startup
+teardown = cleanup
+setup = startup
+
+
+def test_basic():
+    hyperparameters = dict()
+
+    num = 10000
+
+    df = pandas.DataFrame({
+        "n_flank": random_peptides(num, 10),
+        "c_flank": random_peptides(num, 10),
+        "peptide": random_peptides(num, 9),
+    })
+    df["hit"] = df.peptide.str.get(0).isin(["A", "I", "L"])
+
+    train_df = df.sample(frac=0.1).copy()
+    test_df = df.loc[~df.index.isin(train_df.index)].copy()
+
+    print(
+        "Generated dataset",
+        len(df),
+        "hits: ",
+        df.hit.sum(),
+        "frac:",
+        df.hit.mean())
+
+    network = Class1CleavageNeuralNetwork(**hyperparameters)
+    network.fit(
+        train_df.peptide.values,
+        train_df.n_flank.values,
+        train_df.c_flank.values,
+        train_df.hit.values)
+
+    for sub_df in [train_df, test_df]:
+        sub_df["predictions"] = network.predict(
+            sub_df.peptide.values,
+            sub_df.n_flank.values,
+            sub_df.c_flank.values)
+
+    train_auc = roc_auc_score(train_df.hit.values, train_df.predictions.values)
+    test_auc = roc_auc_score(test_df.hit.values, test_df.predictions.values)
+
+    print("Train auc", train_auc)
+    print("Test auc", test_auc)
-- 
GitLab