Skip to content
Snippets Groups Projects
Commit c9841e49 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

refactoring cleavage predictor

parent 6bcd1886
Branches
Tags
No related merge requests found
......@@ -85,7 +85,7 @@ import pandas
from .hyperparameters import HyperparameterDefaults
from .class1_neural_network import DEFAULT_PREDICT_BATCH_SIZE
from .encodable_sequences import EncodableSequences
from .flanking_encoding import FlankingEncoding
class Class1CleavageNeuralNetwork(object):
......@@ -94,12 +94,11 @@ class Class1CleavageNeuralNetwork(object):
peptide_max_length=15,
n_flank_length=10,
c_flank_length=10,
vector_encoding_name="BLOSUM62",
flanking_averages=False,
convolutional_filters=16,
convolutional_kernel_size=8,
convolutional_activation="relu",
convolutional_kernel_l1_l2=[0.001, 0.001],
convolutional_activation="tanh",
convolutional_kernel_l1_l2=[0.0001, 0.0001],
dropout_rate=0.5,
post_convolutional_dense_layer_sizes=[],
)
......@@ -181,9 +180,7 @@ class Class1CleavageNeuralNetwork(object):
def fit(
self,
peptides,
n_flanks,
c_flanks,
sequences,
targets,
sample_weights=None,
shuffle_permutation=None,
......@@ -205,12 +202,7 @@ class Class1CleavageNeuralNetwork(object):
-------
"""
peptides = EncodableSequences.create(peptides)
n_flanks = EncodableSequences.create(n_flanks)
c_flanks = EncodableSequences.create(c_flanks)
x_dict = self.peptides_and_flanking_to_network_input(
peptides, n_flanks, c_flanks)
x_dict = self.network_input(sequences)
# Shuffle
if shuffle_permutation is None:
......@@ -301,7 +293,7 @@ class Class1CleavageNeuralNetwork(object):
progress_callback()
fit_info["time"] = time.time() - start
fit_info["num_points"] = len(peptides)
fit_info["num_points"] = len(sequences.dataframe)
self.fit_info.append(dict(fit_info))
if verbose:
......@@ -317,16 +309,23 @@ class Class1CleavageNeuralNetwork(object):
n_flanks,
c_flanks,
batch_size=DEFAULT_PREDICT_BATCH_SIZE):
sequences = FlankingEncoding(
peptides=peptides, n_flanks=n_flanks, c_flanks=c_flanks)
return self.predict_encoded(sequences=sequences, batch_size=batch_size)
def predict_encoded(
self,
sequences,
batch_size=DEFAULT_PREDICT_BATCH_SIZE):
"""
"""
x_list = self.peptides_and_flanking_to_network_input(
peptides, n_flanks, c_flanks)
x_dict = self.network_input(sequences)
raw_predictions = self.network().predict(
x_list, batch_size=batch_size)
predictions = numpy.array(raw_predictions, dtype="float64")[:,0]
x_dict, batch_size=batch_size)
predictions = numpy.squeeze(raw_predictions).astype("float64")
return predictions
def peptides_and_flanking_to_network_input(self, peptides, n_flanks, c_flanks):
def network_input(self, sequences):
"""
Encode peptides to the fixed-length encoding expected by the neural
network (which depends on the architecture).
......@@ -339,118 +338,25 @@ class Class1CleavageNeuralNetwork(object):
-------
numpy.array
"""
peptides = EncodableSequences.create(peptides)
peptides_df = pandas.DataFrame({
"peptide": peptides.sequences,
})
if len(peptides_df) > 0:
peptides_df["length"] = peptides_df.peptide.str.len()
else:
peptides_df["length"] = []
network_peptide_size = int(
self.hyperparameters['peptide_max_length'] +
numpy.ceil(
self.hyperparameters['convolutional_kernel_size'] / 2))
result = {}
result['peptide_right_pad'] = (
peptides.variable_length_to_fixed_length_vector_encoding(
vector_encoding_name=self.hyperparameters['vector_encoding_name'],
max_length=network_peptide_size,
alignment_method='right_pad'))
result['peptide_left_pad'] = (
peptides.variable_length_to_fixed_length_vector_encoding(
vector_encoding_name=self.hyperparameters['vector_encoding_name'],
max_length=network_peptide_size,
alignment_method='left_pad'))
flank_needed_to_fill_peptide = (
network_peptide_size - peptides_df.length.min())
if self.hyperparameters['n_flank_length'] > 0:
n_flanks = EncodableSequences.create(n_flanks)
n_flank_encoded = (
n_flanks.variable_length_to_fixed_length_vector_encoding(
vector_encoding_name=self.hyperparameters['vector_encoding_name'],
max_length=max(
self.hyperparameters['n_flank_length'],
flank_needed_to_fill_peptide),
alignment_method='left_pad',
trim=True,
allow_unsupported_amino_acids=True))
if n_flank_encoded.shape[1] == self.hyperparameters['n_flank_length']:
result['n_flank'] = n_flank_encoded
else:
result['n_flank'] = n_flank_encoded[
:,
(-self.hyperparameters['n_flank_length']):,
:
]
assert (
result['n_flank'].shape[1] ==
self.hyperparameters['n_flank_length'])
# Switch out X for N-terminal residues in peptide_left_pad.
for (length, sub_df) in peptides_df.groupby("length"):
num_to_set = network_peptide_size - length
result['peptide_left_pad'][
sub_df.index.values,
:num_to_set,
:
] = n_flank_encoded[
sub_df.index.values,
(-num_to_set):,
:
]
if self.hyperparameters['c_flank_length'] > 0:
c_flanks = EncodableSequences.create(c_flanks)
c_flank_encoded = (
c_flanks.variable_length_to_fixed_length_vector_encoding(
vector_encoding_name=self.hyperparameters['vector_encoding_name'],
max_length=max(
self.hyperparameters['n_flank_length'],
flank_needed_to_fill_peptide),
alignment_method='right_pad',
trim=True,
allow_unsupported_amino_acids=True))
if c_flank_encoded.shape[1] == self.hyperparameters['c_flank_length']:
result['c_flank'] = c_flank_encoded
else:
result['c_flank'] = c_flank_encoded[
:,
:self.hyperparameters['c_flank_length'],
:
]
assert (
result['c_flank'].shape[1] ==
self.hyperparameters['c_flank_length'])
# Switch out X for C-terminal residues in peptide_right_pad.
for (length, sub_df) in peptides_df.groupby("length"):
num_to_set = network_peptide_size - length
result['peptide_right_pad'][
sub_df.index.values,
length:,
:
] = result['c_flank'][
sub_df.index.values,
:num_to_set,
:
]
encoded = sequences.vector_encode(
self.hyperparameters['amino_acid_encoding'],
self.hyperparameters['peptide_max_length'],
n_flank_length=self.hyperparameters['n_flank_length'],
c_flank_length=self.hyperparameters['c_flank_length'],
allow_unsupported_amino_acids=True)
result = {
"sequence": encoded.array,
"peptide_length": encoded.peptide_lengths,
}
return result
def make_network(
self,
amino_acid_encoding,
peptide_max_length,
n_flank_length,
c_flank_length,
vector_encoding_name,
flanking_averages,
convolutional_filters,
convolutional_kernel_size,
......@@ -467,38 +373,136 @@ class Class1CleavageNeuralNetwork(object):
from keras.layers import Input
import keras.layers.pooling
import keras.initializers
from keras.layers.core import Dense, Flatten, Dropout
from keras.layers.merge import Concatenate
empty_x_dict = self.peptides_and_flanking_to_network_input(
peptides=[], n_flanks=[], c_flanks=[])
model_inputs = {}
model_inputs['peptide_right_pad'] = Input(
shape=empty_x_dict['peptide_right_pad'].shape[1:],
dtype='float32',
name='peptide_right_pad')
model_inputs['peptide_left_pad'] = Input(
shape=empty_x_dict['peptide_left_pad'].shape[1:],
dtype='float32',
name='peptide_left_pad')
empty_x_dict = self.network_input(FlankingEncoding([], [], []))
sequence_dims = empty_x_dict['sequence'].shape[1:]
if 'n_flank' in empty_x_dict:
model_inputs['n_flank'] = Input(
shape=empty_x_dict['n_flank'].shape[1:],
dtype='float32',
name='n_flank')
numpy.testing.assert_equal(
sequence_dims[0],
peptide_max_length + n_flank_length + c_flank_length)
if 'c_flank' in empty_x_dict:
model_inputs['c_flank'] = Input(
shape=empty_x_dict['c_flank'].shape[1:],
dtype='float32',
name='c_flank')
model_inputs['sequence'] = Input(
shape=sequence_dims,
dtype='float32',
name='sequence')
model_inputs['peptide_length'] = Input(
shape=(1,),
dtype='int32',
name='peptide_length')
current_layer = model_inputs['sequence']
current_layer = keras.layers.Conv1D(
filters=convolutional_filters,
kernel_size=convolutional_kernel_size,
kernel_regularizer=keras.regularizers.l1_l2(
*convolutional_kernel_l1_l2),
padding="same",
activation=convolutional_activation,
name="conv1")(current_layer)
if dropout_rate > 0:
current_layer = keras.layers.Dropout(
name="conv1_dropout",
rate=dropout_rate,
noise_shape=(
None, 1, int(current_layer.get_shape()[-1])))(
current_layer)
convolutional_result = current_layer
outputs_for_final_dense = []
for flank in ["n_flank", "c_flank"]:
current_layer = convolutional_result
for (i, size) in enumerate(
list(post_convolutional_dense_layer_sizes) + [1]):
current_layer = keras.layers.Conv1D(
name="%s_post_%d" % (flank, i),
filters=size,
kernel_size=1,
kernel_regularizer=keras.regularizers.l1_l2(
*convolutional_kernel_l1_l2),
activation=convolutional_activation)(current_layer)
single_output_result = current_layer
if flank == "n_flank":
def cleavage_extractor(x):
return x[:, n_flank_length]
single_output_at_cleavage_position = keras.layers.Lambda(
cleavage_extractor, name="%s_cleaved" % flank)(
single_output_result)
else:
assert flank == "c_flank"
def cleavage_extractor(lst):
import tensorflow as tf
(x, peptide_length) = lst
indexer = peptide_length + n_flank_length - 1
result = tf.squeeze(
tf.gather(x, indexer, batch_dims=1, axis=1),
-1)
return result
single_output_at_cleavage_position = keras.layers.Lambda(
cleavage_extractor, name="%s_cleaved" % flank)([
single_output_result,
model_inputs['peptide_length']
])
outputs_for_final_dense.append(single_output_at_cleavage_position)
"""
# Single output at cleavage position
single_output_at_cleavage_position = keras.layers.Lambda(
cleavage_position_extractor)(single_output_result)
outputs_for_final_dense.append(single_output_at_cleavage_position)
# Max of single-output at non-cleaved (peptide) positions.
non_cleaved_single_outputs = keras.layers.Lambda(
noncleaved_peptide_extractor, name="%s_noncleaved" % flank)(
single_output_result)
non_cleaved_pooled = keras.layers.pooling.GlobalMaxPooling1D(
name="%s_noncleaved_pooled" % flank)(non_cleaved_single_outputs)
# We flip it so that initializing the final dense layer weights to
# 1s is reasonable.
non_cleaved_pooled_flipped = keras.layers.Lambda(
lambda x: -x,
name="%s_noncleaved_pooled_flip" % flank)(non_cleaved_pooled)
outputs_for_final_dense.append(non_cleaved_pooled_flipped)
if include_flank and flanking_averages:
# Also include average pooled of flanking sequences
extracted_flank = keras.layers.Lambda(
flanking_extractor, name="%s_extracted" % flank)(
convolutional_result)
pooled_flank = keras.layers.pooling.GlobalAveragePooling1D(
name="%s_avg" % flank,
)(extracted_flank)
dense_flank = Dense(
1, activation="tanh", name="%s_avg_dense" % flank)(
pooled_flank)
outputs_for_final_dense.append(dense_flank)
include_flank = flank in model_inputs
if flank == "n_flank":
......@@ -587,6 +591,7 @@ class Class1CleavageNeuralNetwork(object):
1, activation="tanh", name="%s_avg_dense" % flank)(
pooled_flank)
outputs_for_final_dense.append(dense_flank)
"""
if len(outputs_for_final_dense) == 1:
(current_layer,) = outputs_for_final_dense
......
......@@ -17,7 +17,7 @@ import pandas
from .version import __version__
from .class1_neural_network import DEFAULT_PREDICT_BATCH_SIZE
from .encodable_sequences import EncodableSequences
from .flanking_encoding import FlankingEncoding
from .downloads import get_default_class1_cleavage_models_dir
from .class1_cleavage_neural_network import Class1CleavageNeuralNetwork
from .common import save_weights, load_weights, NumpyJSONEncoder
......@@ -119,6 +119,7 @@ class Class1CleavagePredictor(object):
n_flanks,
c_flanks,
batch_size=DEFAULT_PREDICT_BATCH_SIZE):
return self.predict_to_dataframe(
peptides=peptides,
n_flanks=n_flanks,
......@@ -132,34 +133,27 @@ class Class1CleavagePredictor(object):
c_flanks,
batch_size=DEFAULT_PREDICT_BATCH_SIZE):
for (name, value) in [
("peptides", peptides),
("n_flanks", n_flanks),
("c_flanks", c_flanks)]:
if isinstance(value, string_types):
raise TypeError(
"%s must be a list or array, not a string" % name)
sequences = FlankingEncoding(
peptides=peptides, n_flanks=n_flanks, c_flanks=c_flanks)
return self.predict_to_dataframe_encoded(
sequences=sequences, batch_size=batch_size)
peptides = EncodableSequences.create(peptides)
n_flanks = EncodableSequences.create(n_flanks)
c_flanks = EncodableSequences.create(c_flanks)
def predict_to_dataframe_encoded(
self, sequences, batch_size=DEFAULT_PREDICT_BATCH_SIZE):
score_array = []
for (i, network) in enumerate(self.models):
predictions = network.predict(
peptides=peptides,
n_flanks=n_flanks,
c_flanks=c_flanks,
batch_size=batch_size)
predictions = network.predict_encoded(
encoded, batch_size=batch_size)
score_array.append(predictions)
score_array = numpy.array(score_array)
result_df = pandas.DataFrame({
"peptide": peptides.sequences,
"n_flank": n_flanks.sequences,
"c_flank": c_flanks.sequences,
"peptide": encoded.dataframe.peptide,
"n_flank": encoded.dataframe.n_flank,
"c_flank": encoded.dataframe.c_flank,
"score": numpy.mean(score_array, axis=0),
})
return result_df
......
"""
Class for encoding variable-length flanking and peptides to
fixed-size numerical matrices
"""
from __future__ import (
print_function, division, absolute_import, )
from six import string_types
from collections import namedtuple
from .encodable_sequences import EncodingError, EncodableSequences
import numpy
import pandas
# Result of vector-encoding a batch: the encoded array plus the length of
# each peptide (needed downstream to locate the C-terminal cleavage site).
EncodingResult = namedtuple(
    "EncodingResult", ["array", "peptide_lengths"])


class FlankingEncoding(object):
    """
    Encode a batch of peptides together with their N- and C-terminal
    flanking sequences into fixed-size numerical matrices.

    The sequences are held in a pandas DataFrame; encodings are memoized
    per parameter set in ``encoding_cache`` so repeated calls to
    `vector_encode` with the same arguments are free.
    """
    # Placeholder residue used to pad flanks/peptides that are shorter
    # than the requested fixed lengths.
    unknown_character = "X"

    def __init__(self, peptides, n_flanks, c_flanks):
        """
        Parameters
        ----------
        peptides : list of string
        n_flanks : list of string
            N-terminal flanking sequence of each peptide.
        c_flanks : list of string
            C-terminal flanking sequence of each peptide.
        """
        self.dataframe = pandas.DataFrame({
            "peptide": peptides,
            "n_flank": n_flanks,
            "c_flank": c_flanks,
        }, dtype=str)
        self.encoding_cache = {}

    def __len__(self):
        """Number of (peptide, n_flank, c_flank) entries."""
        return len(self.dataframe)

    def vector_encode(
            self,
            vector_encoding_name,
            peptide_max_length,
            n_flank_length,
            c_flank_length,
            allow_unsupported_amino_acids=True):
        """
        Encode the variable-length sequences to a fixed-size matrix,
        caching the result.

        Amino acids are represented as specified by the
        vector_encoding_name argument.

        Parameters
        ----------
        vector_encoding_name : string
            How to represent amino acids. One of "BLOSUM62", "one-hot",
            etc. Full list of supported vector encodings is given by
            available_vector_encodings().
        peptide_max_length : int
            Maximum supported peptide length. Longer (or empty) peptides
            cause an EncodingError.
        n_flank_length : int
            Number of N-flank residues to include (0 to omit the N flank).
        c_flank_length : int
            Number of C-flank residues to include.
        allow_unsupported_amino_acids : bool
            If True, map unsupported residues to the "X" placeholder
            rather than raising.

        Returns
        -------
        EncodingResult namedtuple, where
        - array has shape (num sequences, n_flank_length +
          peptide_max_length + c_flank_length, m), m being the vector
          encoding length (usually 21);
        - peptide_lengths is a numpy array of the peptide lengths.
        """
        cache_key = (
            "vector_encode",
            vector_encoding_name,
            peptide_max_length,
            n_flank_length,
            c_flank_length,
            allow_unsupported_amino_acids)
        if cache_key not in self.encoding_cache:
            result = self.encode(
                vector_encoding_name=vector_encoding_name,
                df=self.dataframe,
                peptide_max_length=peptide_max_length,
                n_flank_length=n_flank_length,
                c_flank_length=c_flank_length,
                allow_unsupported_amino_acids=allow_unsupported_amino_acids)
            self.encoding_cache[cache_key] = result
        return self.encoding_cache[cache_key]

    @staticmethod
    def encode(
            vector_encoding_name,
            df,
            peptide_max_length,
            n_flank_length,
            c_flank_length,
            allow_unsupported_amino_acids=False):
        """
        Encode the peptides/flanks in `df` (uncached helper for
        `vector_encode`; see that method for parameter documentation).

        Raises
        ------
        EncodingError
            If any peptide is empty or longer than peptide_max_length.

        Returns
        -------
        EncodingResult namedtuple
        """
        error_df = df.loc[
            (df.peptide.str.len() > peptide_max_length) |
            (df.peptide.str.len() < 1)
        ]
        if len(error_df) > 0:
            raise EncodingError(
                "Sequence '%s' (length %d) unsupported. There are %d "
                "total peptides with this length." % (
                    error_df.iloc[0].peptide,
                    len(error_df.iloc[0].peptide),
                    len(error_df)))

        if n_flank_length > 0:
            # Left-pad with "X" to exactly n_flank_length, keeping the
            # residues adjacent to the peptide.
            n_flanks = df.n_flank.str.pad(
                n_flank_length,
                side="left",
                fillchar="X").str.slice(-n_flank_length).str.upper()
        else:
            # index=df.index keeps the string concatenation below aligned
            # when df has a non-default index (a bare Series would align
            # on position 0..n-1 and produce NaN entries).
            n_flanks = pandas.Series([""] * len(df), index=df.index)

        if c_flank_length > 0:
            # Right-pad with "X" to exactly c_flank_length.
            c_flanks = df.c_flank.str.pad(
                c_flank_length,
                side="right",
                fillchar="X").str.slice(0, c_flank_length).str.upper()
        else:
            c_flanks = pandas.Series([""] * len(df), index=df.index)

        peptides = df.peptide.str.upper()

        # Single fixed-layout string per entry: [n_flank][peptide][c_flank],
        # right-padded by the encoder up to the full network input width.
        concatenated = n_flanks + peptides + c_flanks

        encoder = EncodableSequences.create(concatenated.values)
        array = encoder.variable_length_to_fixed_length_vector_encoding(
            vector_encoding_name=vector_encoding_name,
            alignment_method="right_pad",
            max_length=n_flank_length + peptide_max_length + c_flank_length,
            allow_unsupported_amino_acids=allow_unsupported_amino_acids)

        result = EncodingResult(
            array, peptide_lengths=peptides.str.len().values)
        return result
......@@ -18,17 +18,35 @@ import pprint
from mhcflurry.class1_cleavage_neural_network import Class1CleavageNeuralNetwork
from mhcflurry.common import random_peptides
from mhcflurry.amino_acid import BLOSUM62_MATRIX
from mhcflurry.flanking_encoding import FlankingEncoding
from mhcflurry.testing_utils import cleanup, startup
teardown = cleanup
setup = startup
def Xtest_neural_network_input():
table = BLOSUM62_MATRIX.apply(
tuple).reset_index().set_index(0).to_dict()['index']
def decode_matrix(array):
    """
    Invert a BLOSUM62 vector encoding back to amino-acid strings.

    Expects an array of shape (num sequences, length, 21); each length-21
    row vector is looked up in the module-level `table` mapping to recover
    its residue. Returns a list of decoded strings, one per sequence.
    """
    num, length, dim = array.shape
    assert dim == 21
    return [
        "".join(table[tuple(vector)] for vector in encoded_sequence)
        for encoded_sequence in array
    ]
def test_neural_network_input():
model = Class1CleavageNeuralNetwork(
peptide_max_length=12,
n_flank_length=8,
c_flank_length=8)
c_flank_length=5)
tests = [
{
......@@ -38,25 +56,66 @@ def Xtest_neural_network_input():
"c_flank": "FGHKLCVNMQWE",
# Expected results
"peptide_left_pad": "",
"peptide_right_pad": "",
"c_flank": "",
"sequence": "TYIPSDFGSIINFEKLFGHKLXXXX",
},
{
# Input
"peptide": "QCV",
"n_flank": "QWERTYIPSDFG",
"c_flank": "FGHKLCVNMQWE",
# Expected results
"sequence": "TYIPSDFGQCVFGHKLXXXXXXXXX",
},
{
# Input
"peptide": "QCVSQCVSQCVS",
"n_flank": "QWE",
"c_flank": "MNV",
# Expected results
"sequence": "XXXXXQWEQCVSQCVSQCVSMNVXX",
},
{
# Input
"peptide": "QCVSQCVSQCVS",
"n_flank": "",
"c_flank": "MNV",
# Expected results
"sequence": "XXXXXXXXQCVSQCVSQCVSMNVXX",
},
{
# Input
"peptide": "QCVSQCVSQCVS",
"n_flank": "",
"c_flank": "",
}
# Expected results
"sequence": "XXXXXXXXQCVSQCVSQCVSXXXXX",
},
]
for (i, d) in enumerate(tests):
print("Test", i + 1)
pprint.pprint(d)
results = model.peptides_and_flanking_to_network_input(
encoding = FlankingEncoding(
peptides=[d['peptide']],
n_flanks=[d['n_flank']],
c_flanks=[d['n_flank']])
import ipdb ; ipdb.set_trace()
c_flanks=[d['c_flank']])
results = model.network_input(encoding)
(decoded,) = decode_matrix(results['sequence'])
numpy.testing.assert_equal(decoded, d['sequence'])
numpy.testing.assert_equal(results['peptide_length'], len(d['peptide']))
model.peptides_and_flanking_to_network_input()
# Test all at once
df = pandas.DataFrame(tests)
encoding = FlankingEncoding(df.peptide, df.n_flank, df.c_flank)
results = model.network_input(encoding)
df["decoded"] = decode_matrix(results['sequence'])
numpy.testing.assert_array_equal(df.decoded.values, df.sequence.values)
numpy.testing.assert_equal(
results['peptide_length'], df.peptide.str.len().values)
def Xtest_big():
......@@ -77,9 +136,29 @@ def Xtest_more():
post_convolutional_dense_layer_sizes=[8])
def train_basic_network(num, do_assertions=True, **hyperparameters):
def test_basic_indexing(num=10000, do_assertions=True, **hyperparameters):
    """
    Train small networks on synthetic "hit" definitions that depend only
    on the peptide's terminal residues: first residue alone, last residue
    alone, then both criteria combined.
    """
    def hit_on_first(n_flank, c_flank, peptide):
        return peptide[0] in "SIQVL"

    def hit_on_last(n_flank, c_flank, peptide):
        return peptide[-1] in "YIPASD"

    def hit_on_both(n_flank, c_flank, peptide):
        return peptide[0] in "SIQVL" and peptide[-1] in "YIPASD"

    shared_hypers = {
        "convolutional_kernel_size": 1,
        "flanking_averages": False,
    }

    # Order matters for reproducibility: single-criterion cases first.
    for predicate in (hit_on_first, hit_on_last, hit_on_both):
        train_basic_network(num=10000, is_hit=predicate, **shared_hypers)
def train_basic_network(num, do_assertions=True, is_hit=None, **hyperparameters):
use_hyperparameters = {
"max_epochs": 30,
"max_epochs": 100,
"peptide_max_length": 12,
"n_flank_length": 8,
"c_flank_length": 8,
......@@ -93,12 +172,13 @@ def train_basic_network(num, do_assertions=True, **hyperparameters):
"peptide": random_peptides(num / 2, 11) + random_peptides(num / 2, 8),
}).sample(frac=1.0)
n_cleavage_regex = "[AILQSV][SINFEKLH][MNPQYK]"
if is_hit is None:
n_cleavage_regex = "[AILQSV][SINFEKLH][MNPQYK]"
def is_hit(n_flank, c_flank, peptide):
if re.search(n_cleavage_regex, peptide):
return False # peptide is cleaved
return bool(re.match(n_cleavage_regex, n_flank[-1:] + peptide))
def is_hit(n_flank, c_flank, peptide):
if re.search(n_cleavage_regex, peptide):
return False # peptide is cleaved
return bool(re.match(n_cleavage_regex, n_flank[-1:] + peptide))
df["hit"] = [
is_hit(row.n_flank, row.c_flank, row.peptide)
......@@ -118,18 +198,24 @@ def train_basic_network(num, do_assertions=True, **hyperparameters):
network = Class1CleavageNeuralNetwork(**use_hyperparameters)
network.fit(
train_df.peptide.values,
train_df.n_flank.values,
train_df.c_flank.values,
train_df.hit.values,
sequences=FlankingEncoding(
peptides=train_df.peptide.values,
n_flanks=train_df.n_flank.values,
c_flanks=train_df.c_flank.values),
targets=train_df.hit.values,
verbose=0)
network.network().summary()
for df in [train_df, test_df]:
df["predictions"] = network.predict(
df.peptide.values,
df.n_flank.values,
df.c_flank.values)
print(df)
print(df.loc[df.hit])
train_auc = roc_auc_score(train_df.hit.values, train_df.predictions.values)
test_auc = roc_auc_score(test_df.hit.values, test_df.predictions.values)
......@@ -141,3 +227,4 @@ def train_basic_network(num, do_assertions=True, **hyperparameters):
assert_greater(test_auc, 0.85)
return network
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment