Skip to content
Snippets Groups Projects
Commit 3239e437 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

Add models_class1_experimental_variations1

parent f73e73b2
No related branches found
No related tags found
No related merge requests found
#!/bin/bash
# Generate the "models_class1" download: train MHC Class I allele-specific
# MHCflurry models from the curated training data, then package the models,
# the hyperparameters, this script, and the run log into a .tar.bz2 archive.
set -e
set -x

DOWNLOAD_NAME=models_class1
SCRATCH_DIR=/tmp/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")

# Start from a clean scratch directory for this download.
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"

# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)

# Log some environment info (for reproducibility of the archive contents).
date
pip freeze
git status

# Quote all path expansions: unquoted $VAR breaks on whitespace in paths.
cd "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir models
cp "$SCRIPT_DIR/hyperparameters.json" .

time mhcflurry-class1-train-allele-specific-models \
    --data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \
    --hyperparameters hyperparameters.json \
    --out-models-dir models \
    --min-measurements-per-allele 100

# Include a copy of this script in the archive for provenance.
cp "$SCRIPT_ABSOLUTE_PATH" .
bzip2 LOG.txt
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *

echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
# Experimental class I allele-specific models (ensemble)
This download contains MHC Class I allele-specific MHCflurry models trained
using a variety of experimental architectures. These were generated for a
publication and are not intended for production use.
\ No newline at end of file
[
{
"n_models": 8,
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"use_embedding": false,
"kmer_size": 15,
"batch_normalization": false,
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
},
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
32
],
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
"dense_layer_l1_regularization": 0.001,
"dropout_probability": 0.0
},
{
"n_models": 8,
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"use_embedding": false,
"kmer_size": 15,
"batch_normalization": false,
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
32
],
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
"dense_layer_l1_regularization": 0.001,
"dropout_probability": 0.0
},
{
"n_models": 8,
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"use_embedding": false,
"kmer_size": 15,
"batch_normalization": false,
"locally_connected_layers": [],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
32
],
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
"dense_layer_l1_regularization": 0.001,
"dropout_probability": 0.0
},
{
"n_models": 8,
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"use_embedding": false,
"kmer_size": 15,
"batch_normalization": false,
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
},
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
32
],
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
"dense_layer_l1_regularization": 0.0,
"dropout_probability": 0.0
},
{
"n_models": 8,
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"use_embedding": false,
"kmer_size": 15,
"batch_normalization": false,
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
},
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
64
],
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
"dense_layer_l1_regularization": 0.001,
"dropout_probability": 0.0
},
{
"n_models": 8,
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"use_embedding": false,
"kmer_size": 15,
"batch_normalization": false,
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
},
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
16
],
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
"dense_layer_l1_regularization": 0.001,
"dropout_probability": 0.0
}
]
......@@ -22,6 +22,15 @@ from ..common import random_peptides, amino_acid_distribution
class Class1NeuralNetwork(object):
"""
Low level class I predictor consisting of a single neural network.
Both single allele and pan-allele prediction are supported, but pan-allele
is in development and not yet well performing.
Users will generally use Class1AffinityPredictor, which gives a higher-level
interface and supports ensembles.
"""
weights_filename_extension = "npz"
network_hyperparameter_defaults = HyperparameterDefaults(
......@@ -85,6 +94,13 @@ class Class1NeuralNetwork(object):
self.fit_num_points = None
def get_config(self):
"""
serialize to a dict all attributes except model weights
Returns
-------
dict
"""
result = dict(self.__dict__)
del result['network']
result['network_json'] = self.network.to_json()
......@@ -92,6 +108,21 @@ class Class1NeuralNetwork(object):
@classmethod
def from_config(cls, config):
"""
deserialize from a dict returned by get_config().
The weights of the neural network are not restored by this function.
You must call `restore_weights` separately.
Parameters
----------
config : dict
Returns
-------
Class1NeuralNetwork
"""
config = dict(config)
instance = cls(**config.pop('hyperparameters'))
instance.network = keras.models.model_from_json(
......@@ -100,11 +131,28 @@ class Class1NeuralNetwork(object):
return instance
def __getstate__(self):
    """
    Serialize this predictor to a dict, including the model weights.
    Provides pickle support.

    Returns
    -------
    dict
    """
    state = self.get_config()
    state['network_weights'] = self.get_weights()
    return state
def __setstate__(self, state):
"""
deserialize from a dict. Model weights are included. For pickle support.
Parameters
----------
state : dict
"""
network_json = state.pop('network_json')
network_weights = state.pop('network_weights')
self.__dict__.update(state)
......@@ -112,12 +160,32 @@ class Class1NeuralNetwork(object):
self.set_weights(network_weights)
def save_weights(self, filename):
    """
    Write the network's weight arrays to the given file in numpy's ".npz"
    format, keyed "array_0", "array_1", ... in layer order.

    Parameters
    ----------
    filename : string
        Should end in ".npz".
    """
    arrays = self.network.get_weights()
    named_arrays = {
        "array_%d" % index: array
        for (index, array) in enumerate(arrays)
    }
    numpy.savez(filename, **named_arrays)
def restore_weights(self, filename):
"""
Restore model weights from the given filename, which should have been
created with `save_weights`.
Parameters
----------
filename : string
Should end in ".npz".
"""
loaded = numpy.load(filename)
weights = [
loaded["array_%d" % i]
......@@ -127,6 +195,18 @@ class Class1NeuralNetwork(object):
self.network.set_weights(weights)
def peptides_to_network_input(self, peptides):
"""
Encode peptides to the fixed-length encoding expected by the neural
network (which depends on the architecture).
Parameters
----------
peptides : EncodableSequences or list of string
Returns
-------
numpy.array
"""
encoder = EncodableSequences.create(peptides)
if self.hyperparameters['use_embedding']:
encoded = encoder.variable_length_to_fixed_length_categorical(
......@@ -142,6 +222,18 @@ class Class1NeuralNetwork(object):
return encoded
def pseudosequence_to_network_input(self, pseudosequences):
"""
Encode pseudosequences to the fixed-length encoding expected by the neural
network (which depends on the architecture).
Parameters
----------
pseudosequences : EncodableSequences or list of string
Returns
-------
numpy.array
"""
encoder = EncodableSequences.create(pseudosequences)
if self.hyperparameters['pseudosequence_use_embedding']:
encoded = encoder.fixed_length_categorical()
......@@ -157,6 +249,26 @@ class Class1NeuralNetwork(object):
allele_pseudosequences=None,
sample_weights=None,
verbose=1):
"""
Fit the neural network.
Parameters
----------
peptides : EncodableSequences or list of string
affinities : list of float
allele_pseudosequences : EncodableSequences or list of string, optional
If not specified, the model will be a single-allele predictor.
sample_weights : list of float, optional
If not specified, all samples (including random negatives added
during training) will have equal weight. If specified, the random
negatives will be assigned weight=1.0.
verbose : int
Keras verbosity level
"""
self.fit_num_points = len(peptides)
......@@ -294,6 +406,17 @@ class Class1NeuralNetwork(object):
self.fit_seconds = time.time() - start
def predict(self, peptides, allele_pseudosequences=None):
"""
Parameters
----------
peptides
allele_pseudosequences
Returns
-------
"""
x_dict = {
'peptide': self.peptides_to_network_input(peptides)
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment