Unverified commit 18d0bd99 authored by Tim O'Donnell, committed by GitHub

Merge pull request #145 from openvax/v1.3

pan allele prediction (MHCflurry 1.3.0)
parents 74b751e6 3cbdfd69
Showing changed files with 2642 additions and 522 deletions
"""
Generate grid of hyperparameters
"""
from sys import stdout
from copy import deepcopy
from yaml import dump
base_hyperparameters = {
'activation': 'tanh',
'allele_dense_layer_sizes': [],
'batch_normalization': False,
'dense_layer_l1_regularization': 0.0,
'dense_layer_l2_regularization': 0.0,
'dropout_probability': 0.5,
'early_stopping': True,
'init': 'glorot_uniform',
'layer_sizes': [1024, 512],
'learning_rate': 0.001,
'locally_connected_layers': [],
'loss': 'custom:mse_with_inequalities',
'max_epochs': 5000,
'minibatch_size': 128,
'optimizer': 'rmsprop',
'output_activation': 'sigmoid',
"patience": 20,
"min_delta": 0.0,
'peptide_encoding': {
'vector_encoding_name': 'BLOSUM62',
'alignment_method': 'left_pad_centered_right_pad',
'max_length': 15,
},
'peptide_allele_merge_activation': '',
'peptide_allele_merge_method': 'concatenate',
'peptide_amino_acid_encoding': 'BLOSUM62',
'peptide_dense_layer_sizes': [],
'random_negative_affinity_max': 50000.0,
'random_negative_affinity_min': 20000.0,
'random_negative_constant': 1500,
'random_negative_distribution_smoothing': 0.0,
'random_negative_match_distribution': True,
'random_negative_rate': 0.2,
'train_data': {
'pretrain': True,
'pretrain_peptides_per_epoch': 64,
'pretrain_steps_per_epoch': 256,
'pretrain_patience': 2,
'pretrain_min_delta': 0.0001,
'pretrain_max_val_loss': 0.10,
'pretrain_max_epochs': 50,
'pretrain_min_epochs': 5,
},
'validation_split': 0.1,
'data_dependent_initialization_method': "lsuv",
}
grid = []
for layer_sizes in [[512, 256], [512, 512], [1024, 512], [1024, 1024]]:
for pretrain in [True]:
l1_base = 0.0000001
for l1 in [l1_base, l1_base / 10, l1_base / 100, l1_base / 1000, 0.0]:
for lr in [0.001, 0.01]:
new = deepcopy(base_hyperparameters)
new["layer_sizes"] = layer_sizes
new["dense_layer_l1_regularization"] = l1
new["train_data"]["pretrain"] = pretrain
new["learning_rate"] = lr
if not grid or new not in grid:
grid.append(new)
dump(grid, stdout)
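# A minimal usage sketch (the script filename is an assumption; this file is not
# named in the diff). Redirect stdout to capture the YAML grid:
#
#   python generate_hyperparameters.py > hyperparameters.yaml
#
# The grid enumerates 4 layer-size choices x 5 L1 penalties x 2 learning rates,
# i.e. 40 hyperparameter dicts, each a copy of base_hyperparameters with
# layer_sizes, dense_layer_l1_regularization and learning_rate overridden.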
#!/bin/bash
#
# Generate predictions for random peptides. Used for pre-training some models.
#
set -e
set -x
DOWNLOAD_NAME=random_peptide_predictions
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_DIR/random_predictions.py .
cp $SCRIPT_ABSOLUTE_PATH .
time python random_predictions.py \
--num-peptides 5000000 \
--models "$(mhcflurry-downloads path models_class1_selected_no_mass_spec)/models" \
--out predictions.csv
bzip2 predictions.csv
bzip2 LOG.txt
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
"""
Generate predictions for random peptides.
"""
from __future__ import print_function
import sys
import argparse
import time
import math
import pandas
import mhcflurry
from mhcflurry.common import random_peptides
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument("--models", required=True)
parser.add_argument("--num-peptides", type=int)
parser.add_argument("--out", required=True)
parser.add_argument("--chunksize", type=int, default=10000)
def run():
args = parser.parse_args(sys.argv[1:])
print(args)
predictor = mhcflurry.Class1AffinityPredictor.load(args.models)
alleles = pandas.Series(predictor.supported_alleles)
# Clear the file
pandas.DataFrame(columns=alleles).to_csv(args.out, index=True)
(min_length, max_length) = predictor.supported_peptide_lengths
peptides_per_length = int(
math.ceil(args.chunksize / (max_length - min_length)))
peptides_written = 0
i = 0
while peptides_written < args.num_peptides:
print("Chunk %d / %d" % (
i + 1, math.ceil(args.num_peptides / args.chunksize)))
start = time.time()
peptides = []
for l in range(8, 16):
peptides.extend(random_peptides(peptides_per_length, length=l))
peptides = pandas.Series(peptides).sample(
n=min(args.chunksize, args.num_peptides - peptides_written)).values
encodable_peptides = mhcflurry.encodable_sequences.EncodableSequences.create(
peptides)
df = pandas.DataFrame(index=peptides)
for allele in alleles:
df[allele] = predictor.predict(encodable_peptides, allele=allele)
df.to_csv(
args.out, index=True, mode='a', header=False, float_format='%.1f')
print("Wrote: %s [%0.2f sec]" % (args.out, time.time() - start))
i += 1
peptides_written += len(peptides)
print("Done.")
if __name__ == '__main__':
run()
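# Example invocation (a sketch; the models path mirrors the shell script above and
# the peptide count is reduced for illustration):
#
#   python random_predictions.py \
#       --num-peptides 100000 \
#       --models "$(mhcflurry-downloads path models_class1_selected_no_mass_spec)/models" \
#       --out predictions.csv
#
# The CSV grows chunk by chunk: the index column holds the peptides and there is
# one column of predicted affinities (nM) per supported allele.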
"""
Class I MHC ligand prediction package
"""
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_neural_network import Class1NeuralNetwork
from .version import __version__
......
import numpy
import pandas
from .encodable_sequences import EncodableSequences
from . import amino_acid
class AlleleEncoding(object):
def __init__(
self,
alleles,
allele_to_fixed_length_sequence=None):
def __init__(self, alleles=None, allele_to_sequence=None, borrow_from=None):
"""
A place to cache encodings for a (potentially large) sequence of alleles.
A place to cache encodings for a sequence of alleles.
We frequently work with alleles by integer indices, for example as
inputs to neural networks. This class is used to map allele names to
integer indices in a consistent way by keeping track of the universe
of alleles under use, i.e. a distinction is made between the universe
of supported alleles (what's in `allele_to_sequence`) and the actual
set of alleles used (what's in `alleles`).
Parameters
----------
alleles : list of string
Allele names
allele_to_fixed_length_sequence : dict of str -> str
Allele name to fixed lengths sequence ("pseudosequence")
allele_to_sequence : dict of str -> str
Allele name to amino acid sequence
borrow_from : AlleleEncoding, optional
If specified, do not specify allele_to_sequence. The sequences from
the provided instance are used. This guarantees that the mappings
from allele to index and from allele to sequence are the same
between the instances.
"""
alleles = pandas.Series(alleles)
if alleles is not None:
alleles = pandas.Series(alleles)
self.borrow_from = borrow_from
self.allele_to_sequence = allele_to_sequence
all_alleles = list(sorted(alleles.unique()))
if self.borrow_from is None:
assert allele_to_sequence is not None
all_alleles = (
sorted(allele_to_sequence))
self.allele_to_index = dict(
(allele, i)
for (i, allele) in enumerate(all_alleles))
unpadded = pandas.Series(
[allele_to_sequence[a] for a in all_alleles],
index=all_alleles)
self.sequences = unpadded.str.pad(
unpadded.str.len().max(), fillchar="X")
else:
assert allele_to_sequence is None
self.allele_to_index = borrow_from.allele_to_index
self.sequences = borrow_from.sequences
self.allele_to_sequence = borrow_from.allele_to_sequence
self.allele_to_index = dict(
(allele, i)
for (i, allele) in enumerate(all_alleles))
if alleles is not None:
assert all(
allele in self.allele_to_index for allele in alleles),\
"Missing alleles: " + " ".join(set(
a for a in alleles if a not in self.allele_to_index))
self.indices = alleles.map(self.allele_to_index)
assert not self.indices.isnull().any()
self.alleles = alleles
else:
self.indices = None
self.alleles = None
self.indices = alleles.map(self.allele_to_index)
self.encoding_cache = {}
self.fixed_length_sequences = pandas.Series(
[allele_to_fixed_length_sequence[a] for a in all_alleles],
index=all_alleles)
def compact(self):
"""
Return a new AlleleEncoding in which the universe of supported alleles
is only the alleles actually used.
self.encoding_cache = {}
Returns
-------
AlleleEncoding
"""
return AlleleEncoding(
alleles=self.alleles,
allele_to_sequence=dict(
(allele, self.allele_to_sequence[allele])
for allele in self.alleles.unique()))
def fixed_length_vector_encoded_sequences(self, vector_encoding_name):
def allele_representations(self, encoding_name):
"""
Encode alleles.
Encode the universe of supported allele sequences to a matrix.
Parameters
----------
vector_encoding_name : string
How to represent amino acids.
One of "BLOSUM62", "one-hot", etc. Full list of supported vector
encodings is given by available_vector_encodings() in amino_acid.
encoding_name : string
How to represent amino acids. Valid names are "BLOSUM62" or
"one-hot". See `amino_acid.ENCODING_DATA_FRAMES`.
Returns
-------
numpy.array with shape (num sequences, sequence length, m) where m is
vector_encoding_length(vector_encoding_name)
numpy.array of shape
(num alleles in universe, sequence length, vector size)
where vector size is usually 21 (20 amino acids + X character)
"""
if self.borrow_from is not None:
return self.borrow_from.allele_representations(encoding_name)
cache_key = (
"fixed_length_vector_encoding",
vector_encoding_name)
"allele_representations",
encoding_name)
if cache_key not in self.encoding_cache:
index_encoded_matrix = amino_acid.index_encoding(
self.fixed_length_sequences.values,
self.sequences.values,
amino_acid.AMINO_ACID_INDEX)
vector_encoded = amino_acid.fixed_vectors_encoding(
index_encoded_matrix,
amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name])
result = vector_encoded[self.indices]
self.encoding_cache[cache_key] = result
amino_acid.ENCODING_DATA_FRAMES[encoding_name])
self.encoding_cache[cache_key] = vector_encoded
return self.encoding_cache[cache_key]
def fixed_length_vector_encoded_sequences(self, encoding_name):
"""
Encode allele sequences (not the universe of alleles) to a matrix.
Parameters
----------
encoding_name : string
How to represent amino acids. Valid names are "BLOSUM62" or
"one-hot". See `amino_acid.ENCODING_DATA_FRAMES`.
Returns
-------
numpy.array with shape:
(num alleles, sequence length, vector size)
where vector size is usually 21 (20 amino acids + X character)
"""
cache_key = (
"fixed_length_vector_encoding",
encoding_name)
if cache_key not in self.encoding_cache:
vector_encoded = self.allele_representations(encoding_name)
result = vector_encoded[self.indices]
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
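# Usage sketch (illustration only; the allele "sequences" below are made-up
# placeholders, not real MHC pseudosequences):
#
#   allele_to_sequence = {
#       "HLA-A*02:01": "AAQQWWKKYY",
#       "HLA-B*07:02": "AAQQWWKKYF",
#   }
#   encoding = AlleleEncoding(
#       alleles=["HLA-A*02:01", "HLA-A*02:01", "HLA-B*07:02"],
#       allele_to_sequence=allele_to_sequence)
#   encoding.allele_representations("BLOSUM62").shape
#   # -> (2, 10, 21): one row per allele in the universe (allele_to_sequence)
#   encoding.fixed_length_vector_encoded_sequences("BLOSUM62").shape
#   # -> (3, 10, 21): one row per entry in `alleles`
#
#   # borrow_from ties a second encoding to the same universe, guaranteeing
#   # consistent allele-to-index and allele-to-sequence mappings:
#   other = AlleleEncoding(alleles=["HLA-B*07:02"], borrow_from=encoding)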
......@@ -68,7 +68,7 @@ W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0
X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS]
"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS].astype("int8")
assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all()
ENCODING_DATA_FRAMES = {
......@@ -153,7 +153,7 @@ def fixed_vectors_encoding(index_encoded_sequences, letter_to_vector_df):
target_shape = (
num_sequences, sequence_length, letter_to_vector_df.shape[0])
result = letter_to_vector_df.iloc[
index_encoded_sequences.flatten()
index_encoded_sequences.reshape((-1,)) # reshape() avoids copy
].values.reshape(target_shape)
return result
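# Worked example of the two-step encoding (a sketch; index_encoding and
# AMINO_ACID_INDEX are defined elsewhere in this module, and the peptides are
# arbitrary):
#
#   index_encoded = index_encoding(["SIINFEKL", "SIINFEKV"], AMINO_ACID_INDEX)
#   index_encoded.shape                           # -> (2, 8), integer-coded residues
#   vectors = fixed_vectors_encoding(
#       index_encoded, ENCODING_DATA_FRAMES["BLOSUM62"])
#   vectors.shape                                 # -> (2, 8, 21): one BLOSUM62 row
#                                                 #    per residue (20 aa + X)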
......@@ -7,23 +7,25 @@ import signal
import sys
import time
import traceback
import random
import collections
from functools import partial
import numpy
import pandas
import yaml
from sklearn.metrics.pairwise import cosine_similarity
from mhcnames import normalize_allele_name
import tqdm # progress bar
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
from .class1_affinity_predictor import Class1AffinityPredictor
from .common import configure_logging
from .parallelism import (
add_worker_pool_args,
from .encodable_sequences import EncodableSequences
from .common import configure_logging, random_peptides, amino_acid_distribution
from .local_parallelism import (
add_local_parallelism_args,
worker_pool_with_gpu_assignments_from_args,
call_wrapped)
call_wrapped_kwargs)
from .cluster_parallelism import (
add_cluster_parallelism_args,
cluster_results_from_args)
# To avoid pickling large matrices to send to child processes when running in
......@@ -44,8 +46,13 @@ parser.add_argument(
"--allele",
default=None,
nargs="+",
help="Alleles to train models for. If not specified, all alleles with "
"enough measurements will be used.")
help="Alleles to calibrate percentile ranks for. If not specified all "
"alleles are used")
parser.add_argument(
"--match-amino-acid-distribution-data",
help="Sample random peptides from the amino acid distribution of the "
"peptides listed in the supplied CSV file, which must have a 'peptide' "
"column. If not specified a uniform distribution is used.")
parser.add_argument(
"--num-peptides-per-length",
type=int,
......@@ -53,13 +60,40 @@ parser.add_argument(
default=int(1e5),
help="Number of peptides per length to use to calibrate percent ranks. "
"Default: %(default)s.")
parser.add_argument(
"--motif-summary",
default=False,
action="store_true",
help="Calculate motifs and length preferences for each allele")
parser.add_argument(
"--summary-top-peptide-fraction",
default=[0.0001, 0.001, 0.01, 0.1, 1.0],
nargs="+",
type=float,
metavar="X",
help="The top X fraction of predictions (i.e. tightest binders) to use to "
"generate motifs and length preferences. Default: %(default)s")
parser.add_argument(
"--length-range",
default=(8, 15),
type=int,
nargs=2,
help="Min and max peptide length to calibrate, inclusive. "
"Default: %(default)s")
parser.add_argument(
"--prediction-batch-size",
type=int,
default=4096,
help="Keras batch size for predictions")
parser.add_argument(
"--verbosity",
type=int,
help="Keras verbosity. Default: %(default)s",
default=0)
add_worker_pool_args(parser)
add_local_parallelism_args(parser)
add_cluster_parallelism_args(parser)
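# Example invocation (a sketch only; the console-script name, model directory and
# CSV path are assumptions, and --models-dir / --num-jobs come from argument groups
# defined outside this excerpt):
#
#   mhcflurry-calibrate-percentile-ranks \
#       --models-dir /path/to/models \
#       --match-amino-acid-distribution-data proteome_peptides.csv \
#       --motif-summary \
#       --num-peptides-per-length 100000 \
#       --num-jobs 8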
def run(argv=sys.argv[1:]):
global GLOBAL_DATA
......@@ -81,49 +115,94 @@ def run(argv=sys.argv[1:]):
else:
alleles = predictor.supported_alleles
distribution = None
if args.match_amino_acid_distribution_data:
distribution_peptides = pandas.read_csv(
args.match_amino_acid_distribution_data).peptide.unique()
distribution = amino_acid_distribution(distribution_peptides)
print("Using amino acid distribution:")
print(distribution)
start = time.time()
print("Percent rank calibration for %d alleles. Generating peptides." % (
len(alleles)))
peptides = []
lengths = range(args.length_range[0], args.length_range[1] + 1)
for length in lengths:
peptides.extend(
random_peptides(
args.num_peptides_per_length, length, distribution=distribution))
print("Done generating peptides in %0.2f sec." % (time.time() - start))
print("Encoding %d peptides." % len(peptides))
start = time.time()
print("Performing percent rank calibration. Encoding peptides.")
encoded_peptides = predictor.calibrate_percentile_ranks(
alleles=[], # don't actually do any calibration, just return peptides
num_peptides_per_length=args.num_peptides_per_length)
encoded_peptides = EncodableSequences.create(peptides)
del peptides
# Now we encode the peptides for each neural network, so the encoding
# becomes cached.
for network in predictor.neural_networks:
network.peptides_to_network_input(encoded_peptides)
assert encoded_peptides.encoding_cache # must have cached the encoding
print("Finished encoding peptides for percent ranks in %0.2f sec." % (
time.time() - start))
print("Calibrating percent rank calibration for %d alleles." % len(alleles))
print("Finished encoding peptides in %0.2f sec." % (time.time() - start))
# Store peptides in global variable so they are in shared memory
# after fork, instead of needing to be pickled (when doing a parallel run).
GLOBAL_DATA["calibration_peptides"] = encoded_peptides
GLOBAL_DATA["predictor"] = predictor
GLOBAL_DATA["args"] = {
'motif_summary': args.motif_summary,
'summary_top_peptide_fractions': args.summary_top_peptide_fraction,
'verbose': args.verbosity > 0,
'model_kwargs': {
'batch_size': args.prediction_batch_size,
}
}
del encoded_peptides
worker_pool = worker_pool_with_gpu_assignments_from_args(args)
if worker_pool is None:
serial_run = not args.cluster_parallelism and args.num_jobs == 0
worker_pool = None
start = time.time()
work_items = [{"allele": allele} for allele in alleles]
if serial_run:
# Serial run
print("Running in serial.")
results = (
calibrate_percentile_ranks(
allele=allele,
predictor=predictor,
peptides=encoded_peptides)
for allele in alleles)
do_calibrate_percentile_ranks(**item) for item in work_items)
elif args.cluster_parallelism:
# Run using separate processes HPC cluster.
print("Running on cluster.")
results = cluster_results_from_args(
args,
work_function=do_calibrate_percentile_ranks,
work_items=work_items,
constant_data=GLOBAL_DATA,
result_serialization_method="pickle",
clear_constant_data=True)
else:
# Parallel run
worker_pool = worker_pool_with_gpu_assignments_from_args(args)
print("Worker pool", worker_pool)
assert worker_pool is not None
results = worker_pool.imap_unordered(
partial(
partial(call_wrapped, calibrate_percentile_ranks),
predictor=predictor),
alleles,
partial(call_wrapped_kwargs, do_calibrate_percentile_ranks),
work_items,
chunksize=1)
for result in tqdm.tqdm(results, total=len(alleles)):
predictor.allele_to_percent_rank_transform.update(result)
print("Done calibrating %d additional alleles." % len(alleles))
summary_results_lists = collections.defaultdict(list)
for (transforms, summary_results) in tqdm.tqdm(results, total=len(work_items)):
predictor.allele_to_percent_rank_transform.update(transforms)
if summary_results is not None:
for (item, value) in summary_results.items():
summary_results_lists[item].append(value)
print("Done calibrating %d alleles." % len(work_items))
if summary_results_lists:
for (name, lst) in summary_results_lists.items():
df = pandas.concat(lst, ignore_index=True)
predictor.metadata_dataframes[name] = df
print("Including summary result: %s" % name)
print(df)
predictor.save(args.models_dir, model_names_to_write=[])
percent_rank_calibration_time = time.time() - start
......@@ -133,23 +212,42 @@ def run(argv=sys.argv[1:]):
worker_pool.join()
print("Percent rank calibration time: %0.2f min." % (
percent_rank_calibration_time / 60.0))
percent_rank_calibration_time / 60.0))
print("Predictor written to: %s" % args.models_dir)
def calibrate_percentile_ranks(allele, predictor, peptides=None):
"""
Private helper function.
"""
global GLOBAL_DATA
if peptides is None:
peptides = GLOBAL_DATA["calibration_peptides"]
predictor.calibrate_percentile_ranks(
def do_calibrate_percentile_ranks(allele, constant_data=GLOBAL_DATA):
return calibrate_percentile_ranks(
allele,
constant_data['predictor'],
peptides=constant_data['calibration_peptides'],
**constant_data["args"])
def calibrate_percentile_ranks(
allele,
predictor,
peptides=None,
motif_summary=False,
summary_top_peptide_fractions=[0.001],
verbose=False,
model_kwargs={}):
if verbose:
print("Calibrating", allele)
start = time.time()
summary_results = predictor.calibrate_percentile_ranks(
peptides=peptides,
alleles=[allele])
return {
alleles=[allele],
motif_summary=motif_summary,
summary_top_peptide_fractions=summary_top_peptide_fractions,
verbose=verbose,
model_kwargs=model_kwargs)
if verbose:
print("Done calibrating", allele, "in", time.time() - start, "sec")
transforms = {
allele: predictor.allele_to_percent_rank_transform[allele],
}
return (transforms, summary_results)
if __name__ == '__main__':
......
This diff is collapsed.
This diff is collapsed.
"""
Simple, relatively naive parallel map implementation for HPC clusters.
Used for training MHCflurry models.
"""
import traceback
import sys
import os
import time
import signal
import argparse
import pickle
import subprocess
import shutil
from .local_parallelism import call_wrapped_kwargs
from .class1_affinity_predictor import Class1AffinityPredictor
try:
from shlex import quote
except ImportError:
from pipes import quote
def add_cluster_parallelism_args(parser):
"""
Add commandline arguments controlling cluster parallelism to an argparse
ArgumentParser.
Parameters
----------
parser : argparse.ArgumentParser
"""
group = parser.add_argument_group("Cluster parallelism")
group.add_argument(
"--cluster-parallelism",
default=False,
action="store_true")
group.add_argument(
"--cluster-submit-command",
default='sh',
help="Default: %(default)s")
group.add_argument(
"--cluster-results-workdir",
default='./cluster-workdir',
help="Default: %(default)s")
group.add_argument(
'--cluster-script-prefix-path',
help="",
)
group.add_argument(
'--cluster-max-retries',
type=int,
default=3,
help="How many times to re-launch a failed worker. Default: %(default)s")
def cluster_results_from_args(
args,
work_function,
work_items,
constant_data=None,
result_serialization_method="pickle",
clear_constant_data=False):
"""
Parallel map configurable using commandline arguments. See the
cluster_results() function for docs.
The `args` parameter should be an argparse.Namespace from an argparse parser
generated using the add_cluster_parallelism_args() function.
Parameters
----------
args
work_function
work_items
constant_data
result_serialization_method
clear_constant_data
Returns
-------
generator
"""
return cluster_results(
work_function=work_function,
work_items=work_items,
constant_data=constant_data,
submit_command=args.cluster_submit_command,
results_workdir=args.cluster_results_workdir,
script_prefix_path=args.cluster_script_prefix_path,
result_serialization_method=result_serialization_method,
clear_constant_data=clear_constant_data
)
def cluster_results(
work_function,
work_items,
constant_data=None,
submit_command="sh",
results_workdir="./cluster-workdir",
script_prefix_path=None,
result_serialization_method="pickle",
max_retries=3,
clear_constant_data=False):
"""
Parallel map on an HPC cluster.
Returns [work_function(item) for item in work_items] where each invocation
of work_function is performed as a separate HPC cluster job. Order is
preserved.
Optionally, "constant data" can be specified, which will be passed to
each work_function() invocation as a keyword argument called constant_data.
This data is serialized once and all workers read it from the same source,
which is more efficient than serializing it separately for each worker.
Each worker's input is serialized to a shared NFS directory and the
submit_command is used to launch a job to process that input. The shared
filesystem is polled occasionally to watch for results, which are fed back
to the user.
Parameters
----------
work_function : A -> B
work_items : list of A
constant_data : object
submit_command : string
For running on LSF, we use "bsub" here.
results_workdir : string
Path to NFS shared directory where inputs and results can be written
script_prefix_path : string
Path to script that will be invoked to run each worker. A line calling
the _mhcflurry-cluster-worker-entry-point command will be appended to
the contents of this file.
result_serialization_method : string, one of "pickle" or "save_predictor"
The "save_predictor" works only when the return type of work_function
is Class1AffinityPredictor
max_retries : int
How many times to attempt to re-launch a failed worker
clear_constant_data : bool
If True, the constant data dict is cleared on the launching host after
it is serialized to disk.
Returns
-------
generator of B
"""
constant_payload = {
'constant_data': constant_data,
'function': work_function,
}
work_dir = os.path.join(
os.path.abspath(results_workdir),
str(int(time.time())))
os.mkdir(work_dir)
constant_payload_path = os.path.join(work_dir, "global_data.pkl")
with open(constant_payload_path, "wb") as fd:
pickle.dump(constant_payload, fd, protocol=pickle.HIGHEST_PROTOCOL)
print("Wrote:", constant_payload_path)
if clear_constant_data:
constant_data.clear()
print("Cleared constant data to free up memory.")
if script_prefix_path:
with open(script_prefix_path) as fd:
script_prefix = fd.read()
else:
script_prefix = "#!/bin/bash"
result_items = []
for (i, item) in enumerate(work_items):
item_workdir = os.path.join(
work_dir, "work-item.%03d-of-%03d" % (i, len(work_items)))
os.mkdir(item_workdir)
item_data_path = os.path.join(item_workdir, "data.pkl")
with open(item_data_path, "wb") as fd:
pickle.dump(item, fd, protocol=pickle.HIGHEST_PROTOCOL)
print("Wrote:", item_data_path)
item_result_path = os.path.join(item_workdir, "result")
item_error_path = os.path.join(item_workdir, "error.pkl")
item_finished_path = os.path.join(item_workdir, "COMPLETE")
item_script_pieces = [
script_prefix.format(work_item_num=i, work_dir=item_workdir)
]
item_script_pieces.append(" ".join([
"_mhcflurry-cluster-worker-entry-point",
"--constant-data", quote(constant_payload_path),
"--worker-data", quote(item_data_path),
"--result-out", quote(item_result_path),
"--error-out", quote(item_error_path),
"--complete-dir", quote(item_finished_path),
"--result-serialization-method", result_serialization_method,
]))
item_script = "\n".join(item_script_pieces)
item_script_path = os.path.join(
item_workdir,
"run.%d.sh" % i)
with open(item_script_path, "w") as fd:
fd.write(item_script)
print("Wrote:", item_script_path)
launch_command = " ".join([
submit_command, "<", quote(item_script_path)
])
subprocess.check_call(launch_command, shell=True)
print("Invoked", launch_command)
result_items.append({
'work_dir': item_workdir,
'finished_path': item_finished_path,
'result_path': item_result_path,
'error_path': item_error_path,
'retry_num': 0,
'launch_command': launch_command,
})
def result_generator():
start = time.time()
while result_items:
print("[%0.1f sec elapsed] waiting on %d / %d items." % (
time.time() - start, len(result_items), len(work_items)))
while True:
result_item = None
for d in result_items:
if os.path.exists(d['finished_path']):
result_item = d
break
if result_item is None:
time.sleep(60)
else:
result_items.remove(result_item)
break
complete_dir = result_item['finished_path']
result_path = result_item['result_path']
error_path = result_item['error_path']
retry_num = result_item['retry_num']
launch_command = result_item['launch_command']
print("[%0.1f sec elapsed] processing item %s" % (
time.time() - start, result_item))
if os.path.exists(error_path):
print("Error path exists", error_path)
with open(error_path, "rb") as fd:
exception = pickle.load(fd)
print(exception)
if retry_num < max_retries:
print("Relaunching", launch_command)
attempt_dir = os.path.join(
result_item['work_dir'], "attempt.%d" % retry_num)
shutil.move(complete_dir, attempt_dir)
shutil.move(error_path, attempt_dir)
subprocess.check_call(launch_command, shell=True)
print("Invoked", launch_command)
result_item['retry_num'] += 1
result_items.append(result_item)
continue
else:
print("Max retries exceeded", max_retries)
raise exception
if os.path.exists(result_path):
print("Result path exists", result_path)
if result_serialization_method == "save_predictor":
result = Class1AffinityPredictor.load(result_path)
else:
assert result_serialization_method == "pickle"
with open(result_path, "rb") as fd:
result = pickle.load(fd)
yield result
else:
raise RuntimeError("Results do not exist", result_path)
return result_generator()
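# Usage sketch (hypothetical work function and site-specific settings; assumes the
# results workdir exists on storage shared with the cluster nodes and that the
# work function is importable by the worker processes):
#
#   def square(x, constant_data=None):
#       return x * x
#
#   results = cluster_results(
#       work_function=square,
#       work_items=[{"x": 1}, {"x": 2}, {"x": 3}],  # each item is passed as kwargs
#       submit_command="bsub",                      # e.g. LSF; "sh" runs jobs locally
#       script_prefix_path=None,                    # or a file of e.g. #BSUB directives
#       results_workdir="/nfs/scratch/cluster-workdir")
#   for value in results:   # results are yielded as the jobs complete
#       print(value)        # 1, 4 and 9, one per work item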
parser = argparse.ArgumentParser(
usage="Entry point for cluster workers")
parser.add_argument(
"--constant-data",
required=True,
)
parser.add_argument(
"--worker-data",
required=True,
)
parser.add_argument(
"--result-out",
required=True,
)
parser.add_argument(
"--error-out",
required=True,
)
parser.add_argument(
"--complete-dir",
)
parser.add_argument(
"--result-serialization-method",
choices=("pickle", "save_predictor"),
default="pickle")
def worker_entry_point(argv=sys.argv[1:]):
"""
Entry point for the worker command.
Parameters
----------
argv : list of string
"""
# On sigusr1 print stack trace
print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid())
signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack())
args = parser.parse_args(argv)
with open(args.constant_data, "rb") as fd:
constant_payload = pickle.load(fd)
with open(args.worker_data, "rb") as fd:
worker_data = pickle.load(fd)
kwargs = dict(worker_data)
if constant_payload['constant_data'] is not None:
kwargs['constant_data'] = constant_payload['constant_data']
try:
result = call_wrapped_kwargs(constant_payload['function'], kwargs)
if args.result_serialization_method == 'save_predictor':
result.save(args.result_out)
else:
with open(args.result_out, "wb") as fd:
pickle.dump(result, fd, pickle.HIGHEST_PROTOCOL)
print("Wrote:", args.result_out)
except Exception as e:
print("Exception: ", e)
with open(args.error_out, "wb") as fd:
pickle.dump(e, fd, pickle.HIGHEST_PROTOCOL)
print("Wrote:", args.error_out)
raise
finally:
if args.complete_dir:
os.mkdir(args.complete_dir)
print("Created: ", args.complete_dir)
......@@ -147,3 +147,30 @@ def random_peptides(num, length=9, distribution=None):
p=distribution.values,
size=(int(num), int(length)))
]
def positional_frequency_matrix(peptides):
"""
Given a set of peptides, calculate a length x amino acids frequency matrix.
Parameters
----------
peptides : list of string
All of same length
Returns
-------
pandas.DataFrame
Index is position, columns are amino acids
"""
length = len(peptides[0])
assert all(len(peptide) == length for peptide in peptides)
counts = pandas.DataFrame(
index=[a for a in amino_acid.BLOSUM62_MATRIX.index if a != 'X'],
columns=numpy.arange(1, length + 1),
)
for i in range(length):
counts[i + 1] = pandas.Series([p[i] for p in peptides]).value_counts()
result = (counts / len(peptides)).fillna(0.0).T
result.index.name = 'position'
return result
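# Usage sketch (made-up peptides; all must have the same length):
#
#   peptides = ["SIINFEKL", "SIINFEKV", "AIINFEKL"]
#   pwm = positional_frequency_matrix(peptides)
#   pwm.loc[1, "S"]   # -> 0.667: fraction of peptides with S at position 1
#   pwm.loc[8, "L"]   # -> 0.667: fraction of peptides with L at the last position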
......@@ -5,16 +5,78 @@ For losses supporting inequalities, each training data point is associated with
one of (=), (<), or (>). For e.g. (>) inequalities, penalization is applied only
if the prediction is less than the given value.
"""
from __future__ import division
import pandas
import numpy
from numpy import isnan, array
CUSTOM_LOSSES = {}
class MSEWithInequalities(object):
def get_loss(name):
"""
Get a custom_loss.Loss instance by name.
Parameters
----------
name : string
Returns
-------
custom_loss.Loss
"""
if name.startswith("custom:"):
try:
custom_loss = CUSTOM_LOSSES[name.replace("custom:", "")]
except KeyError:
raise ValueError(
"No such custom loss: %s. Supported losses are: %s" % (
name,
", ".join([
"custom:" + loss_name for loss_name in CUSTOM_LOSSES
])))
return custom_loss
return StandardKerasLoss(name)
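# Example (names follow the CUSTOM_LOSSES registry populated at the bottom of this
# module):
#
#   get_loss("mse").loss                        # -> "mse", passed to Keras unchanged
#   loss = get_loss("custom:mse_with_inequalities")
#   loss.supports_inequalities                  # -> True
#   get_loss("custom:no_such_loss")             # raises ValueError listing valid names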
class Loss(object):
"""
Thin wrapper to keep track of neural network loss functions, which could
be custom or baked into Keras.
Each subclass or instance should define these properties/methods:
- name : string
- loss : string or function
This is what gets passed to keras.fit()
- encode_y : numpy.ndarray -> numpy.ndarray
Transformation to apply to regression target before fitting
"""
def __init__(self, name=None):
self.name = name if name else self.name # use name from class instance
def __str__(self):
return "<Loss: %s>" % self.name
class StandardKerasLoss(Loss):
"""
A loss function supported by Keras, such as MSE.
"""
Supports training a regressor on data that includes inequalities
supports_inequalities = False
supports_multiple_outputs = False
def __init__(self, loss_name="mse"):
self.loss = loss_name
Loss.__init__(self, loss_name)
@staticmethod
def encode_y(y):
return y
class MSEWithInequalities(Loss):
"""
Supports training a regression model on data that includes inequalities
(e.g. x < 100). Mean square error is used as the loss for elements with
an (=) inequality. For elements with e.g. a (> 0.5) inequality, the loss
for that element is (y_pred - 0.5)^2 (standard MSE) if y_pred < 0.5 and 0 otherwise.
......@@ -26,29 +88,30 @@ class MSEWithInequalities(object):
y_true is interpreted as follows:
between 0 - 1
Regular MSE loss is used. Penality (y_pred - y_true)**2 is applied if
Regular MSE loss is used. Penalty (y_pred - y_true)**2 is applied if
y_pred is greater or less than y_true.
between 2 - 3:
Treated as a "<" inequality. Penality (y_pred - (y_true - 2))**2 is
Treated as a "<" inequality. Penalty (y_pred - (y_true - 2))**2 is
applied only if y_pred is greater than y_true - 2.
between 4 - 5:
Treated as a ">" inequality. Penality (y_pred - (y_true - 4))**2 is
Treated as a ">" inequality. Penalty (y_pred - (y_true - 4))**2 is
applied only if y_pred is less than y_true - 4.
"""
name = "mse_with_inequalities"
supports_inequalities = True
supports_multiple_outputs = False
@staticmethod
def encode_y(y, inequalities=None):
y = array(y, dtype="float32")
if isnan(y).any():
raise ValueError("y contains NaN")
raise ValueError("y contains NaN", y)
if (y > 1.0).any():
raise ValueError("y contains values > 1.0")
raise ValueError("y contains values > 1.0", y)
if (y < 0.0).any():
raise ValueError("y contains values < 0.0")
raise ValueError("y contains values < 0.0", y)
if inequalities is None:
encoded = y
......@@ -66,8 +129,8 @@ class MSEWithInequalities(object):
@staticmethod
def loss(y_true, y_pred):
# We always delay import of Keras so that mhcflurry can be imported initially
# without tensorflow debug output, etc.
# We always delay import of Keras so that mhcflurry can be imported
# initially without tensorflow debug output, etc.
from keras import backend as K
# Handle (=) inequalities
......@@ -86,11 +149,106 @@ class MSEWithInequalities(object):
diff3 *= K.cast(y_true >= 4.0, "float32")
diff3 *= K.cast(diff3 > 0.0, "float32")
return (
K.sum(K.square(diff1), axis=-1) +
K.sum(K.square(diff2), axis=-1) +
K.sum(K.square(diff3), axis=-1))
result = (
K.sum(K.square(diff1)) +
K.sum(K.square(diff2)) +
K.sum(K.square(diff3))) / K.cast(K.shape(y_pred)[0], "float32")
return result
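# Worked example of the target encoding described above (illustrative values;
# affinities are assumed to have been mapped into [0, 1] elsewhere):
#
#   y = [0.9, 0.1, 0.5]
#   inequalities = ["=", "<", ">"]
#   MSEWithInequalities.encode_y(y, inequalities)
#   # -> approximately [0.9, 2.1, 4.5]: "=" targets stay in [0, 1], "<" targets are
#   #    shifted into [2, 3], and ">" targets into [4, 5], so loss() can recover
#   #    each element's inequality from y_true alone.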
class MSEWithInequalitiesAndMultipleOutputs(Loss):
"""
Loss supporting inequalities and multiple outputs.
This loss assumes that the normal range for y_true and y_pred is 0 - 1. As a
hack, the implementation uses other intervals for y_true to encode the
inequality and output-index information.
Inequalities are encoded into the regression target as in
the MSEWithInequalities loss.
Multiple outputs are encoded by mapping each regression target x (after
transforming for inequalities) using the rule x -> x + i * 10 where i is
the output index.
The reason for explicitly encoding multiple outputs this way (rather than
just making the regression target a matrix instead of a vector) is that
in our use cases we frequently have missing data in the regression target.
This encoding gives a simple way to penalize only on (data point, output
index) pairs that have labels.
"""
name = "mse_with_inequalities_and_multiple_outputs"
supports_inequalities = True
supports_multiple_outputs = True
@staticmethod
def encode_y(y, inequalities=None, output_indices=None):
y = array(y, dtype="float32")
if isnan(y).any():
raise ValueError("y contains NaN", y)
if (y > 1.0).any():
raise ValueError("y contains values > 1.0", y)
if (y < 0.0).any():
raise ValueError("y contains values < 0.0", y)
encoded = MSEWithInequalities.encode_y(
y, inequalities=inequalities)
if output_indices is not None:
output_indices = numpy.array(output_indices)
check_shape("output_indices", output_indices, (len(encoded),))
if (output_indices < 0).any():
raise ValueError("Invalid output indices: ", output_indices)
encoded += output_indices * 10
return encoded
@staticmethod
def loss(y_true, y_pred):
from keras import backend as K
y_true = K.flatten(y_true)
output_indices = y_true // 10
updated_y_true = y_true - (10 * output_indices)
# We index into y_pred using flattened indices since Keras backend
# supports gather but has no equivalent of tf.gather_nd:
ordinals = K.arange(K.shape(y_true)[0])
flattened_indices = (
ordinals * y_pred.shape[1] + K.cast(output_indices, "int32"))
updated_y_pred = K.gather(K.flatten(y_pred), flattened_indices)
# Alternative implementation using tensorflow, which could be used if
# we drop support for other backends:
# import tensorflow as tf
# indexer = K.stack([
# ordinals,
# K.cast(output_indices, "int32")
# ], axis=-1)
#updated_y_pred = tf.gather_nd(y_pred, indexer)
return MSEWithInequalities.loss(updated_y_true, updated_y_pred)
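# Worked example of the combined encoding (illustrative values):
#
#   y = [0.9, 0.1]
#   inequalities = ["=", ">"]
#   output_indices = [0, 2]
#   MSEWithInequalitiesAndMultipleOutputs.encode_y(y, inequalities, output_indices)
#   # -> approximately [0.9, 24.1]: 0.1 is first shifted to 4.1 for the ">"
#   #    inequality, then 10 * (output index 2) is added, so loss() compares it
#   #    against column 2 of y_pred.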
def check_shape(name, arr, expected_shape):
"""
Raise ValueError if arr.shape != expected_shape.
Parameters
----------
name : string
Included in error message to aid debugging
arr : numpy.ndarray
expected_shape : tuple of int
"""
if arr.shape != expected_shape:
raise ValueError("Expected %s to have shape %s not %s" % (
name, str(expected_shape), str(arr.shape)))
# Register custom losses.
for cls in [MSEWithInequalities]:
CUSTOM_LOSSES[cls.name] = cls()
\ No newline at end of file
for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
CUSTOM_LOSSES[cls.name] = cls()
"""
Layer-sequential unit-variance initialization for neural networks.
See:
Mishkin and Matas, "All you need is a good init". 2016.
https://arxiv.org/abs/1511.06422
"""
#
# LSUV initialization code in this file is adapted from:
# https://github.com/ducha-aiki/LSUV-keras/blob/master/lsuv_init.py
# by Dmytro Mishkin
#
# Here is the license for the original code:
#
#
# Copyright (C) 2017, Dmytro Mishkin
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the
# distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from __future__ import print_function
import numpy
def svd_orthonormal(shape):
# Orthonormal init code is from Lasagne
# https://github.com/Lasagne/Lasagne/blob/master/lasagne/init.py
if len(shape) < 2:
raise RuntimeError("Only shapes of length 2 or more are supported.")
flat_shape = (shape[0], numpy.prod(shape[1:]))
a = numpy.random.standard_normal(flat_shape).astype("float32")
u, _, v = numpy.linalg.svd(a, full_matrices=False)
q = u if u.shape == flat_shape else v
q = q.reshape(shape)
return q
def get_activations(model, layer, X_batch):
from keras.models import Model
intermediate_layer_model = Model(
inputs=model.get_input_at(0),
outputs=layer.get_output_at(0)
)
activations = intermediate_layer_model.predict(X_batch)
return activations
def lsuv_init(model, batch, verbose=True, margin=0.1, max_iter=100):
"""
Initialize neural network weights using layer-sequential unit-variance
initialization.
See:
Mishkin and Matas, "All you need is a good init". 2016.
https://arxiv.org/abs/1511.06422
Parameters
----------
model : keras.Model
batch : dict
Training data, as would be passed to keras.Model.fit()
verbose : boolean
Whether to print progress to stdout
margin : float
max_iter : int
Returns
-------
keras.Model
Same as what was passed in.
"""
from keras.layers import Dense, Convolution2D
needed_variance = 1.0
layers_initialized = 0
for layer in model.layers:
if not isinstance(layer, (Dense, Convolution2D)):
continue
# Skip small layers, where the activation variance is close to zero, especially
# for small batches.
if numpy.prod(layer.get_output_shape_at(0)[1:]) < 32:
if verbose:
print('LSUV initialization skipping', layer.name)
continue
layers_initialized += 1
weights_and_biases = layer.get_weights()
weights_and_biases[0] = svd_orthonormal(weights_and_biases[0].shape)
layer.set_weights(weights_and_biases)
activations = get_activations(model, layer, batch)
variance = numpy.var(activations)
iteration = 0
if verbose:
print(layer.name, variance)
while abs(needed_variance - variance) > margin:
if verbose:
print(
'LSUV initialization',
layer.name,
iteration,
needed_variance,
margin,
variance)
if numpy.abs(numpy.sqrt(variance)) < 1e-7:
break # avoid zero division
weights_and_biases = layer.get_weights()
weights_and_biases[0] /= numpy.sqrt(variance) / numpy.sqrt(
needed_variance)
layer.set_weights(weights_and_biases)
activations = get_activations(model, layer, batch)
variance = numpy.var(activations)
iteration += 1
if iteration >= max_iter:
break
if verbose:
print('Done with LSUV: total layers initialized', layers_initialized)
return model
\ No newline at end of file
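# Usage sketch (standalone illustration with a toy Keras model, not one of the
# MHCflurry architectures; assumes Keras with the TensorFlow backend is installed):
#
#   import numpy
#   from keras.models import Sequential
#   from keras.layers import Dense
#
#   model = Sequential([
#       Dense(64, activation="tanh", input_shape=(128,)),
#       Dense(1, activation="sigmoid"),   # fewer than 32 units, so it is skipped
#   ])
#   model.compile(loss="mse", optimizer="rmsprop")
#   batch = numpy.random.rand(256, 128).astype("float32")
#   lsuv_init(model, batch, verbose=False)
#   # Afterwards the 64-unit layer's activations on `batch` have variance within
#   # `margin` of 1.0 (unless max_iter was reached).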
......@@ -9,9 +9,9 @@ from __future__ import (
)
import logging
import yaml
from os.path import join, exists, relpath
from pipes import quote
from os.path import join, exists
from os import environ
from pipes import quote
from collections import OrderedDict
from appdirs import user_data_dir
from pkg_resources import resource_string
......@@ -81,8 +81,7 @@ def get_default_class1_models_dir(test_exists=True):
if test_exists and not exists(result):
raise IOError("No such directory: %s" % result)
return result
else:
return get_path("models_class1", "models", test_exists=test_exists)
return get_path("models_class1", "models", test_exists=test_exists)
def get_current_release_downloads():
......@@ -160,13 +159,13 @@ def configure():
metadata["releases"][_CURRENT_RELEASE]["compatibility-version"])
current_compatability = metadata["current-compatibility-version"]
if current_release_compatability != current_compatability:
logging.warn(
logging.warning(
"The specified downloads are not compatible with this version "
"of the MHCflurry codebase. Downloads: release %s, "
"compatability version: %d. Code compatability version: %d" % (
_CURRENT_RELEASE,
current_release_compatability,
current_compatability))
"compatability version: %d. Code compatability version: %d",
_CURRENT_RELEASE,
current_release_compatability,
current_compatability)
data_dir = environ.get("MHCFLURRY_DATA_DIR")
if not data_dir:
......@@ -176,6 +175,7 @@ def configure():
data_dir = user_data_dir("mhcflurry", version="4")
_DOWNLOADS_DIR = join(data_dir, _CURRENT_RELEASE)
logging.debug("Configured MHCFLURRY_DOWNLOADS_DIR: %s" % _DOWNLOADS_DIR)
logging.debug("Configured MHCFLURRY_DOWNLOADS_DIR: %s", _DOWNLOADS_DIR)
configure()
......@@ -8,7 +8,7 @@
# by name, the downloads with "default=true" are downloaded.
# This should usually be the latest release.
current-release: 1.2.0
current-release: 1.3.0
# An integer indicating what models the current MHCflurry code base is compatible
# with. Increment this integer when changes are made to MHCflurry that would break
......@@ -17,6 +17,70 @@ current-compatibility-version: 2
# Add new releases here as they are made.
releases:
1.3.0:
compatibility-version: 2
downloads:
- name: models_class1_pan
url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/models_class1_pan.20190829.tar.bz2
default: false
- name: models_class1_pan_unselected
part_urls:
- https://github.com/openvax/mhcflurry/releases/download/pan-dev1/models_class1_pan_unselected.20190826.tar.bz2.part.aa
- https://github.com/openvax/mhcflurry/releases/download/pan-dev1/models_class1_pan_unselected.20190826.tar.bz2.part.ab
default: false
- name: data_iedb
url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_iedb.20190610.tar.bz2
default: false
- name: data_systemhcatlas
url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_systemhcatlas.20190506.tar.bz2
default: false
- name: allele_sequences
url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/allele_sequences.20190506.tar.bz2
default: false
- name: random_peptide_predictions
url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/random_peptide_predictions.20190506.tar.bz2
default: false
- name: data_published
url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_published.tar.bz2
default: false
- name: data_curated
url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_curated.20190516.tar.bz2
default: true
# Older downloads
- name: models_class1
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.2/models_class1.20180225.tar.bz2
default: true
- name: models_class1_selected_no_mass_spec
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.2/models_class1_selected_no_mass_spec.20180225.tar.bz2
default: false
- name: models_class1_unselected
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.2/models_class1_unselected.20180221.tar.bz2
default: false
- name: models_class1_trained_with_mass_spec
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.2.1/models_class1_trained_with_mass_spec.20180228.tar.bz2
default: false
- name: models_class1_unselected_with_mass_spec
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.2.1/models_class1_unselected_with_mass_spec.20180227.tar.bz2
default: false
- name: models_class1_minimal
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.2/models_class1_minimal.20180226.tar.bz2
default: false
1.2.0:
compatibility-version: 2
downloads:
......
......@@ -27,7 +27,8 @@ import os
from pipes import quote
import errno
import tarfile
from tempfile import mkstemp
from shutil import copyfileobj
from tempfile import NamedTemporaryFile
from tqdm import tqdm
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
......@@ -167,8 +168,7 @@ def fetch_subcommand(args):
"\nThe requested download '%s' has already been downloaded. "
"To re-download this data, first run: \n\t%s\nin a shell "
"and then re-run this command.\n" +
"*" * 40)
% (name, 'rm -rf ' + quote(get_path(name))))
"*" * 40) % (name, 'rm -rf ' + quote(get_path(name))))
if not info['downloaded'] and (name in args.download_name or default):
items_to_fetch.add(name)
......@@ -181,27 +181,46 @@ def fetch_subcommand(args):
"DOWNLOAD NAME", "ALREADY DOWNLOADED?", "WILL DOWNLOAD NOW?", "URL"))
for (item, info) in downloads.items():
urls = (
[info['metadata']["url"]]
if "url" in info['metadata']
else info['metadata']["part_urls"])
url_description = urls[0]
if len(urls) > 1:
url_description += " + %d more parts" % (len(urls) - 1)
qprint(format_string % (
item,
yes_no(info['downloaded']),
yes_no(item in items_to_fetch),
info['metadata']["url"]))
url_description))
# TODO: may want to extract into somewhere temporary and then rename to
# avoid making an incomplete extract if the process is killed.
for item in items_to_fetch:
metadata = downloads[item]['metadata']
(temp_fd, target_path) = mkstemp()
urls = (
[metadata["url"]] if "url" in metadata else metadata["part_urls"])
temp = NamedTemporaryFile(delete=False, suffix=".tar.bz2")
try:
qprint("Downloading: %s" % metadata['url'])
urlretrieve(
metadata['url'],
target_path,
reporthook=TqdmUpTo(
unit='B', unit_scale=True, miniters=1).update_to)
qprint("Downloaded to: %s" % quote(target_path))
tar = tarfile.open(target_path, 'r:bz2')
for (url_num, url) in enumerate(urls):
qprint("Downloading [part %d/%d]: %s" % (
url_num + 1, len(urls), url))
(downloaded_path, _) = urlretrieve(
url,
temp.name if len(urls) == 1 else None,
reporthook=TqdmUpTo(
unit='B', unit_scale=True, miniters=1).update_to)
qprint("Downloaded to: %s" % quote(downloaded_path))
if downloaded_path != temp.name:
qprint("Appending to: %s" % temp.name)
with open(downloaded_path, "rb") as fd:
copyfileobj(fd, temp, length=64*1024*1024)
os.remove(downloaded_path)
temp.close()
tar = tarfile.open(temp.name, 'r:bz2')
names = tar.getnames()
logging.debug("Extracting: %s" % names)
bad_names = [
......@@ -221,7 +240,7 @@ def fetch_subcommand(args):
len(names), quote(result_dir)))
finally:
if not args.keep:
os.remove(target_path)
os.remove(temp.name)
def info_subcommand(args):
......@@ -257,10 +276,18 @@ def info_subcommand(args):
print(format_string % ("DOWNLOAD NAME", "DOWNLOADED?", "URL"))
for (item, info) in downloads.items():
urls = (
[info['metadata']["url"]]
if "url" in info['metadata']
else info['metadata']["part_urls"])
url_description = urls[0]
if len(urls) > 1:
url_description += " + %d more parts" % (len(urls) - 1)
print(format_string % (
item,
yes_no(info['downloaded']),
info['metadata']["url"]))
url_description))
def path_subcommand(args):
......
This diff is collapsed.
......@@ -37,4 +37,4 @@ CENTRALITY_MEASURES = {
"mean": partial(numpy.nanmean, axis=1),
"median": partial(numpy.nanmedian, axis=1),
"robust_mean": robust_mean,
}
\ No newline at end of file
}
"""
Hyperparameter (neural network options) management
"""
from __future__ import (
print_function,
division,
......@@ -70,8 +73,7 @@ class HyperparameterDefaults(object):
if invalid_keys:
raise ValueError(
"No such model parameters: %s. Valid parameters are: %s"
% (" ".join(invalid_keys),
" ".join(self.defaults)))
% (" ".join(invalid_keys), " ".join(self.defaults)))
def models_grid(self, **kwargs):
'''
......