Commit c335946f authored by Patrick Skillman-Lawrence
Deleted mhcflurry/__init__.py, mhcflurry/allele_encoding.py, mhcflurry/amino_acid.py, mhcflurry/calibrate_percentile_ranks_command.py, mhcflurry/class1_affinity_predictor.py, mhcflurry/class1_neural_network.py, mhcflurry/class1_presentation_predictor.py, mhcflurry/class1_processing_neural_network.py, mhcflurry/class1_processing_predictor.py, mhcflurry/cluster_parallelism.py, mhcflurry/common.py, mhcflurry/custom_loss.py, mhcflurry/data_dependent_weights_initialization.py, mhcflurry/downloads.py, mhcflurry/downloads.yml, mhcflurry/downloads_command.py, mhcflurry/encodable_sequences.py, mhcflurry/ensemble_centrality.py, mhcflurry/fasta.py, mhcflurry/flanking_encoding.py, mhcflurry/hyperparameters.py, mhcflurry/local_parallelism.py, mhcflurry/percent_rank_transform.py, mhcflurry/predict_command.py, mhcflurry/predict_scan_command.py, mhcflurry/random_negative_peptides.py, mhcflurry/regression_target.py, mhcflurry/scoring.py, mhcflurry/select_allele_specific_models_command.py, mhcflurry/select_pan_allele_models_command.py, mhcflurry/select_processing_models_command.py, mhcflurry/testing_utils.py, mhcflurry/train_allele_specific_models_command.py, mhcflurry/train_pan_allele_models_command.py, mhcflurry/train_presentation_models_command.py, mhcflurry/train_processing_models_command.py, mhcflurry/version.py, train_APmodels.sh files
parent a921e8eb
Showing with 0 additions and 9292 deletions
"""
Class I MHC ligand prediction package
"""
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_neural_network import Class1NeuralNetwork
from .class1_processing_predictor import Class1ProcessingPredictor
from .class1_processing_neural_network import Class1ProcessingNeuralNetwork
from .class1_presentation_predictor import Class1PresentationPredictor
from .version import __version__
__all__ = [
"__version__",
"Class1AffinityPredictor",
"Class1NeuralNetwork",
"Class1ProcessingPredictor",
"Class1ProcessingNeuralNetwork",
"Class1PresentationPredictor",
]
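# Illustrative usage sketch (not part of the original file): the typical
# top-level workflow with these exports, assuming trained models have already
# been fetched (e.g. with the mhcflurry-downloads command).
#
#     from mhcflurry import Class1AffinityPredictor
#     predictor = Class1AffinityPredictor.load()
#     print(predictor.predict(peptides=["SIINFEKL"], allele="HLA-A*02:01"))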
mhcflurry/allele_encoding.py
import pandas
from . import amino_acid
class AlleleEncoding(object):
def __init__(self, alleles=None, allele_to_sequence=None, borrow_from=None):
"""
A place to cache encodings for a sequence of alleles.
We frequently work with alleles by integer indices, for example as
inputs to neural networks. This class is used to map allele names to
integer indices in a consistent way by keeping track of the universe
of alleles in use, i.e. a distinction is made between the universe
of supported alleles (what's in `allele_to_sequence`) and the actual
set of alleles used for some task (what's in `alleles`).
Parameters
----------
alleles : list of string
Allele names. If any allele is None instead of a string, it is mapped
to a reserved index (0), whose associated sequence is entirely padding
('X') characters.
allele_to_sequence : dict of str -> str
Allele name to amino acid sequence
borrow_from : AlleleEncoding, optional
If specified, do not specify allele_to_sequence. The sequences from
the provided instance are used. This guarantees that the mappings
from allele to index and from allele to sequence are the same
between the instances.
"""
if alleles is not None:
alleles = pandas.Series(alleles)
self.borrow_from = borrow_from
self.allele_to_sequence = allele_to_sequence
if self.borrow_from is None:
assert allele_to_sequence is not None
all_alleles = (
sorted(allele_to_sequence))
self.allele_to_index = dict(
(allele, i)
for (i, allele) in enumerate([None] + all_alleles))
unpadded = pandas.Series([
allele_to_sequence[a] if a is not None else ""
for a in [None] + all_alleles
],
index=[None] + all_alleles)
self.sequences = unpadded.str.pad(
unpadded.str.len().max(), fillchar="X")
else:
assert allele_to_sequence is None
self.allele_to_index = borrow_from.allele_to_index
self.sequences = borrow_from.sequences
self.allele_to_sequence = borrow_from.allele_to_sequence
if alleles is not None:
assert all(
allele in self.allele_to_index for allele in alleles),\
"Missing alleles: " + " ".join(set(
a for a in alleles if a not in self.allele_to_index))
self.indices = alleles.map(self.allele_to_index)
assert not self.indices.isnull().any()
self.alleles = alleles
else:
self.indices = None
self.alleles = None
self.encoding_cache = {}
def compact(self):
"""
Return a new AlleleEncoding in which the universe of supported alleles
is only the alleles actually used.
Returns
-------
AlleleEncoding
"""
return AlleleEncoding(
alleles=self.alleles,
allele_to_sequence=dict(
(allele, self.allele_to_sequence[allele])
for allele in self.alleles.unique()
if allele is not None))
def allele_representations(self, encoding_name):
"""
Encode the universe of supported allele sequences to a matrix.
Parameters
----------
encoding_name : string
How to represent amino acids. Valid names are "BLOSUM62" or
"one-hot". See `amino_acid.ENCODING_DATA_FRAMES`.
Returns
-------
numpy.array of shape
(num alleles in universe, sequence length, vector size)
where vector size is usually 21 (20 amino acids + X character)
"""
if self.borrow_from is not None:
return self.borrow_from.allele_representations(encoding_name)
cache_key = (
"allele_representations",
encoding_name)
if cache_key not in self.encoding_cache:
index_encoded_matrix = amino_acid.index_encoding(
self.sequences.values,
amino_acid.AMINO_ACID_INDEX)
vector_encoded = amino_acid.fixed_vectors_encoding(
index_encoded_matrix,
amino_acid.ENCODING_DATA_FRAMES[encoding_name])
self.encoding_cache[cache_key] = vector_encoded
return self.encoding_cache[cache_key]
def fixed_length_vector_encoded_sequences(self, encoding_name):
"""
Encode allele sequences (not the universe of alleles) to a matrix.
Parameters
----------
encoding_name : string
How to represent amino acids. Valid names are "BLOSUM62" or
"one-hot". See `amino_acid.ENCODING_DATA_FRAMES`.
Returns
-------
numpy.array with shape:
(num alleles, sequence length, vector size)
where vector size is usually 21 (20 amino acids + X character)
"""
cache_key = (
"fixed_length_vector_encoding",
encoding_name)
if cache_key not in self.encoding_cache:
vector_encoded = self.allele_representations(encoding_name)
result = vector_encoded[self.indices]
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
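# Illustrative sketch (not part of the original module): encode three requested
# alleles against a toy two-allele universe of pseudosequences.
def _demo_allele_encoding():
    toy_universe = {"HLA-A*01:01": "AAQT", "HLA-A*02:01": "AVQT"}
    encoding = AlleleEncoding(
        alleles=["HLA-A*02:01", "HLA-A*01:01", "HLA-A*02:01"],
        allele_to_sequence=toy_universe)
    print(encoding.indices.tolist())  # consistent integer index per allele
    print(encoding.fixed_length_vector_encoded_sequences("BLOSUM62").shape)
    # -> (3, 4, 21): 3 requested alleles, sequence length 4, 21-dim vectors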
"""
Functions for encoding fixed length sequences of amino acids into various
vector representations, such as one-hot and BLOSUM62.
"""
from __future__ import (
print_function,
division,
absolute_import,
)
import collections
from copy import copy
import pandas
from six import StringIO
COMMON_AMINO_ACIDS = collections.OrderedDict(sorted({
"A": "Alanine",
"R": "Arginine",
"N": "Asparagine",
"D": "Aspartic Acid",
"C": "Cysteine",
"E": "Glutamic Acid",
"Q": "Glutamine",
"G": "Glycine",
"H": "Histidine",
"I": "Isoleucine",
"L": "Leucine",
"K": "Lysine",
"M": "Methionine",
"F": "Phenylalanine",
"P": "Proline",
"S": "Serine",
"T": "Threonine",
"W": "Tryptophan",
"Y": "Tyrosine",
"V": "Valine",
}.items()))
COMMON_AMINO_ACIDS_WITH_UNKNOWN = copy(COMMON_AMINO_ACIDS)
COMMON_AMINO_ACIDS_WITH_UNKNOWN["X"] = "Unknown"
AMINO_ACID_INDEX = dict(
(letter, i) for (i, letter) in enumerate(COMMON_AMINO_ACIDS_WITH_UNKNOWN))
for (letter, i) in list(AMINO_ACID_INDEX.items()):
AMINO_ACID_INDEX[letter.lower()] = i # Support lower-case as well.
AMINO_ACIDS = list(COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys())
BLOSUM62_MATRIX = pandas.read_csv(StringIO("""
A R N D C Q E G H I L K M F P S T W Y V X
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0
X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS].astype("int8")
assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all()
ENCODING_DATA_FRAMES = {
"BLOSUM62": BLOSUM62_MATRIX,
"one-hot": pandas.DataFrame([
[1 if i == j else 0 for i in range(len(AMINO_ACIDS))]
for j in range(len(AMINO_ACIDS))
], index=AMINO_ACIDS, columns=AMINO_ACIDS)
}
def available_vector_encodings():
"""
Return list of supported amino acid vector encodings.
Returns
-------
list of string
"""
return list(ENCODING_DATA_FRAMES)
def vector_encoding_length(name):
"""
Return the length of the given vector encoding.
Parameters
----------
name : string
Returns
-------
int
"""
return ENCODING_DATA_FRAMES[name].shape[1]
def index_encoding(sequences, letter_to_index_dict):
"""
Encode a sequence of same-length strings to a matrix of integers of the
same shape. The map from characters to integers is given by
`letter_to_index_dict`.
Given a sequence of `n` strings all of length `k`, return an `n` x `k` array where
the (`i`, `j`)th element is `letter_to_index_dict[sequences[i][j]]`.
Parameters
----------
sequences : list of length n of strings of length k
letter_to_index_dict : dict : string -> int
Returns
-------
numpy.array of integers with shape (`n`, `k`)
"""
df = pandas.DataFrame(iter(s) for s in sequences)
result = df.replace(letter_to_index_dict)
return result.values
def fixed_vectors_encoding(index_encoded_sequences, letter_to_vector_df):
"""
Given an `n` x `k` matrix of integers such as that returned by `index_encoding()` and
a dataframe mapping each index to an arbitrary vector, return an `n` x `k` x `m`
array where the (`i`, `j`)th entry is the vector `letter_to_vector_df.iloc[index_encoded_sequences[i][j]]`.
The dataframe index and column names are ignored here; the indexing is done
entirely by integer position in the dataframe.
Parameters
----------
index_encoded_sequences : `n` x `k` array of integers
letter_to_vector_df : pandas.DataFrame of shape (`alphabet size`, `m`)
Returns
-------
numpy.array of integers with shape (`n`, `k`, `m`)
"""
(num_sequences, sequence_length) = index_encoded_sequences.shape
target_shape = (
num_sequences, sequence_length, letter_to_vector_df.shape[0])
result = letter_to_vector_df.iloc[
index_encoded_sequences.reshape((-1,)) # reshape() avoids copy
].values.reshape(target_shape)
return result
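# Illustrative sketch (not part of the original module): chain the two helpers
# above to turn peptides into BLOSUM62 feature arrays.
def _demo_peptide_encoding():
    peptides = ["SIINFEKLM", "KLGGALQAK"]
    indices = index_encoding(peptides, AMINO_ACID_INDEX)
    print(indices.shape)  # (2, 9): one row per peptide, one column per position
    vectors = fixed_vectors_encoding(indices, ENCODING_DATA_FRAMES["BLOSUM62"])
    print(vectors.shape)  # (2, 9, 21): a 21-dimensional vector per residue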
"""
Calibrate percentile ranks for models. Runs in-place.
"""
import argparse
import os
import signal
import sys
import time
import traceback
import collections
from functools import partial
import pandas
import numpy
from mhcnames import normalize_allele_name
import tqdm # progress bar
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_presentation_predictor import Class1PresentationPredictor
from .encodable_sequences import EncodableSequences
from .common import configure_logging, random_peptides, amino_acid_distribution
from .local_parallelism import (
add_local_parallelism_args,
worker_pool_with_gpu_assignments_from_args,
call_wrapped_kwargs)
from .cluster_parallelism import (
add_cluster_parallelism_args,
cluster_results_from_args)
# To avoid pickling large matrices to send to child processes when running in
# parallel, we use this global variable as a place to store data. Data stored
# here before the worker pool is created is inherited by the child processes
# on fork(), letting us share large data with the workers via copy-on-write
# shared memory.
GLOBAL_DATA = {}
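# Illustrative sketch (not part of the original module) of the copy-on-write
# pattern described above: values stored in GLOBAL_DATA before the worker pool
# is created are readable in forked child processes without being pickled.
# Assumes the "fork" start method is available (as on Linux).
def _shared_size(key):
    return len(GLOBAL_DATA[key])

def _demo_fork_sharing():
    import multiprocessing
    GLOBAL_DATA["calibration_peptides"] = ["SIINFEKL"] * 100000
    with multiprocessing.get_context("fork").Pool(2) as pool:
        print(pool.map(_shared_size, ["calibration_peptides"]))  # [100000]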
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"--predictor-kind",
choices=("class1_affinity", "class1_presentation"),
default="class1_affinity",
help="Type of predictor to calibrate")
parser.add_argument(
"--models-dir",
metavar="DIR",
required=True,
help="Directory to read and write models")
parser.add_argument(
"--allele",
default=None,
nargs="+",
help="Alleles to calibrate percentile ranks for. If not specified all "
"alleles are used")
parser.add_argument(
"--match-amino-acid-distribution-data",
help="Sample random peptides from the amino acid distribution of the "
"peptides listed in the supplied CSV file, which must have a 'peptide' "
"column. If not specified a uniform distribution is used.")
parser.add_argument(
"--alleles-file",
default=None,
help="Use alleles in supplied CSV file, which must have an 'allele' column.")
parser.add_argument(
"--num-peptides-per-length",
type=int,
metavar="N",
default=int(1e5),
help="Number of peptides per length to use to calibrate percent ranks. "
"Default: %(default)s.")
parser.add_argument(
"--num-genotypes",
type=int,
metavar="N",
default=25,
help="Used when calibrrating a presentation predictor. Number of genotypes"
"to sample")
parser.add_argument(
"--alleles-per-genotype",
type=int,
metavar="N",
default=6,
help="Used when calibrating a presentation predictor. Number of alleles "
"per genotype. Use 1 to calibrate for single alleles. Default: %(default)s")
parser.add_argument(
"--motif-summary",
default=False,
action="store_true",
help="Calculate motifs and length preferences for each allele")
parser.add_argument(
"--summary-top-peptide-fraction",
default=[0.0001, 0.001, 0.01, 0.1, 1.0],
nargs="+",
type=float,
metavar="X",
help="The top X fraction of predictions (i.e. tightest binders) to use to "
"generate motifs and length preferences. Default: %(default)s")
parser.add_argument(
"--length-range",
default=(8, 15),
type=int,
nargs=2,
help="Min and max peptide length to calibrate, inclusive. "
"Default: %(default)s")
parser.add_argument(
"--prediction-batch-size",
type=int,
default=4096,
help="Keras batch size for predictions")
parser.add_argument(
"--alleles-per-work-chunk",
type=int,
metavar="N",
default=1,
help="Number of alleles per work chunk. Default: %(default)s.")
parser.add_argument(
"--verbosity",
type=int,
help="Keras verbosity. Default: %(default)s",
default=0)
add_local_parallelism_args(parser)
add_cluster_parallelism_args(parser)
def run(argv=sys.argv[1:]):
global GLOBAL_DATA
# On sigusr1 print stack trace
print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid())
signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack())
args = parser.parse_args(argv)
args.models_dir = os.path.abspath(args.models_dir)
configure_logging(verbose=args.verbosity > 1)
aa_distribution = None
if args.match_amino_acid_distribution_data:
distribution_peptides = pandas.read_csv(
args.match_amino_acid_distribution_data).peptide.unique()
aa_distribution = amino_acid_distribution(distribution_peptides)
print("Using amino acid distribution:")
print(aa_distribution)
start = time.time()
peptides = []
lengths = range(args.length_range[0], args.length_range[1] + 1)
for length in lengths:
peptides.extend(
random_peptides(
args.num_peptides_per_length,
length,
distribution=aa_distribution))
print("Done generating peptides in %0.2f sec." % (time.time() - start))
if args.predictor_kind == "class1_affinity":
return run_class1_affinity_predictor(args, peptides)
elif args.predictor_kind == "class1_presentation":
return run_class1_presentation_predictor(args, peptides)
else:
raise ValueError("Unsupported kind %s" % args.predictor_kind)
def run_class1_presentation_predictor(args, peptides):
# This will trigger a Keras import, which breaks local parallelism.
predictor = Class1PresentationPredictor.load(args.models_dir)
if args.allele:
alleles = [normalize_allele_name(a) for a in args.allele]
elif args.alleles_file:
alleles = pandas.read_csv(args.alleles_file).allele.unique()
else:
alleles = predictor.supported_alleles
print("Num alleles", len(alleles))
genotypes = {}
if args.alleles_per_genotype == 6:
gene_to_alleles = collections.defaultdict(list)
for a in alleles:
for gene in ["A", "B", "C"]:
if a.startswith("HLA-%s" % gene):
gene_to_alleles[gene].append(a)
for _ in range(args.num_genotypes):
genotype = []
for gene in ["A", "A", "B", "B", "C", "C"]:
genotype.append(numpy.random.choice(gene_to_alleles[gene]))
genotypes[",".join(genotype)] = genotype
elif args.alleles_per_genotype == 1:
for _ in range(args.num_genotypes):
genotype = [numpy.random.choice(alleles)]
genotypes[",".join(genotype)] = genotype
else:
raise ValueError("Alleles per genotype must be 6 or 1")
print("Sampled genotypes: ", list(genotypes))
print("Num peptides: ", len(peptides))
start = time.time()
print("Generating predictions")
predictions_df = predictor.predict(
peptides=peptides,
alleles=genotypes)
print("Finished in %0.2f sec." % (time.time() - start))
print(predictions_df)
print("Calibrating ranks")
scores = predictions_df.presentation_score.values
predictor.calibrate_percentile_ranks(scores)
print("Done. Saving.")
predictor.save(
args.models_dir,
write_affinity_predictor=False,
write_processing_predictor=False,
write_weights=False,
write_percent_ranks=True,
write_info=False,
write_metdata=False)
print("Wrote predictor to: %s" % args.models_dir)
def run_class1_affinity_predictor(args, peptides):
global GLOBAL_DATA
# It's important that we don't trigger a Keras import here since that breaks
# local parallelism (tensorflow backend). So we set optimization_level=0.
predictor = Class1AffinityPredictor.load(
args.models_dir,
optimization_level=0,
)
if args.allele:
alleles = [normalize_allele_name(a) for a in args.allele]
elif args.alleles_file:
alleles = pandas.read_csv(args.alleles_file).allele.unique()
else:
alleles = predictor.supported_alleles
allele_set = set(alleles)
if predictor.allele_to_sequence:
# Collapse alleles that share the same sequence, keeping one representative per sequence.
new_allele_set = set()
sequence_to_allele = collections.defaultdict(set)
for allele in list(allele_set):
sequence_to_allele[predictor.allele_to_sequence[allele]].add(allele)
for equivalent_alleles in sequence_to_allele.values():
equivalent_alleles = sorted(equivalent_alleles)
keep = equivalent_alleles.pop(0)
new_allele_set.add(keep)
print(
"Sequence comparison reduced num alleles from",
len(allele_set),
"to",
len(new_allele_set))
allele_set = new_allele_set
alleles = sorted(allele_set)
print("Percent rank calibration for %d alleles. " % (len(alleles)))
print("Encoding %d peptides." % len(peptides))
start = time.time()
encoded_peptides = EncodableSequences.create(peptides)
del peptides
# Now we encode the peptides for each neural network, so the encoding
# becomes cached.
for network in predictor.neural_networks:
network.peptides_to_network_input(encoded_peptides)
assert encoded_peptides.encoding_cache # must have cached the encoding
print("Finished encoding peptides in %0.2f sec." % (time.time() - start))
# Store peptides in global variable so they are in shared memory
# after fork, instead of needing to be pickled (when doing a parallel run).
GLOBAL_DATA["calibration_peptides"] = encoded_peptides
GLOBAL_DATA["predictor"] = predictor
GLOBAL_DATA["args"] = {
'motif_summary': args.motif_summary,
'summary_top_peptide_fractions': args.summary_top_peptide_fraction,
'verbose': args.verbosity > 0,
'model_kwargs': {
'batch_size': args.prediction_batch_size,
}
}
del encoded_peptides
serial_run = not args.cluster_parallelism and args.num_jobs == 0
worker_pool = None
start = time.time()
work_items = []
for allele in alleles:
if not work_items or len(
work_items[-1]['alleles']) >= args.alleles_per_work_chunk:
work_items.append({"alleles": []})
work_items[-1]['alleles'].append(allele)
if serial_run:
# Serial run
print("Running in serial.")
results = (
do_class1_affinity_calibrate_percentile_ranks(**item) for item in work_items)
elif args.cluster_parallelism:
# Run using separate processes on an HPC cluster.
print("Running on cluster.")
results = cluster_results_from_args(
args,
work_function=do_class1_affinity_calibrate_percentile_ranks,
work_items=work_items,
constant_data=GLOBAL_DATA,
result_serialization_method="pickle",
clear_constant_data=True)
else:
worker_pool = worker_pool_with_gpu_assignments_from_args(args)
print("Worker pool", worker_pool)
assert worker_pool is not None
for item in work_items:
item['constant_data'] = GLOBAL_DATA
results = worker_pool.imap_unordered(
partial(call_wrapped_kwargs, do_class1_affinity_calibrate_percentile_ranks),
work_items,
chunksize=1)
summary_results_lists = collections.defaultdict(list)
for work_item in tqdm.tqdm(results, total=len(work_items)):
for (transforms, summary_results) in work_item:
predictor.allele_to_percent_rank_transform.update(transforms)
if summary_results is not None:
for (item, value) in summary_results.items():
summary_results_lists[item].append(value)
print("Done calibrating %d alleles." % len(alleles))
if summary_results_lists:
for (name, lst) in summary_results_lists.items():
df = pandas.concat(lst, ignore_index=True)
predictor.metadata_dataframes[name] = df
print("Including summary result: %s" % name)
print(df)
predictor.save(args.models_dir, model_names_to_write=[])
percent_rank_calibration_time = time.time() - start
if worker_pool:
worker_pool.close()
worker_pool.join()
print("Percent rank calibration time: %0.2f min." % (
percent_rank_calibration_time / 60.0))
print("Predictor written to: %s" % args.models_dir)
def do_class1_affinity_calibrate_percentile_ranks(
alleles, constant_data=GLOBAL_DATA):
if 'predictor' not in constant_data:
raise ValueError("No predictor provided: " + str(constant_data))
result_list = []
for (i, allele) in enumerate(alleles):
print("Processing allele", i + 1, "of", len(alleles))
result_item = class1_affinity_calibrate_percentile_ranks(
allele,
constant_data['predictor'],
peptides=constant_data['calibration_peptides'],
**constant_data["args"])
result_list.append(result_item)
return result_list
def class1_affinity_calibrate_percentile_ranks(
allele,
predictor,
peptides=None,
motif_summary=False,
summary_top_peptide_fractions=[0.001],
verbose=False,
model_kwargs={}):
if verbose:
print("Calibrating", allele)
predictor.optimize() # since we loaded with optimization_level=0
start = time.time()
summary_results = predictor.calibrate_percentile_ranks(
peptides=peptides,
alleles=[allele],
motif_summary=motif_summary,
summary_top_peptide_fractions=summary_top_peptide_fractions,
verbose=verbose,
model_kwargs=model_kwargs)
if verbose:
print("Done calibrating", allele, "in", time.time() - start, "sec")
transforms = {
allele: predictor.allele_to_percent_rank_transform[allele],
}
return (transforms, summary_results)
if __name__ == '__main__':
run()
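# Illustrative invocation sketch (not part of the original module). The flags
# match the parser defined above; the models directory is hypothetical and must
# already contain a trained class1 affinity predictor.
#
#     run([
#         "--models-dir", "./models",
#         "--num-peptides-per-length", "1000",
#         "--motif-summary",
#     ])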
8 file diffs are collapsed and not shown here.
"""
Layer-sequential unit-variance initialization for neural networks.
See:
Mishkin and Matas, "All you need is a good init". 2016.
https://arxiv.org/abs/1511.06422
"""
#
# LSUV initialization code in this file is adapted from:
# https://github.com/ducha-aiki/LSUV-keras/blob/master/lsuv_init.py
# by Dmytro Mishkin
#
# Here is the license for the original code:
#
#
# Copyright (C) 2017, Dmytro Mishkin
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the
# distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from __future__ import print_function
import numpy
from .common import configure_tensorflow
def svd_orthonormal(shape):
# Orthonormal init code is from Lasagne
# https://github.com/Lasagne/Lasagne/blob/master/lasagne/init.py
if len(shape) < 2:
raise RuntimeError("Only shapes of length 2 or more are supported.")
flat_shape = (shape[0], numpy.prod(shape[1:]))
a = numpy.random.standard_normal(flat_shape).astype("float32")
u, _, v = numpy.linalg.svd(a, full_matrices=False)
q = u if u.shape == flat_shape else v
q = q.reshape(shape)
return q
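# Illustrative check (not part of the original module): the returned matrix has
# orthonormal columns, which is the property the LSUV procedure below relies on.
def _demo_svd_orthonormal():
    q = svd_orthonormal((64, 32))
    print(numpy.allclose(q.T @ q, numpy.eye(32), atol=1e-5))  # True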
def get_activations(model, layer, X_batch):
configure_tensorflow()
from tensorflow.keras.models import Model
intermediate_layer_model = Model(
inputs=model.get_input_at(0),
outputs=layer.get_output_at(0)
)
activations = intermediate_layer_model.predict(X_batch)
return activations
def lsuv_init(model, batch, verbose=True, margin=0.1, max_iter=100):
"""
Initialize neural network weights using layer-sequential unit-variance
initialization.
See:
Mishkin and Matas, "All you need is a good init". 2016.
https://arxiv.org/abs/1511.06422
Parameters
----------
model : keras.Model
batch : dict
Training data, as would be passed to keras.Model.fit()
verbose : boolean
Whether to print progress to stdout
margin : float
Tolerated absolute deviation of each layer's output variance from 1.0
max_iter : int
Maximum number of variance-rescaling iterations per layer
Returns
-------
keras.Model
Same as what was passed in.
"""
configure_tensorflow()
from tensorflow.keras.layers import Dense, Convolution2D
needed_variance = 1.0
layers_initialized = 0
for layer in model.layers:
if not isinstance(layer, (Dense, Convolution2D)):
continue
# Skip small layers, whose activation variance is close to zero,
# especially for small batches.
if numpy.prod(layer.get_output_shape_at(0)[1:]) < 32:
if verbose:
print('LSUV initialization skipping', layer.name)
continue
layers_initialized += 1
weights_and_biases = layer.get_weights()
weights_and_biases[0] = svd_orthonormal(weights_and_biases[0].shape)
layer.set_weights(weights_and_biases)
activations = get_activations(model, layer, batch)
variance = numpy.var(activations)
iteration = 0
if verbose:
print(layer.name, variance)
while abs(needed_variance - variance) > margin:
if verbose:
print(
'LSUV initialization',
layer.name,
iteration,
needed_variance,
margin,
variance)
if numpy.abs(numpy.sqrt(variance)) < 1e-7:
break # avoid zero division
weights_and_biases = layer.get_weights()
weights_and_biases[0] /= numpy.sqrt(variance) / numpy.sqrt(
needed_variance)
layer.set_weights(weights_and_biases)
activations = get_activations(model, layer, batch)
variance = numpy.var(activations)
iteration += 1
if iteration >= max_iter:
break
if verbose:
print('Done with LSUV: total layers initialized', layers_initialized)
return model
\ No newline at end of file
4 file diffs are collapsed and not shown here.
"""
Measures of centrality (e.g. mean) used to combine predictions across an
ensemble. The inputs to these functions are log affinities, and they are expected
to return a centrality measure, also in log space.
"""
import numpy
from functools import partial
def robust_mean(log_values):
"""
Mean of values falling within the 25-75 percentiles.
Parameters
----------
log_values : 2-d numpy.array
Center is computed along the second axis (i.e. per row).
Returns
-------
center : numpy.array of length log_values.shape[0]
"""
if log_values.shape[1] <= 3:
# Too few values to use robust mean.
return numpy.nanmean(log_values, axis=1)
without_nans = numpy.nan_to_num(log_values) # replace nan with 0
mask = (
(~numpy.isnan(log_values)) &
(without_nans <= numpy.nanpercentile(log_values, 75, axis=1).reshape((-1, 1))) &
(without_nans >= numpy.nanpercentile(log_values, 25, axis=1).reshape((-1, 1))))
return (without_nans * mask.astype(float)).sum(1) / mask.sum(1)
CENTRALITY_MEASURES = {
"mean": partial(numpy.nanmean, axis=1),
"median": partial(numpy.nanmedian, axis=1),
"robust_mean": robust_mean,
}
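# Illustrative sketch (not part of the original module): combine four ensemble
# members' log affinities for two peptides. In the first row the extreme value
# falls outside the 25-75 percentile band and is excluded by robust_mean.
def _demo_centrality():
    log_values = numpy.array([
        [2.0, 2.1, 1.9, 9.0],        # one outlying ensemble member
        [3.0, numpy.nan, 3.2, 2.8],  # one missing prediction
    ])
    print(robust_mean(log_values))                # approximately [2.05, 3.0]
    print(CENTRALITY_MEASURES["mean"](log_values))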
2 file diffs are collapsed and not shown here.