small fixes

5d0bf7d5 · Tim O'Donnell · 5ecb11c7 · 5d0bf7d5 · 5d0bf7d5 · 5d0bf7d5
Commit 5d0bf7d5 authored 8 years ago by Tim O'Donnell
--- a/examples/class1_allele_specific_models.ipynb
+++ b/examples/class1_allele_specific_models.ipynb
@@ -2,21 +2,11 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using Theano backend.\n",
-      "/Users/tim/miniconda3/envs/py3k/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.\n",
-      "  warnings.warn(self.msg_depr % (key, alt_key))\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "import mhcflurry\n",
    "import numpy\n",

 %% Cell type:code id: tags:

 ``` python
 import mhcflurry
 import numpy
 import seaborn
 import logging
 from matplotlib import pyplot

 % matplotlib inline
 logging.basicConfig(level="DEBUG")
 ```

-%% Output
-
-    Using Theano backend.
-    /Users/tim/miniconda3/envs/py3k/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
-      warnings.warn(self.msg_depr % (key, alt_key))
-
 %% Cell type:markdown id: tags:

 # Making predictions
 Note: if you haven't already, run `mhcflurry-downloads fetch` in a shell to download the trained models.

 %% Cell type:markdown id: tags:

 ## Simplest way to run predictions: `mhcflurry.predict()`

 %% Cell type:code id: tags:

 ``` python
 help(mhcflurry.predict)
 ```

 %% Output

    Help on function predict in module mhcflurry.predict:
    
    predict(alleles, peptides, loaders=None)
        Make predictions across all combinations of the specified alleles and
        peptides.
    
        Parameters
        ----------
        alleles : list of str
            Names of alleles to make predictions for.
    
        peptides : list of str
            Peptide amino acid sequences.
    
        loaders : list of Class1AlleleSpecificPredictorLoader, optional
            Loaders to try. Will be tried in the order given.
    
        Returns DataFrame with columns "Allele", "Peptide", and "Prediction"
    

 %% Cell type:code id: tags:

 ``` python
 mhcflurry.predict(alleles=["HLA-A0201"], peptides=["SIINFEKL", "SIINFEQL"])
 ```

 %% Output

          Allele   Peptide    Prediction
    0  HLA-A0201  SIINFEKL  10672.347656
    1  HLA-A0201  SIINFEQL   7828.974121

 %% Cell type:markdown id: tags:



 %% Cell type:markdown id: tags:

 ## Instantiating a model

 %% Cell type:code id: tags:

 ``` python
 model = mhcflurry.class1_allele_specific.load.from_allele_name("HLA-A0201")
 model.predict(["SIINFEKL", "SIQNPEKP", "SYNFPEPI"])
 ```

 %% Output

    array([ 10672.34765625,  30577.02539062,  10565.78222656], dtype=float32)

 %% Cell type:markdown id: tags:



 %% Cell type:markdown id: tags:

 ## Instantiating a model from a custom set of models on disk

 %% Cell type:code id: tags:

 ``` python
 models_dir = mhcflurry.downloads.get_path("models_class1_allele_specific_single")
 models_dir
 ```

 %% Output

    '/Users/tim/Library/Application Support/mhcflurry/4/0.0.8/models_class1_allele_specific_single/'

 %% Cell type:code id: tags:

 ``` python
 # Make a Loader first
 loader = mhcflurry.class1_allele_specific.load.Class1AlleleSpecificPredictorLoader(models_dir)
 model = loader.from_allele_name("HLA-A0201")
 model.predict(["SIINFEKL", "SIQNPEKP", "SYNFPEPI"])
 ```

 %% Output

    array([ 10672.34765625,  30577.02539062,  10565.78222656], dtype=float32)

 %% Cell type:markdown id: tags:



 %% Cell type:markdown id: tags:

 # Loading a `Dataset`

 %% Cell type:code id: tags:

 ``` python
 full_training_data = mhcflurry.dataset.Dataset.from_csv(
    mhcflurry.downloads.get_path("data_combined_iedb_kim2014", "combined_human_class1_dataset.csv"))
 full_training_data
 ```

 %% Output

    Dataset(n=192550, alleles=['ELA-A1', 'Gogo-B0101', 'H-2-DB', 'H-2-DD', 'H-2-KB', 'H-2-KBM8', 'H-2-KD', 'H-2-KK', 'H-2-LD', 'H-2-LQ', 'HLA-A0101', 'HLA-A0201', 'HLA-A0202', 'HLA-A0203', 'HLA-A0204', 'HLA-A0205', 'HLA-A0206', 'HLA-A0207', 'HLA-A0210', 'HLA-A0211', 'HLA-A0212', 'HLA-A0216', 'HLA-A0217', 'HLA-A0219', 'HLA-A0250', 'HLA-A0301', 'HLA-A0302', 'HLA-A0319', 'HLA-A1', 'HLA-A11', 'HLA-A1101', 'HLA-A1102', 'HLA-A2', 'HLA-A2301', 'HLA-A24', 'HLA-A2402', 'HLA-A2403', 'HLA-A2501', 'HLA-A26', 'HLA-A2601', 'HLA-A2602', 'HLA-A2603', 'HLA-A2902', 'HLA-A3', 'HLA-A3/11', 'HLA-A3001', 'HLA-A3002', 'HLA-A3101', 'HLA-A3201', 'HLA-A3207', 'HLA-A3215', 'HLA-A3301', 'HLA-A6601', 'HLA-A6801', 'HLA-A6802', 'HLA-A6823', 'HLA-A6901', 'HLA-A7401', 'HLA-A8001', 'HLA-B0702', 'HLA-B0801', 'HLA-B0802', 'HLA-B0803', 'HLA-B1401', 'HLA-B1402', 'HLA-B1501', 'HLA-B1502', 'HLA-B1503', 'HLA-B1509', 'HLA-B1517', 'HLA-B1542', 'HLA-B1801', 'HLA-B27', 'HLA-B2701', 'HLA-B2702', 'HLA-B2703', 'HLA-B2704', 'HLA-B2705', 'HLA-B2706', 'HLA-B2710', 'HLA-B2720', 'HLA-B3501', 'HLA-B3503', 'HLA-B3508', 'HLA-B3701', 'HLA-B3801', 'HLA-B39', 'HLA-B3901', 'HLA-B40', 'HLA-B4001', 'HLA-B4002', 'HLA-B4013', 'HLA-B4201', 'HLA-B4202', 'HLA-B44', 'HLA-B4402', 'HLA-B4403', 'HLA-B4501', 'HLA-B4506', 'HLA-B4601', 'HLA-B4801', 'HLA-B51', 'HLA-B5101', 'HLA-B5201', 'HLA-B5301', 'HLA-B5401', 'HLA-B5701', 'HLA-B5702', 'HLA-B5703', 'HLA-B58', 'HLA-B5801', 'HLA-B5802', 'HLA-B60', 'HLA-B62', 'HLA-B7', 'HLA-B7301', 'HLA-B8', 'HLA-B8101', 'HLA-B8301', 'HLA-BOLA102101', 'HLA-BOLA200801', 'HLA-BOLA201201', 'HLA-BOLA402401', 'HLA-BOLA601301', 'HLA-BOLA601302', 'HLA-BOLAHD6', 'HLA-C0303', 'HLA-C0401', 'HLA-C0501', 'HLA-C0602', 'HLA-C0702', 'HLA-C0802', 'HLA-C1', 'HLA-C1203', 'HLA-C1402', 'HLA-C1502', 'HLA-C4', 'HLA-E0101', 'HLA-E0103', 'HLA-EQCA100101', 'HLA-RT1A', 'HLA-RT1BL', 'HLA-SLA10401', 'Mamu-A01', 'Mamu-A02', 'Mamu-A07', 'Mamu-A100101', 'Mamu-A100201', 'Mamu-A101101', 'Mamu-A11', 'Mamu-A20102', 'Mamu-A2201', 
'Mamu-A2601', 'Mamu-A70103', 'Mamu-B01', 'Mamu-B01704', 'Mamu-B03', 'Mamu-B04', 'Mamu-B06502', 'Mamu-B08', 'Mamu-B1001', 'Mamu-B17', 'Mamu-B3901', 'Mamu-B52', 'Mamu-B6601', 'Mamu-B8301', 'Mamu-B8701', 'Patr-A0101', 'Patr-A0301', 'Patr-A0401', 'Patr-A0602', 'Patr-A0701', 'Patr-A0901', 'Patr-B0101', 'Patr-B0901', 'Patr-B1301', 'Patr-B1701', 'Patr-B2401'])

 %% Cell type:code id: tags:

 ``` python
 kim2014_full = mhcflurry.dataset.Dataset.from_csv(
    mhcflurry.downloads.get_path("data_kim2014", "bdata.20130222.mhci.public.1.txt"))

 kim2014_train = mhcflurry.dataset.Dataset.from_csv(
    mhcflurry.downloads.get_path("data_kim2014", "bdata.2009.mhci.public.1.txt"))
 kim2014_test = mhcflurry.dataset.Dataset.from_csv(
    mhcflurry.downloads.get_path("data_kim2014", "bdata.2013.mhci.public.blind.1.txt"))

 len(kim2014_full), len(kim2014_train), len(kim2014_test)
 ```

 %% Output

    (179692, 137654, 27680)

 %% Cell type:markdown id: tags:



 %% Cell type:markdown id: tags:

 # Predicting affinities from a `Dataset`

 %% Cell type:code id: tags:

 ``` python
 model = mhcflurry.class1_allele_specific.load.from_allele_name("HLA-A0201")
 model.predict(kim2014_train.get_allele("HLA-A0201").peptides)
 ```

 %% Output

    array([  3514.14550781,  12429.5390625 ,   4227.02197266, ...,
             5949.32763672,  17837.0859375 ,   6724.96728516], dtype=float32)

 %% Cell type:markdown id: tags:



 %% Cell type:markdown id: tags:

 # Fit a model

 %% Cell type:code id: tags:

 ``` python
 help(mhcflurry.class1_allele_specific.Class1BindingPredictor)
 ```

 %% Output

    Help on class Class1BindingPredictor in module mhcflurry.class1_allele_specific.class1_binding_predictor:
    
    class Class1BindingPredictor(mhcflurry.class1_allele_specific.class1_allele_specific_kmer_ic50_predictor_base.Class1AlleleSpecificKmerIC50PredictorBase)
     |  Allele-specific Class I MHC binding predictor which uses
     |  fixed-length (k-mer) index encoding for inputs and outputs
     |  a value between 0 and 1 (where 1 is the strongest binder).
     |
     |  Method resolution order:
     |      Class1BindingPredictor
     |      mhcflurry.class1_allele_specific.class1_allele_specific_kmer_ic50_predictor_base.Class1AlleleSpecificKmerIC50PredictorBase
     |      mhcflurry.ic50_predictor_base.IC50PredictorBase
     |      builtins.object
     |
     |  Methods defined here:
     |
     |  __getstate__(self)
     |
     |  __init__(self, model=None, name=None, max_ic50=50000.0, allow_unknown_amino_acids=True, kmer_size=9, n_amino_acids=20, verbose=False, **hyperparameters)
     |      Initialize self.  See help(type(self)) for accurate signature.
     |
     |  __setstate__(self, state)
     |
     |  fit_kmer_encoded_arrays(self, X, ic50, sample_weights=None, right_censoring_mask=None, X_pretrain=None, ic50_pretrain=None, sample_weights_pretrain=None, n_random_negative_samples=None, pretrain_decay=None, n_training_epochs=None, batch_size=None, verbose=False)
     |      Train predictive model from index encoding of fixed length k-mer
     |      peptides.
     |
     |      Parameters
     |      ----------
     |      X : array
     |          Training data with shape (n_samples, n_dims)
     |
     |      ic50 : array
     |          Training IC50 values with shape (n_samples,)
     |
     |      sample_weights : array
     |          Weight of each training sample with shape (n_samples,)
     |
     |      right_censoring_mask : array, optional
     |          Boolean array which indicates whether each IC50 value is actually
     |          right censored (a lower bound on the true value). Censored values
     |          are transformed during training by sampling between the observed
     |          and maximum values on each iteration.
     |
     |      X_pretrain : array
     |          Extra samples used for soft pretraining of the predictor,
     |          should have same number of dimensions as X.
     |          During training the weights of these samples will decay after
     |          each epoch.
     |
     |      ic50_pretrain : array
     |          IC50 values for extra samples, shape
     |
     |      pretrain_decay : int -> float function
     |          decay function for pretraining, mapping epoch number to decay
     |          factor
     |
     |      sample_weights_pretrain : array
     |          Initial weights for the rows of X_pretrain. If not specified then
     |          initialized to ones.
     |
     |      n_random_negative_samples : int
     |          Number of random samples to generate as negative examples.
     |
     |      n_training_epochs : int
     |
     |      verbose : bool
     |
     |      batch_size : int
     |
     |  get_weights(self)
     |      Returns weights, which can be passed to set_weights later.
     |
     |  predict_ic50_for_kmer_encoded_array(self, X)
     |      Given an encoded array of amino acid indices,
     |      returns a vector of IC50 predictions.
     |
     |  predict_scores_for_kmer_encoded_array(self, X)
     |      Given an encoded array of amino acid indices, returns a vector
     |      of affinity scores (values between 0 and 1).
     |
     |  set_weights(self, weights)
     |      Reset the model weights.
     |
     |  ----------------------------------------------------------------------
     |  Data and other attributes defined here:
     |
     |  fit_hyperparameter_defaults = <mhcflurry.hyperparameters.Hyperparamete...
     |
     |  hyperparameter_defaults = <mhcflurry.hyperparameters.HyperparameterDef...
     |
     |  network_hyperparameter_defaults = <mhcflurry.hyperparameters.Hyperpara...
     |
     |  ----------------------------------------------------------------------
     |  Methods inherited from mhcflurry.class1_allele_specific.class1_allele_specific_kmer_ic50_predictor_base.Class1AlleleSpecificKmerIC50PredictorBase:
     |
     |  __repr__(self)
     |      Return repr(self).
     |
     |  __str__(self)
     |      Return str(self).
     |
     |  encode_peptides(self, peptides)
     |      Parameters
     |      ----------
     |      peptides : str list
     |          Peptide strings of any length
     |
     |      Encode peptides of any length into fixed length vectors.
     |      Returns 2d array of encoded peptides and 1d array indicating the
     |      original peptide index for each row.
     |
     |  fit_dataset(self, dataset, pretraining_dataset=None, sample_censored_affinities=False, **kwargs)
     |      Fit the model parameters on the given training data.
     |
     |      Parameters
     |      ----------
     |      dataset : Dataset
     |
     |      pretraining_dataset : Dataset
     |
     |      sample_censored_affinities : bool
     |          If a column named 'inequality' is in the Dataset then every
     |          peptide with a value of '>' on each training epoch, gets a
     |          randomly sampled IC50 between its observed value and the
     |          max_ic50 of the predictor. Default is False.
     |
     |      **kwargs : dict
     |          Extra arguments are passed on to the fit_kmer_encoded_arrays()
     |          method.
     |
     |  predict_ic50_for_kmer_peptides(self, peptides)
     |
     |  predict_scores(self, peptides, combine_fn=<function mean at 0x109180ae8>)
     |      Given a list of peptides of any length, returns an array of predicted
     |      normalized affinity values. Unlike IC50, a higher value here
     |      means a stronger affinity. Peptides of lengths other than 9 are
     |      transformed into a set of k-mers either by deleting or inserting
     |      amino acid characters. The prediction for a single peptide will be
     |      the average of expanded k-mers.
     |
     |  predict_scores_for_kmer_peptides(self, peptides)
     |      Predict binding affinity for 9mer peptides
     |
     |  ----------------------------------------------------------------------
     |  Data descriptors inherited from mhcflurry.class1_allele_specific.class1_allele_specific_kmer_ic50_predictor_base.Class1AlleleSpecificKmerIC50PredictorBase:
     |
     |  amino_acids
     |      Amino acid alphabet used for encoding peptides, may include
     |      "X" if allow_unknown_amino_acids is True.
     |
     |  max_amino_acid_encoding_value
     |
     |  ----------------------------------------------------------------------
     |  Methods inherited from mhcflurry.ic50_predictor_base.IC50PredictorBase:
     |
     |  fit_dictionary(self, peptide_to_ic50_dict, **kwargs)
     |      Fit the model parameters using the given peptide->IC50 dictionary,
     |      all samples are given the same weight.
     |
     |      Parameters
     |      ----------
     |      peptide_to_ic50_dict : dict
     |          Dictionary that maps peptides to IC50 values.
     |
     |  fit_sequences(self, peptides, affinities, sample_weights=None, alleles=None, **kwargs)
     |
     |  predict(self, peptides)
     |      Predict IC50 affinities for peptides of any length
     |
     |  ----------------------------------------------------------------------
     |  Data descriptors inherited from mhcflurry.ic50_predictor_base.IC50PredictorBase:
     |
     |  __dict__
     |      dictionary for instance variables (if defined)
     |
     |  __weakref__
     |      list of weak references to the object (if defined)
    

 %% Cell type:code id: tags:

 ``` python
 train_data = kim2014_train.get_allele("HLA-A3301")
 train_data
 ```

 %% Output

    Dataset(n=3040, alleles=['HLA-A3301'])

 %% Cell type:code id: tags:

 ``` python
 # We'll use the default hyper parameters here. Could also specify them as kwargs:
 new_model = mhcflurry.class1_allele_specific.Class1BindingPredictor()
 new_model.hyperparameters
 ```

 %% Output

    {'activation': 'tanh',
     'batch_normalization': True,
     'batch_size': 128,
     'dropout_probability': 0.0,
     'embedding_output_dim': 32,
     'fraction_negative': 0.0,
     'init': 'glorot_uniform',
     'kmer_size': 9,
     'layer_sizes': [64],
     'loss': 'mse',
     'max_ic50': 50000.0,
     'n_training_epochs': 250,
     'optimizer': 'rmsprop',
     'output_activation': 'sigmoid',
     'pretrain_decay': 'numpy.exp(-epoch)'}

 %% Cell type:code id: tags:

 ``` python
 # This will run faster if you have a GPU.
 %time new_model.fit_dataset(train_data)
 ```

 %% Output

    CPU times: user 1min 22s, sys: 1.13 s, total: 1min 24s
    Wall time: 45.8 s

 %% Cell type:markdown id: tags:



 %% Cell type:markdown id: tags:

 ## Evaluate the fit model on held-out test data

 %% Cell type:markdown id: tags:

 ### Generate predictions

 %% Cell type:code id: tags:

 ``` python
 test_data = kim2014_test.get_allele("HLA-A3301")
 predictions = new_model.predict(test_data.peptides)

 seaborn.set_context('notebook')
 seaborn.regplot(numpy.log10(test_data.affinities), numpy.log10(predictions))
 pyplot.xlim(xmin=0)
 pyplot.ylim(ymin=0)
 pyplot.xlabel("Measured affinity (log10 nM)")
 pyplot.ylabel("Predicted affinity (log10 nM)")
 pyplot.title("MHCflurry on test data")
 ```

 %% Output

    <matplotlib.text.Text at 0x122e50400>



 %% Cell type:markdown id: tags:



 %% Cell type:markdown id: tags:

 ### Calculate AUC, F1, and Kendall's Tau scores

 %% Cell type:code id: tags:

 ``` python
 help(mhcflurry.class1_allele_specific.scoring.make_scores)
 ```

 %% Output

    Help on function make_scores in module mhcflurry.class1_allele_specific.scoring:
    
    make_scores(ic50_y, ic50_y_pred, sample_weight=None, threshold_nm=500, max_ic50=50000)
        Calculate AUC, F1, and Kendall Tau scores.
    
        Parameters
        -----------
        ic50_y : float list
            true IC50s (i.e. affinities)
    
        ic50_y_pred : float list
            predicted IC50s
    
        sample_weight : float list [optional]
    
        threshold_nm : float [optional]
    
        max_ic50 : float [optional]
    
        Returns
        -----------
        dict with entries "auc", "f1", "tau"
    

 %% Cell type:code id: tags:

 ``` python
 mhcflurry.class1_allele_specific.scoring.make_scores(test_data.affinities, predictions)
 ```

 %% Output

    {'auc': 0.84099099099099106,
     'f1': 0.65531914893617027,
     'tau': 0.43387627983717181}

 %% Cell type:markdown id: tags:



 %% Cell type:markdown id: tags:

 ## Cross validation for hyperparameter selection

 %% Cell type:code id: tags:

 ``` python
 help(mhcflurry.class1_allele_specific.cross_validation.cross_validation_folds)
 ```

 %% Output

    Help on function cross_validation_folds in module mhcflurry.class1_allele_specific.cross_validation:
    
    cross_validation_folds(train_data, alleles=None, n_folds=3, drop_similar_peptides=False, imputer=None, impute_kwargs={'min_observations_per_allele': 2, 'min_observations_per_peptide': 2}, parallel_backend=None)
        Split a Dataset into n_folds cross validation folds for each allele,
        optionally performing imputation.
    
        Parameters
        -----------
        train_data : mhcflurry.Dataset
    
        alleles : string list, optional
            Alleles to run cross validation on. Default: all alleles in
            train_data.
    
        n_folds : int, optional
            Number of cross validation folds for each allele.
    
        drop_similar_peptides : boolean, optional
            For each fold, remove peptides from the test data that are similar
            to peptides in the train data. Similarity is defined as in the
            similar_peptides function.
    
        imputer : fancyimpute.Solver, optional
            Imputer to use. If not specified, no imputation is done.
    
        impute_kwargs : dict, optional
            Additional kwargs to pass to mhcflurry.Dataset.impute_missing_values.
    
        parallel_backend : mhcflurry.parallelism.ParallelBackend, optional
            Futures implementation to use for running on multiple threads,
            processes, or nodes
    
        Returns
        -----------
        list of AlleleSpecificTrainTestFold of length num alleles * n_folds
    

 %% Cell type:code id: tags:

 ``` python
 folds = mhcflurry.class1_allele_specific.cross_validation.cross_validation_folds(train_data)
 folds
 ```

 %% Output

    [AlleleSpecificTrainTestFold(allele='HLA-A3301', train=Dataset(n=2026, alleles=['HLA-A3301']), imputed_train=None, test=Dataset(n=1014, alleles=['HLA-A3301'])),
     AlleleSpecificTrainTestFold(allele='HLA-A3301', train=Dataset(n=2027, alleles=['HLA-A3301']), imputed_train=None, test=Dataset(n=1013, alleles=['HLA-A3301'])),
     AlleleSpecificTrainTestFold(allele='HLA-A3301', train=Dataset(n=2027, alleles=['HLA-A3301']), imputed_train=None, test=Dataset(n=1013, alleles=['HLA-A3301']))]

 %% Cell type:code id: tags:

 ``` python
 # Take a look at what hyperparameters are available for searching over.
 mhcflurry.class1_allele_specific.train.HYPERPARAMETER_DEFAULTS.defaults
 ```

 %% Output

    {'activation': 'tanh',
     'batch_normalization': True,
     'batch_size': 128,
     'dropout_probability': 0.0,
     'embedding_output_dim': 32,
     'fraction_negative': 0.0,
     'impute': False,
     'init': 'glorot_uniform',
     'kmer_size': 9,
     'layer_sizes': [64],
     'loss': 'mse',
     'max_ic50': 50000.0,
     'n_training_epochs': 250,
     'optimizer': 'rmsprop',
     'output_activation': 'sigmoid',
     'pretrain_decay': 'numpy.exp(-epoch)'}

 %% Cell type:code id: tags:

 ``` python
 models_to_search = mhcflurry.class1_allele_specific.train.HYPERPARAMETER_DEFAULTS.models_grid(
    fraction_negative=[.1],
    layer_sizes=[[8], [12]])
 print("Searching over %d models." % len(models_to_search))
 print("First model: \n%s" % models_to_search[0])
 ```

 %% Output

    Searching over 2 models.
    First model:
    {'output_activation': 'sigmoid', 'pretrain_decay': 'numpy.exp(-epoch)', 'n_training_epochs': 250, 'embedding_output_dim': 32, 'optimizer': 'rmsprop', 'loss': 'mse', 'fraction_negative': 0.1, 'batch_normalization': True, 'dropout_probability': 0.0, 'init': 'glorot_uniform', 'activation': 'tanh', 'batch_size': 128, 'impute': False, 'kmer_size': 9, 'max_ic50': 50000.0, 'layer_sizes': [8]}

 %% Cell type:code id: tags:

 ``` python
 help(mhcflurry.class1_allele_specific.train.train_across_models_and_folds)
 ```

 %% Output

    Help on function train_across_models_and_folds in module mhcflurry.class1_allele_specific.train:
    
    train_across_models_and_folds(folds, model_descriptions, cartesian_product_of_folds_and_models=True, return_predictors=False, folds_per_task=1, parallel_backend=None)
        Train and optionally test any number of models across any number of folds.
    
        Parameters
        -----------
        folds : list of AlleleSpecificTrainTestFold
    
        model_descriptions : list of dict
            Models to test
    
        cartesian_product_of_folds_and_models : boolean, optional
            If true, then a predictor is trained for each fold and model
            description.
            If false, then len(folds) must equal len(model_descriptions), and
            the i'th model is trained on the i'th fold.
    
        return_predictors : boolean, optional
            Include the trained predictors in the result.
    
        parallel_backend : mhcflurry.parallelism.ParallelBackend, optional
            Futures implementation to use for running on multiple threads,
            processes, or nodes
    
        Returns
        -----------
        pandas.DataFrame
    

 %% Cell type:code id: tags:

 ``` python
 results_df = mhcflurry.class1_allele_specific.train.train_across_models_and_folds(
    folds,
    models_to_search,
    return_predictors=True)
 results_df
 ```

 %% Output

          allele  fold_num  model_num  train_size  test_size imputed_train_size  \
    0  HLA-A3301         0          0        2026       1014               None
    1  HLA-A3301         0          1        2026       1014               None
    2  HLA-A3301         1          0        2027       1013               None
    3  HLA-A3301         1          1        2027       1013               None
    4  HLA-A3301         2          0        2027       1013               None
    5  HLA-A3301         2          1        2027       1013               None
    
       train_tau  train_auc  train_f1  test_tau        ...         \
    0   0.710233   0.989589  0.902256  0.429803        ...
    1   0.747597   0.993938  0.919708  0.425610        ...
    2   0.705507   0.990185  0.882466  0.430678        ...
    3   0.745532   0.993875  0.924812  0.395103        ...
    4   0.709275   0.992395  0.894531  0.441365        ...
    5   0.743498   0.994674  0.873518  0.439221        ...
    
       model_fraction_negative  model_batch_normalization  \
    0                      0.1                       True
    1                      0.1                       True
    2                      0.1                       True
    3                      0.1                       True
    4                      0.1                       True
    5                      0.1                       True
    
      model_dropout_probability      model_init  model_activation  \
    0                       0.0  glorot_uniform              tanh
    1                       0.0  glorot_uniform              tanh
    2                       0.0  glorot_uniform              tanh
    3                       0.0  glorot_uniform              tanh
    4                       0.0  glorot_uniform              tanh
    5                       0.0  glorot_uniform              tanh
    
      model_batch_size model_impute  model_kmer_size  model_max_ic50  \
    0              128        False                9         50000.0
    1              128        False                9         50000.0
    2              128        False                9         50000.0
    3              128        False                9         50000.0
    4              128        False                9         50000.0
    5              128        False                9         50000.0
    
      model_layer_sizes
    0               [8]
    1              [12]
    2               [8]
    3              [12]
    4               [8]
    5              [12]
    
    [6 rows x 31 columns]

 %% Cell type:code id: tags:

 ``` python
 # The trained predictors are in the 'predictor' column
 results_df.predictor
 ```

 %% Output

    0    Class1BindingPredictor(name=None, max_ic50=500...
    1    Class1BindingPredictor(name=None, max_ic50=500...
    2    Class1BindingPredictor(name=None, max_ic50=500...
    3    Class1BindingPredictor(name=None, max_ic50=500...
    4    Class1BindingPredictor(name=None, max_ic50=500...
    5    Class1BindingPredictor(name=None, max_ic50=500...
    Name: predictor, dtype: object

 %% Cell type:code id: tags:

 ``` python
 # Which model had the best average AUC across folds?
 results_df.groupby("model_num").test_auc.mean()
 ```

 %% Output

    model_num
    0    0.859859
    1    0.847004
    Name: test_auc, dtype: float64

--- a/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_released.py
+++ b/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_released.py
@@ -21,15 +21,22 @@ class MHCflurryReleased(PresentationComponentModel):
    random_peptides_for_percent_rank : list of string
        If specified, then percentile rank will be calibrated and emitted
        using the given peptides.
+
+    predictor : Class1EnsembleMultiAllelePredictor-like object
+        Predictor to use.
    """

    def __init__(
            self,
            experiment_to_alleles,
            random_peptides_for_percent_rank=None,
+            predictor=None,
+            predictor_name="mhcflurry_released",
            **kwargs):
        PresentationComponentModel.__init__(self, **kwargs)
        self.experiment_to_alleles = experiment_to_alleles
+        self.predictor = predictor
+        self.predictor_name = predictor_name
        if random_peptides_for_percent_rank is None:
            self.percent_rank_transforms = None
            self.random_peptides_for_percent_rank = None
@@ -39,9 +46,9 @@ class MHCflurryReleased(PresentationComponentModel):
                random_peptides_for_percent_rank)

    def column_names(self):
-        columns = ['mhcflurry_released_affinity']
+        columns = [self.predictor_name + '_affinity']
        if self.percent_rank_transforms is not None:
-            columns.append('mhcflurry_released_percentile_rank')
+            columns.append(self.predictor_name + '_percentile_rank')
        return columns

    def requires_fitting(self):
@@ -63,11 +70,14 @@ class MHCflurryReleased(PresentationComponentModel):
            normalize_allele_name(allele)
            for allele in alleles
        ]
-        df = predict(alleles, numpy.unique(numpy.array(peptides)))
+        df = predict(
+            alleles,
+            numpy.unique(numpy.array(peptides)),
+            predictor=self.predictor)
        pivoted = df.pivot(index='Peptide', columns='Allele')
        pivoted.columns = pivoted.columns.droplevel()
        result = {
-            'mhcflurry_released_affinity': (
+            self.predictor_name + '_affinity': (
                pivoted.min(axis=1).ix[peptides].values)
        }
        if self.percent_rank_transforms is not None:
@@ -77,7 +87,7 @@ class MHCflurryReleased(PresentationComponentModel):
                percentile_ranks[allele] = (
                    self.percent_rank_transforms[allele]
                    .transform(pivoted[allele].values))
-            result['mhcflurry_released_percentile_rank'] = (
+            result[self.predictor_name + '_percentile_rank'] = (
                percentile_ranks.min(axis=1).ix[peptides].values)
        return result


--- a/mhcflurry/class1_allele_specific/__init__.py
+++ b/mhcflurry/class1_allele_specific/__init__.py
@@ -3,7 +3,11 @@ from __future__ import absolute_import
 from .class1_binding_predictor import Class1BindingPredictor
 from .train import train_across_models_and_folds, AlleleSpecificTrainTestFold
 from .cross_validation import cross_validation_folds
-from .class1_single_model_multi_allele_predictor import from_allele_name, supported_alleles
+from .class1_single_model_multi_allele_predictor import (
+    from_allele_name,
+    supported_alleles,
+    get_downloaded_predictor,
+    Class1SingleModelMultiAllelePredictor)

 __all__ = [
    'Class1BindingPredictor',
@@ -12,4 +16,6 @@ __all__ = [
    'train_across_models_and_folds',
    'from_allele_name',
    'supported_alleles',
+    'get_downloaded_predictor',
+    'Class1SingleModelMultiAllelePredictor',
 ]
--- a/mhcflurry/class1_allele_specific/class1_single_model_multi_allele_predictor.py
+++ b/mhcflurry/class1_allele_specific/class1_single_model_multi_allele_predictor.py
@@ -32,7 +32,7 @@ CACHED_PREDICTOR = None

 def from_allele_name(allele_name):
    """
-    Load a predictor for an allele using the default loader.
+    Load a single-allele predictor.

    Parameters
    ----------
@@ -42,7 +42,7 @@ def from_allele_name(allele_name):
    ----------
    Class1BindingPredictor
    """
-    return get_downloaded_predictor().from_allele_name(allele_name)
+    return get_downloaded_predictor().predictor_for_allele(allele_name)


 def supported_alleles():
@@ -65,7 +65,9 @@ def get_downloaded_predictor():
    # different.
    path = get_path("models_class1_allele_specific_single")
    if CACHED_PREDICTOR is None or path != CACHED_PREDICTOR.path:
-        CACHED_PREDICTOR = Class1SingleModelMultiAllelePredictor(path)
+        CACHED_PREDICTOR = (
+            Class1SingleModelMultiAllelePredictor
+                .load_from_download_directory(path))
    return CACHED_PREDICTOR



--- a/mhcflurry/class1_allele_specific_ensemble/__init__.py
+++ b/mhcflurry/class1_allele_specific_ensemble/__init__.py
 from .class1_ensemble_multi_allele_predictor import (
    Class1EnsembleMultiAllelePredictor,
+    get_downloaded_predictor,
    HYPERPARAMETER_DEFAULTS)

 __all__ = [
    "Class1EnsembleMultiAllelePredictor",
+    "get_downloaded_predictor",
    "HYPERPARAMETER_DEFAULTS",
 ]
--- a/mhcflurry/class1_allele_specific_ensemble/class1_ensemble_multi_allele_predictor.py
+++ b/mhcflurry/class1_allele_specific_ensemble/class1_ensemble_multi_allele_predictor.py
@@ -34,6 +34,7 @@ import pandas

 from ..hyperparameters import HyperparameterDefaults
 from ..class1_allele_specific import Class1BindingPredictor, scoring
+from ..downloads import get_path
 from .. import parallelism, common

 MEASUREMENT_COLLECTION_HYPERPARAMETER_DEFAULTS = HyperparameterDefaults(
@@ -56,6 +57,35 @@ HYPERPARAMETER_DEFAULTS = (
    .extend(Class1BindingPredictor.hyperparameter_defaults))


+CACHED_PREDICTOR = None
+
+
+def supported_alleles():
+    """
+    Return a list of the names of the alleles for which there are trained
+    predictors in the default loader.
+    """
+    return get_downloaded_predictor().supported_alleles
+
+
+def get_downloaded_predictor():
+    """
+    Return a Class1EnsembleMultiAllelePredictor that uses downloaded models.
+    """
+    global CACHED_PREDICTOR
+
+    # Some of the unit tests manipulate the downloads directory configuration
+    # so get_path here may return different results in the same Python process.
+    # For this reason we check the path and reload the cached predictor if
+    # it's different.
+    path = get_path("models_class1_allele_specific_ensemble")
+    if CACHED_PREDICTOR is None or path != CACHED_PREDICTOR.path:
+        CACHED_PREDICTOR = (
+            Class1EnsembleMultiAllelePredictor
+                .load_from_download_directory(path))
+    return CACHED_PREDICTOR
+
+
 def call_fit_and_test(args):
    return fit_and_test(*args)


--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -8,7 +8,7 @@
 # by name, the downloads with "default=true" are downloaded.

 # This should usually be the latest release.
-current-release: 0.0.8
+current-release: 0.1.0

 # An integer indicating what models the current MHCflurry code base is compatible
 # with. Increment this integer when changes are made to MHCflurry that would break