Skip to content
Snippets Groups Projects
Commit 5d0bf7d5 authored by Tim O'Donnell
Browse files

small fixes

parent 5ecb11c7
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
import mhcflurry
import numpy
import seaborn
import logging
from matplotlib import pyplot
% matplotlib inline
logging.basicConfig(level="DEBUG")
```
%% Output
Using Theano backend.
/Users/tim/miniconda3/envs/py3k/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
warnings.warn(self.msg_depr % (key, alt_key))
%% Cell type:markdown id: tags:
# Making predictions
Note: if you haven't already, run `mhcflurry-downloads fetch` in a shell to download the trained models.
%% Cell type:markdown id: tags:
## Simplest way to run predictions: `mhcflurry.predict()`
%% Cell type:code id: tags:
``` python
help(mhcflurry.predict)
```
%% Output
Help on function predict in module mhcflurry.predict:
predict(alleles, peptides, loaders=None)
Make predictions across all combinations of the specified alleles and
peptides.
Parameters
----------
alleles : list of str
Names of alleles to make predictions for.
peptides : list of str
Peptide amino acid sequences.
loaders : list of Class1AlleleSpecificPredictorLoader, optional
Loaders to try. Will be tried in the order given.
Returns DataFrame with columns "Allele", "Peptide", and "Prediction"
%% Cell type:code id: tags:
``` python
mhcflurry.predict(alleles=["HLA-A0201"], peptides=["SIINFEKL", "SIINFEQL"])
```
%% Output
Allele Peptide Prediction
0 HLA-A0201 SIINFEKL 10672.347656
1 HLA-A0201 SIINFEQL 7828.974121
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
## Instantiating a model
%% Cell type:code id: tags:
``` python
model = mhcflurry.class1_allele_specific.load.from_allele_name("HLA-A0201")
model.predict(["SIINFEKL", "SIQNPEKP", "SYNFPEPI"])
```
%% Output
array([ 10672.34765625, 30577.02539062, 10565.78222656], dtype=float32)
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
## Instantiating a model from a custom set of models on disk
%% Cell type:code id: tags:
``` python
models_dir = mhcflurry.downloads.get_path("models_class1_allele_specific_single")
models_dir
```
%% Output
'/Users/tim/Library/Application Support/mhcflurry/4/0.0.8/models_class1_allele_specific_single/'
%% Cell type:code id: tags:
``` python
# Make a Loader first
loader = mhcflurry.class1_allele_specific.load.Class1AlleleSpecificPredictorLoader(models_dir)
model = loader.from_allele_name("HLA-A0201")
model.predict(["SIINFEKL", "SIQNPEKP", "SYNFPEPI"])
```
%% Output
array([ 10672.34765625, 30577.02539062, 10565.78222656], dtype=float32)
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
# Loading a `Dataset`
%% Cell type:code id: tags:
``` python
full_training_data = mhcflurry.dataset.Dataset.from_csv(
mhcflurry.downloads.get_path("data_combined_iedb_kim2014", "combined_human_class1_dataset.csv"))
full_training_data
```
%% Output
Dataset(n=192550, alleles=['ELA-A1', 'Gogo-B0101', 'H-2-DB', 'H-2-DD', 'H-2-KB', 'H-2-KBM8', 'H-2-KD', 'H-2-KK', 'H-2-LD', 'H-2-LQ', 'HLA-A0101', 'HLA-A0201', 'HLA-A0202', 'HLA-A0203', 'HLA-A0204', 'HLA-A0205', 'HLA-A0206', 'HLA-A0207', 'HLA-A0210', 'HLA-A0211', 'HLA-A0212', 'HLA-A0216', 'HLA-A0217', 'HLA-A0219', 'HLA-A0250', 'HLA-A0301', 'HLA-A0302', 'HLA-A0319', 'HLA-A1', 'HLA-A11', 'HLA-A1101', 'HLA-A1102', 'HLA-A2', 'HLA-A2301', 'HLA-A24', 'HLA-A2402', 'HLA-A2403', 'HLA-A2501', 'HLA-A26', 'HLA-A2601', 'HLA-A2602', 'HLA-A2603', 'HLA-A2902', 'HLA-A3', 'HLA-A3/11', 'HLA-A3001', 'HLA-A3002', 'HLA-A3101', 'HLA-A3201', 'HLA-A3207', 'HLA-A3215', 'HLA-A3301', 'HLA-A6601', 'HLA-A6801', 'HLA-A6802', 'HLA-A6823', 'HLA-A6901', 'HLA-A7401', 'HLA-A8001', 'HLA-B0702', 'HLA-B0801', 'HLA-B0802', 'HLA-B0803', 'HLA-B1401', 'HLA-B1402', 'HLA-B1501', 'HLA-B1502', 'HLA-B1503', 'HLA-B1509', 'HLA-B1517', 'HLA-B1542', 'HLA-B1801', 'HLA-B27', 'HLA-B2701', 'HLA-B2702', 'HLA-B2703', 'HLA-B2704', 'HLA-B2705', 'HLA-B2706', 'HLA-B2710', 'HLA-B2720', 'HLA-B3501', 'HLA-B3503', 'HLA-B3508', 'HLA-B3701', 'HLA-B3801', 'HLA-B39', 'HLA-B3901', 'HLA-B40', 'HLA-B4001', 'HLA-B4002', 'HLA-B4013', 'HLA-B4201', 'HLA-B4202', 'HLA-B44', 'HLA-B4402', 'HLA-B4403', 'HLA-B4501', 'HLA-B4506', 'HLA-B4601', 'HLA-B4801', 'HLA-B51', 'HLA-B5101', 'HLA-B5201', 'HLA-B5301', 'HLA-B5401', 'HLA-B5701', 'HLA-B5702', 'HLA-B5703', 'HLA-B58', 'HLA-B5801', 'HLA-B5802', 'HLA-B60', 'HLA-B62', 'HLA-B7', 'HLA-B7301', 'HLA-B8', 'HLA-B8101', 'HLA-B8301', 'HLA-BOLA102101', 'HLA-BOLA200801', 'HLA-BOLA201201', 'HLA-BOLA402401', 'HLA-BOLA601301', 'HLA-BOLA601302', 'HLA-BOLAHD6', 'HLA-C0303', 'HLA-C0401', 'HLA-C0501', 'HLA-C0602', 'HLA-C0702', 'HLA-C0802', 'HLA-C1', 'HLA-C1203', 'HLA-C1402', 'HLA-C1502', 'HLA-C4', 'HLA-E0101', 'HLA-E0103', 'HLA-EQCA100101', 'HLA-RT1A', 'HLA-RT1BL', 'HLA-SLA10401', 'Mamu-A01', 'Mamu-A02', 'Mamu-A07', 'Mamu-A100101', 'Mamu-A100201', 'Mamu-A101101', 'Mamu-A11', 'Mamu-A20102', 'Mamu-A2201', 'Mamu-A2601', 
'Mamu-A70103', 'Mamu-B01', 'Mamu-B01704', 'Mamu-B03', 'Mamu-B04', 'Mamu-B06502', 'Mamu-B08', 'Mamu-B1001', 'Mamu-B17', 'Mamu-B3901', 'Mamu-B52', 'Mamu-B6601', 'Mamu-B8301', 'Mamu-B8701', 'Patr-A0101', 'Patr-A0301', 'Patr-A0401', 'Patr-A0602', 'Patr-A0701', 'Patr-A0901', 'Patr-B0101', 'Patr-B0901', 'Patr-B1301', 'Patr-B1701', 'Patr-B2401'])
%% Cell type:code id: tags:
``` python
kim2014_full = mhcflurry.dataset.Dataset.from_csv(
mhcflurry.downloads.get_path("data_kim2014", "bdata.20130222.mhci.public.1.txt"))
kim2014_train = mhcflurry.dataset.Dataset.from_csv(
mhcflurry.downloads.get_path("data_kim2014", "bdata.2009.mhci.public.1.txt"))
kim2014_test = mhcflurry.dataset.Dataset.from_csv(
mhcflurry.downloads.get_path("data_kim2014", "bdata.2013.mhci.public.blind.1.txt"))
len(kim2014_full), len(kim2014_train), len(kim2014_test)
```
%% Output
(179692, 137654, 27680)
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
# Predicting affinities from a `Dataset`
%% Cell type:code id: tags:
``` python
model = mhcflurry.class1_allele_specific.load.from_allele_name("HLA-A0201")
model.predict(kim2014_train.get_allele("HLA-A0201").peptides)
```
%% Output
array([ 3514.14550781, 12429.5390625 , 4227.02197266, ...,
5949.32763672, 17837.0859375 , 6724.96728516], dtype=float32)
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
# Fit a model
%% Cell type:code id: tags:
``` python
help(mhcflurry.class1_allele_specific.Class1BindingPredictor)
```
%% Output
Help on class Class1BindingPredictor in module mhcflurry.class1_allele_specific.class1_binding_predictor:
class Class1BindingPredictor(mhcflurry.class1_allele_specific.class1_allele_specific_kmer_ic50_predictor_base.Class1AlleleSpecificKmerIC50PredictorBase)
| Allele-specific Class I MHC binding predictor which uses
| fixed-length (k-mer) index encoding for inputs and outputs
| a value between 0 and 1 (where 1 is the strongest binder).
|
| Method resolution order:
| Class1BindingPredictor
| mhcflurry.class1_allele_specific.class1_allele_specific_kmer_ic50_predictor_base.Class1AlleleSpecificKmerIC50PredictorBase
| mhcflurry.ic50_predictor_base.IC50PredictorBase
| builtins.object
|
| Methods defined here:
|
| __getstate__(self)
|
| __init__(self, model=None, name=None, max_ic50=50000.0, allow_unknown_amino_acids=True, kmer_size=9, n_amino_acids=20, verbose=False, **hyperparameters)
| Initialize self. See help(type(self)) for accurate signature.
|
| __setstate__(self, state)
|
| fit_kmer_encoded_arrays(self, X, ic50, sample_weights=None, right_censoring_mask=None, X_pretrain=None, ic50_pretrain=None, sample_weights_pretrain=None, n_random_negative_samples=None, pretrain_decay=None, n_training_epochs=None, batch_size=None, verbose=False)
| Train predictive model from index encoding of fixed length k-mer
| peptides.
|
| Parameters
| ----------
| X : array
| Training data with shape (n_samples, n_dims)
|
| ic50 : array
| Training IC50 values with shape (n_samples,)
|
| sample_weights : array
| Weight of each training sample with shape (n_samples,)
|
| right_censoring_mask : array, optional
| Boolean array which indicates whether each IC50 value is actually
| right censored (a lower bound on the true value). Censored values
| are transformed during training by sampling between the observed
| and maximum values on each iteration.
|
| X_pretrain : array
| Extra samples used for soft pretraining of the predictor,
| should have same number of dimensions as X.
| During training the weights of these samples will decay after
| each epoch.
|
| ic50_pretrain : array
| IC50 values for extra samples, shape
|
| pretrain_decay : int -> float function
| decay function for pretraining, mapping epoch number to decay
| factor
|
| sample_weights_pretrain : array
| Initial weights for the rows of X_pretrain. If not specified then
| initialized to ones.
|
| n_random_negative_samples : int
| Number of random samples to generate as negative examples.
|
| n_training_epochs : int
|
| verbose : bool
|
| batch_size : int
|
| get_weights(self)
| Returns weights, which can be passed to set_weights later.
|
| predict_ic50_for_kmer_encoded_array(self, X)
| Given an encoded array of amino acid indices,
| returns a vector of IC50 predictions.
|
| predict_scores_for_kmer_encoded_array(self, X)
| Given an encoded array of amino acid indices, returns a vector
| of affinity scores (values between 0 and 1).
|
| set_weights(self, weights)
| Reset the model weights.
|
| ----------------------------------------------------------------------
| Data and other attributes defined here:
|
| fit_hyperparameter_defaults = <mhcflurry.hyperparameters.Hyperparamete...
|
| hyperparameter_defaults = <mhcflurry.hyperparameters.HyperparameterDef...
|
| network_hyperparameter_defaults = <mhcflurry.hyperparameters.Hyperpara...
|
| ----------------------------------------------------------------------
| Methods inherited from mhcflurry.class1_allele_specific.class1_allele_specific_kmer_ic50_predictor_base.Class1AlleleSpecificKmerIC50PredictorBase:
|
| __repr__(self)
| Return repr(self).
|
| __str__(self)
| Return str(self).
|
| encode_peptides(self, peptides)
| Parameters
| ----------
| peptides : str list
| Peptide strings of any length
|
| Encode peptides of any length into fixed length vectors.
| Returns 2d array of encoded peptides and 1d array indicating the
| original peptide index for each row.
|
| fit_dataset(self, dataset, pretraining_dataset=None, sample_censored_affinities=False, **kwargs)
| Fit the model parameters on the given training data.
|
| Parameters
| ----------
| dataset : Dataset
|
| pretraining_dataset : Dataset
|
| sample_censored_affinities : bool
| If a column named 'inequality' is in the Dataset then every
| peptide with a value of '>' on each training epoch, gets a
| randomly sampled IC50 between its observed value and the
| max_ic50 of the predictor. Default is False.
|
| **kwargs : dict
| Extra arguments are passed on to the fit_encoded_kmer_arrays()
| method.
|
| predict_ic50_for_kmer_peptides(self, peptides)
|
| predict_scores(self, peptides, combine_fn=<function mean at 0x109180ae8>)
| Given a list of peptides of any length, returns an array of predicted
| normalized affinity values. Unlike IC50, a higher value here
| means a stronger affinity. Peptides of lengths other than 9 are
| transformed into a set of k-mers either by deleting or inserting
| amino acid characters. The prediction for a single peptide will be
| the average of expanded k-mers.
|
| predict_scores_for_kmer_peptides(self, peptides)
| Predict binding affinity for 9mer peptides
|
| ----------------------------------------------------------------------
| Data descriptors inherited from mhcflurry.class1_allele_specific.class1_allele_specific_kmer_ic50_predictor_base.Class1AlleleSpecificKmerIC50PredictorBase:
|
| amino_acids
| Amino acid alphabet used for encoding peptides, may include
| "X" if allow_unknown_amino_acids is True.
|
| max_amino_acid_encoding_value
|
| ----------------------------------------------------------------------
| Methods inherited from mhcflurry.ic50_predictor_base.IC50PredictorBase:
|
| fit_dictionary(self, peptide_to_ic50_dict, **kwargs)
| Fit the model parameters using the given peptide->IC50 dictionary,
| all samples are given the same weight.
|
| Parameters
| ----------
| peptide_to_ic50_dict : dict
| Dictionary that maps peptides to IC50 values.
|
| fit_sequences(self, peptides, affinities, sample_weights=None, alleles=None, **kwargs)
|
| predict(self, peptides)
| Predict IC50 affinities for peptides of any length
|
| ----------------------------------------------------------------------
| Data descriptors inherited from mhcflurry.ic50_predictor_base.IC50PredictorBase:
|
| __dict__
| dictionary for instance variables (if defined)
|
| __weakref__
| list of weak references to the object (if defined)
%% Cell type:code id: tags:
``` python
train_data = kim2014_train.get_allele("HLA-A3301")
train_data
```
%% Output
Dataset(n=3040, alleles=['HLA-A3301'])
%% Cell type:code id: tags:
``` python
# We'll use the default hyperparameters here. They could also be specified as kwargs.
new_model = mhcflurry.class1_allele_specific.Class1BindingPredictor()
new_model.hyperparameters
```
%% Output
{'activation': 'tanh',
'batch_normalization': True,
'batch_size': 128,
'dropout_probability': 0.0,
'embedding_output_dim': 32,
'fraction_negative': 0.0,
'init': 'glorot_uniform',
'kmer_size': 9,
'layer_sizes': [64],
'loss': 'mse',
'max_ic50': 50000.0,
'n_training_epochs': 250,
'optimizer': 'rmsprop',
'output_activation': 'sigmoid',
'pretrain_decay': 'numpy.exp(-epoch)'}
%% Cell type:code id: tags:
``` python
# This will run faster if you have a GPU.
%time new_model.fit_dataset(train_data)
```
%% Output
CPU times: user 1min 22s, sys: 1.13 s, total: 1min 24s
Wall time: 45.8 s
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
## Evaluate the fit model on held-out test data
%% Cell type:markdown id: tags:
### Generate predictions
%% Cell type:code id: tags:
``` python
# Select the test measurements for HLA-A3301, the allele that new_model was
# fit on above, and predict affinities for those peptides.
test_data = kim2014_test.get_allele("HLA-A3301")
predictions = new_model.predict(test_data.peptides)
# Plot measured (x) against predicted (y) affinities, both log10-transformed.
seaborn.set_context('notebook')
seaborn.regplot(numpy.log10(test_data.affinities), numpy.log10(predictions))
# Start both axes at zero.
pyplot.xlim(xmin=0)
pyplot.ylim(ymin=0)
pyplot.xlabel("Measured affinity (log10 nM)")
pyplot.ylabel("Predicted affinity (log10 nM)")
pyplot.title("MHCflurry on test data")
```
%% Output
<matplotlib.text.Text at 0x122e50400>
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
### Calculate AUC, F1, and Kendall's Tau scores
%% Cell type:code id: tags:
``` python
help(mhcflurry.class1_allele_specific.scoring.make_scores)
```
%% Output
Help on function make_scores in module mhcflurry.class1_allele_specific.scoring:
make_scores(ic50_y, ic50_y_pred, sample_weight=None, threshold_nm=500, max_ic50=50000)
Calculate AUC, F1, and Kendall Tau scores.
Parameters
-----------
ic50_y : float list
true IC50s (i.e. affinities)
ic50_y_pred : float list
predicted IC50s
sample_weight : float list [optional]
threshold_nm : float [optional]
max_ic50 : float [optional]
Returns
-----------
dict with entries "auc", "f1", "tau"
%% Cell type:code id: tags:
``` python
mhcflurry.class1_allele_specific.scoring.make_scores(test_data.affinities, predictions)
```
%% Output
{'auc': 0.84099099099099106,
'f1': 0.65531914893617027,
'tau': 0.43387627983717181}
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
## Cross validation for hyperparameter selection
%% Cell type:code id: tags:
``` python
help(mhcflurry.class1_allele_specific.cross_validation.cross_validation_folds)
```
%% Output
Help on function cross_validation_folds in module mhcflurry.class1_allele_specific.cross_validation:
cross_validation_folds(train_data, alleles=None, n_folds=3, drop_similar_peptides=False, imputer=None, impute_kwargs={'min_observations_per_allele': 2, 'min_observations_per_peptide': 2}, parallel_backend=None)
Split a Dataset into n_folds cross validation folds for each allele,
optionally performing imputation.
Parameters
-----------
train_data : mhcflurry.Dataset
alleles : string list, optional
Alleles to run cross validation on. Default: all alleles in
train_data.
n_folds : int, optional
Number of cross validation folds for each allele.
drop_similar_peptides : boolean, optional
For each fold, remove peptides from the test data that are similar
to peptides in the train data. Similarity is defined as in the
similar_peptides function.
imputer : fancyimpute.Solver, optional
Imputer to use. If not specified, no imputation is done.
impute_kwargs : dict, optional
Additional kwargs to pass to mhcflurry.Dataset.impute_missing_values.
parallel_backend : mhcflurry.parallelism.ParallelBackend, optional
Futures implementation to use for running on multiple threads,
processes, or nodes
Returns
-----------
list of AlleleSpecificTrainTestFold of length num alleles * n_folds
%% Cell type:code id: tags:
``` python
folds = mhcflurry.class1_allele_specific.cross_validation.cross_validation_folds(train_data)
folds
```
%% Output
[AlleleSpecificTrainTestFold(allele='HLA-A3301', train=Dataset(n=2026, alleles=['HLA-A3301']), imputed_train=None, test=Dataset(n=1014, alleles=['HLA-A3301'])),
AlleleSpecificTrainTestFold(allele='HLA-A3301', train=Dataset(n=2027, alleles=['HLA-A3301']), imputed_train=None, test=Dataset(n=1013, alleles=['HLA-A3301'])),
AlleleSpecificTrainTestFold(allele='HLA-A3301', train=Dataset(n=2027, alleles=['HLA-A3301']), imputed_train=None, test=Dataset(n=1013, alleles=['HLA-A3301']))]
%% Cell type:code id: tags:
``` python
# Take a look at what hyperparameters are available for searching over.
mhcflurry.class1_allele_specific.train.HYPERPARAMETER_DEFAULTS.defaults
```
%% Output
{'activation': 'tanh',
'batch_normalization': True,
'batch_size': 128,
'dropout_probability': 0.0,
'embedding_output_dim': 32,
'fraction_negative': 0.0,
'impute': False,
'init': 'glorot_uniform',
'kmer_size': 9,
'layer_sizes': [64],
'loss': 'mse',
'max_ic50': 50000.0,
'n_training_epochs': 250,
'optimizer': 'rmsprop',
'output_activation': 'sigmoid',
'pretrain_decay': 'numpy.exp(-epoch)'}
%% Cell type:code id: tags:
``` python
# Build the list of model descriptions to search over. Each keyword gives the
# candidate values for one hyperparameter; here one value for
# fraction_negative and two for layer_sizes produce 2 models (see the output
# below). Hyperparameters not named here keep their default values, as the
# first model printed below shows.
models_to_search = mhcflurry.class1_allele_specific.train.HYPERPARAMETER_DEFAULTS.models_grid(
fraction_negative=[.1],
layer_sizes=[[8], [12]])
print("Searching over %d models." % len(models_to_search))
print("First model: \n%s" % models_to_search[0])
```
%% Output
Searching over 2 models.
First model:
{'output_activation': 'sigmoid', 'pretrain_decay': 'numpy.exp(-epoch)', 'n_training_epochs': 250, 'embedding_output_dim': 32, 'optimizer': 'rmsprop', 'loss': 'mse', 'fraction_negative': 0.1, 'batch_normalization': True, 'dropout_probability': 0.0, 'init': 'glorot_uniform', 'activation': 'tanh', 'batch_size': 128, 'impute': False, 'kmer_size': 9, 'max_ic50': 50000.0, 'layer_sizes': [8]}
%% Cell type:code id: tags:
``` python
help(mhcflurry.class1_allele_specific.train.train_across_models_and_folds)
```
%% Output
Help on function train_across_models_and_folds in module mhcflurry.class1_allele_specific.train:
train_across_models_and_folds(folds, model_descriptions, cartesian_product_of_folds_and_models=True, return_predictors=False, folds_per_task=1, parallel_backend=None)
Train and optionally test any number of models across any number of folds.
Parameters
-----------
folds : list of AlleleSpecificTrainTestFold
model_descriptions : list of dict
Models to test
cartesian_product_of_folds_and_models : boolean, optional
If true, then a predictor is treained for each fold and model
description.
If false, then len(folds) must equal len(model_descriptions), and
the i'th model is trained on the i'th fold.
return_predictors : boolean, optional
Include the trained predictors in the result.
parallel_backend : mhcflurry.parallelism.ParallelBackend, optional
Futures implementation to use for running on multiple threads,
processes, or nodes
Returns
-----------
pandas.DataFrame
%% Cell type:code id: tags:
``` python
results_df = mhcflurry.class1_allele_specific.train.train_across_models_and_folds(
folds,
models_to_search,
return_predictors=True)
results_df
```
%% Output
allele fold_num model_num train_size test_size imputed_train_size \
0 HLA-A3301 0 0 2026 1014 None
1 HLA-A3301 0 1 2026 1014 None
2 HLA-A3301 1 0 2027 1013 None
3 HLA-A3301 1 1 2027 1013 None
4 HLA-A3301 2 0 2027 1013 None
5 HLA-A3301 2 1 2027 1013 None
train_tau train_auc train_f1 test_tau ... \
0 0.710233 0.989589 0.902256 0.429803 ...
1 0.747597 0.993938 0.919708 0.425610 ...
2 0.705507 0.990185 0.882466 0.430678 ...
3 0.745532 0.993875 0.924812 0.395103 ...
4 0.709275 0.992395 0.894531 0.441365 ...
5 0.743498 0.994674 0.873518 0.439221 ...
model_fraction_negative model_batch_normalization \
0 0.1 True
1 0.1 True
2 0.1 True
3 0.1 True
4 0.1 True
5 0.1 True
model_dropout_probability model_init model_activation \
0 0.0 glorot_uniform tanh
1 0.0 glorot_uniform tanh
2 0.0 glorot_uniform tanh
3 0.0 glorot_uniform tanh
4 0.0 glorot_uniform tanh
5 0.0 glorot_uniform tanh
model_batch_size model_impute model_kmer_size model_max_ic50 \
0 128 False 9 50000.0
1 128 False 9 50000.0
2 128 False 9 50000.0
3 128 False 9 50000.0
4 128 False 9 50000.0
5 128 False 9 50000.0
model_layer_sizes
0 [8]
1 [12]
2 [8]
3 [12]
4 [8]
5 [12]
[6 rows x 31 columns]
%% Cell type:code id: tags:
``` python
# The trained predictors are in the 'predictor' column
results_df.predictor
```
%% Output
0 Class1BindingPredictor(name=None, max_ic50=500...
1 Class1BindingPredictor(name=None, max_ic50=500...
2 Class1BindingPredictor(name=None, max_ic50=500...
3 Class1BindingPredictor(name=None, max_ic50=500...
4 Class1BindingPredictor(name=None, max_ic50=500...
5 Class1BindingPredictor(name=None, max_ic50=500...
Name: predictor, dtype: object
%% Cell type:code id: tags:
``` python
# Which model had the best average AUC across folds?
results_df.groupby("model_num").test_auc.mean()
```
%% Output
model_num
0 0.859859
1 0.847004
Name: test_auc, dtype: float64
......
......@@ -21,15 +21,22 @@ class MHCflurryReleased(PresentationComponentModel):
random_peptides_for_percent_rank : list of string
If specified, then percentile rank will be calibrated and emitted
using the given peptides.
predictor : Class1EnsembleMultiAllelePredictor-like object
Predictor to use.
"""
def __init__(
self,
experiment_to_alleles,
random_peptides_for_percent_rank=None,
predictor=None,
predictor_name="mhcflurry_released",
**kwargs):
PresentationComponentModel.__init__(self, **kwargs)
self.experiment_to_alleles = experiment_to_alleles
self.predictor = predictor
self.predictor_name = predictor_name
if random_peptides_for_percent_rank is None:
self.percent_rank_transforms = None
self.random_peptides_for_percent_rank = None
......@@ -39,9 +46,9 @@ class MHCflurryReleased(PresentationComponentModel):
random_peptides_for_percent_rank)
def column_names(self):
columns = ['mhcflurry_released_affinity']
columns = [self.predictor_name + '_affinity']
if self.percent_rank_transforms is not None:
columns.append('mhcflurry_released_percentile_rank')
columns.append(self.predictor_name + '_percentile_rank')
return columns
def requires_fitting(self):
......@@ -63,11 +70,14 @@ class MHCflurryReleased(PresentationComponentModel):
normalize_allele_name(allele)
for allele in alleles
]
df = predict(alleles, numpy.unique(numpy.array(peptides)))
df = predict(
alleles,
numpy.unique(numpy.array(peptides)),
predictor=self.predictor)
pivoted = df.pivot(index='Peptide', columns='Allele')
pivoted.columns = pivoted.columns.droplevel()
result = {
'mhcflurry_released_affinity': (
self.predictor_name + '_affinity': (
pivoted.min(axis=1).ix[peptides].values)
}
if self.percent_rank_transforms is not None:
......@@ -77,7 +87,7 @@ class MHCflurryReleased(PresentationComponentModel):
percentile_ranks[allele] = (
self.percent_rank_transforms[allele]
.transform(pivoted[allele].values))
result['mhcflurry_released_percentile_rank'] = (
result[self.predictor_name + '_percentile_rank'] = (
percentile_ranks.min(axis=1).ix[peptides].values)
return result
......
......@@ -3,7 +3,11 @@ from __future__ import absolute_import
from .class1_binding_predictor import Class1BindingPredictor
from .train import train_across_models_and_folds, AlleleSpecificTrainTestFold
from .cross_validation import cross_validation_folds
from .class1_single_model_multi_allele_predictor import from_allele_name, supported_alleles
from .class1_single_model_multi_allele_predictor import (
from_allele_name,
supported_alleles,
get_downloaded_predictor,
Class1SingleModelMultiAllelePredictor)
__all__ = [
'Class1BindingPredictor',
......@@ -12,4 +16,6 @@ __all__ = [
'train_across_models_and_folds',
'from_allele_name',
'supported_alleles',
'get_downloaded_predictor',
'Class1SingleModelMultiAllelePredictor',
]
......@@ -32,7 +32,7 @@ CACHED_PREDICTOR = None
def from_allele_name(allele_name):
"""
Load a predictor for an allele using the default loader.
Load a single-allele predictor.
Parameters
----------
......@@ -42,7 +42,7 @@ def from_allele_name(allele_name):
----------
Class1BindingPredictor
"""
return get_downloaded_predictor().from_allele_name(allele_name)
return get_downloaded_predictor().predictor_for_allele(allele_name)
def supported_alleles():
......@@ -65,7 +65,9 @@ def get_downloaded_predictor():
# different.
path = get_path("models_class1_allele_specific_single")
if CACHED_PREDICTOR is None or path != CACHED_PREDICTOR.path:
CACHED_PREDICTOR = Class1SingleModelMultiAllelePredictor(path)
CACHED_PREDICTOR = (
Class1SingleModelMultiAllelePredictor
.load_from_download_directory(path))
return CACHED_PREDICTOR
......
from .class1_ensemble_multi_allele_predictor import (
Class1EnsembleMultiAllelePredictor,
get_downloaded_predictor,
HYPERPARAMETER_DEFAULTS)
__all__ = [
"Class1EnsembleMultiAllelePredictor",
"get_downloaded_predictor",
"HYPERPARAMETER_DEFAULTS",
]
......@@ -34,6 +34,7 @@ import pandas
from ..hyperparameters import HyperparameterDefaults
from ..class1_allele_specific import Class1BindingPredictor, scoring
from ..downloads import get_path
from .. import parallelism, common
MEASUREMENT_COLLECTION_HYPERPARAMETER_DEFAULTS = HyperparameterDefaults(
......@@ -56,6 +57,35 @@ HYPERPARAMETER_DEFAULTS = (
.extend(Class1BindingPredictor.hyperparameter_defaults))
CACHED_PREDICTOR = None
def supported_alleles():
    """
    Return a list of the names of the alleles for which there are trained
    predictors in the default loader.

    The value is read from the `supported_alleles` attribute of the
    predictor returned by get_downloaded_predictor().
    """
    return get_downloaded_predictor().supported_alleles
def get_downloaded_predictor():
    """
    Return the predictor built from the downloaded ensemble models.

    The predictor is created with
    Class1EnsembleMultiAllelePredictor.load_from_download_directory() on the
    "models_class1_allele_specific_ensemble" download path and is cached in
    the module-level CACHED_PREDICTOR, so repeated calls reuse the same
    object while the download path stays the same.
    """
    global CACHED_PREDICTOR

    # Some of the unit tests manipulate the downloads directory configuration
    # so get_path here may return different results in the same Python process.
    # For this reason we check the path and invalidate the cached predictor if
    # it's different.
    path = get_path("models_class1_allele_specific_ensemble")
    if CACHED_PREDICTOR is None or path != CACHED_PREDICTOR.path:
        CACHED_PREDICTOR = (
            Class1EnsembleMultiAllelePredictor
            .load_from_download_directory(path))
    return CACHED_PREDICTOR
def call_fit_and_test(args):
    """
    Call fit_and_test with the elements of `args` as positional arguments
    and return its result.

    NOTE(review): presumably this exists so fit_and_test can be driven by
    code that passes each task as a single argument (e.g. a map over
    tasks) -- confirm against callers.
    """
    return fit_and_test(*args)
......
......@@ -8,7 +8,7 @@
# by name, the downloads with "default=true" are downloaded.
# This should usually be the latest release.
current-release: 0.0.8
current-release: 0.1.0
# An integer indicating what models the current MHCflurry code base is compatible
# with. Increment this integer when changes are made to MHCflurry that would break
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment