Skip to content
Snippets Groups Projects
Commit 5d0bf7d5 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

small fixes

parent 5ecb11c7
No related merge requests found
%% Cell type:code id: tags:
``` python
import mhcflurry
import numpy
import seaborn
import logging
from matplotlib import pyplot
% matplotlib inline
logging.basicConfig(level="DEBUG")
```
%% Output
Using Theano backend.
/Users/tim/miniconda3/envs/py3k/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
warnings.warn(self.msg_depr % (key, alt_key))
%% Cell type:markdown id: tags:
# Making predictions
Note: if you haven't already, run `mhcflurry-downloads fetch` in a shell to download the trained models.
%% Cell type:markdown id: tags:
## Simplest way to run predictions: `mhcflurry.predict()`
%% Cell type:code id: tags:
``` python
help(mhcflurry.predict)
```
%% Output
Help on function predict in module mhcflurry.predict:
predict(alleles, peptides, loaders=None)
Make predictions across all combinations of the specified alleles and
peptides.
Parameters
----------
alleles : list of str
Names of alleles to make predictions for.
peptides : list of str
Peptide amino acid sequences.
loaders : list of Class1AlleleSpecificPredictorLoader, optional
Loaders to try. Will be tried in the order given.
Returns DataFrame with columns "Allele", "Peptide", and "Prediction"
%% Cell type:code id: tags:
``` python
mhcflurry.predict(alleles=["HLA-A0201"], peptides=["SIINFEKL", "SIINFEQL"])
```
%% Output
Allele Peptide Prediction
0 HLA-A0201 SIINFEKL 10672.347656
1 HLA-A0201 SIINFEQL 7828.974121
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
## Instantiating a model
%% Cell type:code id: tags:
``` python
model = mhcflurry.class1_allele_specific.load.from_allele_name("HLA-A0201")
model.predict(["SIINFEKL", "SIQNPEKP", "SYNFPEPI"])
```
%% Output
array([ 10672.34765625, 30577.02539062, 10565.78222656], dtype=float32)
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
## Instantiating a model from a custom set of models on disk
%% Cell type:code id: tags:
``` python
models_dir = mhcflurry.downloads.get_path("models_class1_allele_specific_single")
models_dir
```
%% Output
'/Users/tim/Library/Application Support/mhcflurry/4/0.0.8/models_class1_allele_specific_single/'
%% Cell type:code id: tags:
``` python
# Make a Loader first
loader = mhcflurry.class1_allele_specific.load.Class1AlleleSpecificPredictorLoader(models_dir)
model = loader.from_allele_name("HLA-A0201")
model.predict(["SIINFEKL", "SIQNPEKP", "SYNFPEPI"])
```
%% Output
array([ 10672.34765625, 30577.02539062, 10565.78222656], dtype=float32)
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
# Loading a `Dataset`
%% Cell type:code id: tags:
``` python
full_training_data = mhcflurry.dataset.Dataset.from_csv(
mhcflurry.downloads.get_path("data_combined_iedb_kim2014", "combined_human_class1_dataset.csv"))
full_training_data
```
%% Output
Dataset(n=192550, alleles=['ELA-A1', 'Gogo-B0101', 'H-2-DB', 'H-2-DD', 'H-2-KB', 'H-2-KBM8', 'H-2-KD', 'H-2-KK', 'H-2-LD', 'H-2-LQ', 'HLA-A0101', 'HLA-A0201', 'HLA-A0202', 'HLA-A0203', 'HLA-A0204', 'HLA-A0205', 'HLA-A0206', 'HLA-A0207', 'HLA-A0210', 'HLA-A0211', 'HLA-A0212', 'HLA-A0216', 'HLA-A0217', 'HLA-A0219', 'HLA-A0250', 'HLA-A0301', 'HLA-A0302', 'HLA-A0319', 'HLA-A1', 'HLA-A11', 'HLA-A1101', 'HLA-A1102', 'HLA-A2', 'HLA-A2301', 'HLA-A24', 'HLA-A2402', 'HLA-A2403', 'HLA-A2501', 'HLA-A26', 'HLA-A2601', 'HLA-A2602', 'HLA-A2603', 'HLA-A2902', 'HLA-A3', 'HLA-A3/11', 'HLA-A3001', 'HLA-A3002', 'HLA-A3101', 'HLA-A3201', 'HLA-A3207', 'HLA-A3215', 'HLA-A3301', 'HLA-A6601', 'HLA-A6801', 'HLA-A6802', 'HLA-A6823', 'HLA-A6901', 'HLA-A7401', 'HLA-A8001', 'HLA-B0702', 'HLA-B0801', 'HLA-B0802', 'HLA-B0803', 'HLA-B1401', 'HLA-B1402', 'HLA-B1501', 'HLA-B1502', 'HLA-B1503', 'HLA-B1509', 'HLA-B1517', 'HLA-B1542', 'HLA-B1801', 'HLA-B27', 'HLA-B2701', 'HLA-B2702', 'HLA-B2703', 'HLA-B2704', 'HLA-B2705', 'HLA-B2706', 'HLA-B2710', 'HLA-B2720', 'HLA-B3501', 'HLA-B3503', 'HLA-B3508', 'HLA-B3701', 'HLA-B3801', 'HLA-B39', 'HLA-B3901', 'HLA-B40', 'HLA-B4001', 'HLA-B4002', 'HLA-B4013', 'HLA-B4201', 'HLA-B4202', 'HLA-B44', 'HLA-B4402', 'HLA-B4403', 'HLA-B4501', 'HLA-B4506', 'HLA-B4601', 'HLA-B4801', 'HLA-B51', 'HLA-B5101', 'HLA-B5201', 'HLA-B5301', 'HLA-B5401', 'HLA-B5701', 'HLA-B5702', 'HLA-B5703', 'HLA-B58', 'HLA-B5801', 'HLA-B5802', 'HLA-B60', 'HLA-B62', 'HLA-B7', 'HLA-B7301', 'HLA-B8', 'HLA-B8101', 'HLA-B8301', 'HLA-BOLA102101', 'HLA-BOLA200801', 'HLA-BOLA201201', 'HLA-BOLA402401', 'HLA-BOLA601301', 'HLA-BOLA601302', 'HLA-BOLAHD6', 'HLA-C0303', 'HLA-C0401', 'HLA-C0501', 'HLA-C0602', 'HLA-C0702', 'HLA-C0802', 'HLA-C1', 'HLA-C1203', 'HLA-C1402', 'HLA-C1502', 'HLA-C4', 'HLA-E0101', 'HLA-E0103', 'HLA-EQCA100101', 'HLA-RT1A', 'HLA-RT1BL', 'HLA-SLA10401', 'Mamu-A01', 'Mamu-A02', 'Mamu-A07', 'Mamu-A100101', 'Mamu-A100201', 'Mamu-A101101', 'Mamu-A11', 'Mamu-A20102', 'Mamu-A2201', 'Mamu-A2601', 
'Mamu-A70103', 'Mamu-B01', 'Mamu-B01704', 'Mamu-B03', 'Mamu-B04', 'Mamu-B06502', 'Mamu-B08', 'Mamu-B1001', 'Mamu-B17', 'Mamu-B3901', 'Mamu-B52', 'Mamu-B6601', 'Mamu-B8301', 'Mamu-B8701', 'Patr-A0101', 'Patr-A0301', 'Patr-A0401', 'Patr-A0602', 'Patr-A0701', 'Patr-A0901', 'Patr-B0101', 'Patr-B0901', 'Patr-B1301', 'Patr-B1701', 'Patr-B2401'])
%% Cell type:code id: tags:
``` python
kim2014_full = mhcflurry.dataset.Dataset.from_csv(
mhcflurry.downloads.get_path("data_kim2014", "bdata.20130222.mhci.public.1.txt"))
kim2014_train = mhcflurry.dataset.Dataset.from_csv(
mhcflurry.downloads.get_path("data_kim2014", "bdata.2009.mhci.public.1.txt"))
kim2014_test = mhcflurry.dataset.Dataset.from_csv(
mhcflurry.downloads.get_path("data_kim2014", "bdata.2013.mhci.public.blind.1.txt"))
len(kim2014_full), len(kim2014_train), len(kim2014_test)
```
%% Output
(179692, 137654, 27680)
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
# Predicting affinities from a `Dataset`
%% Cell type:code id: tags:
``` python
model = mhcflurry.class1_allele_specific.load.from_allele_name("HLA-A0201")
model.predict(kim2014_train.get_allele("HLA-A0201").peptides)
```
%% Output
array([ 3514.14550781, 12429.5390625 , 4227.02197266, ...,
5949.32763672, 17837.0859375 , 6724.96728516], dtype=float32)
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
# Fit a model
%% Cell type:code id: tags:
``` python
help(mhcflurry.class1_allele_specific.Class1BindingPredictor)
```
%% Output
Help on class Class1BindingPredictor in module mhcflurry.class1_allele_specific.class1_binding_predictor:
class Class1BindingPredictor(mhcflurry.class1_allele_specific.class1_allele_specific_kmer_ic50_predictor_base.Class1AlleleSpecificKmerIC50PredictorBase)
| Allele-specific Class I MHC binding predictor which uses
| fixed-length (k-mer) index encoding for inputs and outputs
| a value between 0 and 1 (where 1 is the strongest binder).
|
| Method resolution order:
| Class1BindingPredictor
| mhcflurry.class1_allele_specific.class1_allele_specific_kmer_ic50_predictor_base.Class1AlleleSpecificKmerIC50PredictorBase
| mhcflurry.ic50_predictor_base.IC50PredictorBase
| builtins.object
|
| Methods defined here:
|
| __getstate__(self)
|
| __init__(self, model=None, name=None, max_ic50=50000.0, allow_unknown_amino_acids=True, kmer_size=9, n_amino_acids=20, verbose=False, **hyperparameters)
| Initialize self. See help(type(self)) for accurate signature.
|
| __setstate__(self, state)
|
| fit_kmer_encoded_arrays(self, X, ic50, sample_weights=None, right_censoring_mask=None, X_pretrain=None, ic50_pretrain=None, sample_weights_pretrain=None, n_random_negative_samples=None, pretrain_decay=None, n_training_epochs=None, batch_size=None, verbose=False)
| Train predictive model from index encoding of fixed length k-mer
| peptides.
|
| Parameters
| ----------
| X : array
| Training data with shape (n_samples, n_dims)
|
| ic50 : array
| Training IC50 values with shape (n_samples,)
|
| sample_weights : array
| Weight of each training sample with shape (n_samples,)
|
| right_censoring_mask : array, optional
| Boolean array which indicates whether each IC50 value is actually
| right censored (a lower bound on the true value). Censored values
| are transformed during training by sampling between the observed
| and maximum values on each iteration.
|
| X_pretrain : array
| Extra samples used for soft pretraining of the predictor,
| should have same number of dimensions as X.
| During training the weights of these samples will decay after
| each epoch.
|
| ic50_pretrain : array
| IC50 values for extra samples, shape
|
| pretrain_decay : int -> float function
| decay function for pretraining, mapping epoch number to decay
| factor
|
| sample_weights_pretrain : array
| Initial weights for the rows of X_pretrain. If not specified then
| initialized to ones.
|
| n_random_negative_samples : int
| Number of random samples to generate as negative examples.
|
| n_training_epochs : int
|
| verbose : bool
|
| batch_size : int
|
| get_weights(self)
| Returns weights, which can be passed to set_weights later.
|
| predict_ic50_for_kmer_encoded_array(self, X)
| Given an encoded array of amino acid indices,
| returns a vector of IC50 predictions.
|
| predict_scores_for_kmer_encoded_array(self, X)
| Given an encoded array of amino acid indices, returns a vector
| of affinity scores (values between 0 and 1).
|
| set_weights(self, weights)
| Reset the model weights.
|
| ----------------------------------------------------------------------
| Data and other attributes defined here:
|
| fit_hyperparameter_defaults = <mhcflurry.hyperparameters.Hyperparamete...
|
| hyperparameter_defaults = <mhcflurry.hyperparameters.HyperparameterDef...
|
| network_hyperparameter_defaults = <mhcflurry.hyperparameters.Hyperpara...
|
| ----------------------------------------------------------------------
| Methods inherited from mhcflurry.class1_allele_specific.class1_allele_specific_kmer_ic50_predictor_base.Class1AlleleSpecificKmerIC50PredictorBase:
|
| __repr__(self)
| Return repr(self).
|
| __str__(self)
| Return str(self).
|
| encode_peptides(self, peptides)
| Parameters
| ----------
| peptides : str list
| Peptide strings of any length
|
| Encode peptides of any length into fixed length vectors.
| Returns 2d array of encoded peptides and 1d array indicating the
| original peptide index for each row.
|
| fit_dataset(self, dataset, pretraining_dataset=None, sample_censored_affinities=False, **kwargs)
| Fit the model parameters on the given training data.
|
| Parameters
| ----------
| dataset : Dataset
|
| pretraining_dataset : Dataset
|
| sample_censored_affinities : bool
| If a column named 'inequality' is in the Dataset then every
| peptide with a value of '>' on each training epoch, gets a
| randomly sampled IC50 between its observed value and the
| max_ic50 of the predictor. Default is False.
|
| **kwargs : dict
| Extra arguments are passed on to the fit_kmer_encoded_arrays()
| method.
|
| predict_ic50_for_kmer_peptides(self, peptides)
|
| predict_scores(self, peptides, combine_fn=<function mean at 0x109180ae8>)
| Given a list of peptides of any length, returns an array of predicted
| normalized affinity values. Unlike IC50, a higher value here
| means a stronger affinity. Peptides of lengths other than 9 are
| transformed into a set of k-mers either by deleting or inserting
| amino acid characters. The prediction for a single peptide will be
| the average of expanded k-mers.
|
| predict_scores_for_kmer_peptides(self, peptides)
| Predict binding affinity for 9mer peptides
|
| ----------------------------------------------------------------------
| Data descriptors inherited from mhcflurry.class1_allele_specific.class1_allele_specific_kmer_ic50_predictor_base.Class1AlleleSpecificKmerIC50PredictorBase:
|
| amino_acids
| Amino acid alphabet used for encoding peptides, may include
| "X" if allow_unknown_amino_acids is True.
|
| max_amino_acid_encoding_value
|
| ----------------------------------------------------------------------
| Methods inherited from mhcflurry.ic50_predictor_base.IC50PredictorBase:
|
| fit_dictionary(self, peptide_to_ic50_dict, **kwargs)
| Fit the model parameters using the given peptide->IC50 dictionary,
| all samples are given the same weight.
|
| Parameters
| ----------
| peptide_to_ic50_dict : dict
| Dictionary that maps peptides to IC50 values.
|
| fit_sequences(self, peptides, affinities, sample_weights=None, alleles=None, **kwargs)
|
| predict(self, peptides)
| Predict IC50 affinities for peptides of any length
|
| ----------------------------------------------------------------------
| Data descriptors inherited from mhcflurry.ic50_predictor_base.IC50PredictorBase:
|
| __dict__
| dictionary for instance variables (if defined)
|
| __weakref__
| list of weak references to the object (if defined)
%% Cell type:code id: tags:
``` python
train_data = kim2014_train.get_allele("HLA-A3301")
train_data
```
%% Output
Dataset(n=3040, alleles=['HLA-A3301'])
%% Cell type:code id: tags:
``` python
# We'll use the default hyperparameters here. They could also be specified as kwargs:
new_model = mhcflurry.class1_allele_specific.Class1BindingPredictor()
new_model.hyperparameters
```
%% Output
{'activation': 'tanh',
'batch_normalization': True,
'batch_size': 128,
'dropout_probability': 0.0,
'embedding_output_dim': 32,
'fraction_negative': 0.0,
'init': 'glorot_uniform',
'kmer_size': 9,
'layer_sizes': [64],
'loss': 'mse',
'max_ic50': 50000.0,
'n_training_epochs': 250,
'optimizer': 'rmsprop',
'output_activation': 'sigmoid',
'pretrain_decay': 'numpy.exp(-epoch)'}
%% Cell type:code id: tags:
``` python
# This will run faster if you have a GPU.
%time new_model.fit_dataset(train_data)
```
%% Output
CPU times: user 1min 22s, sys: 1.13 s, total: 1min 24s
Wall time: 45.8 s
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
## Evaluate the fit model on held-out test data
%% Cell type:markdown id: tags:
### Generate predictions
%% Cell type:code id: tags:
``` python
test_data = kim2014_test.get_allele("HLA-A3301")
predictions = new_model.predict(test_data.peptides)
seaborn.set_context('notebook')
seaborn.regplot(numpy.log10(test_data.affinities), numpy.log10(predictions))
pyplot.xlim(xmin=0)
pyplot.ylim(ymin=0)
pyplot.xlabel("Measured affinity (log10 nM)")
pyplot.ylabel("Predicted affinity (log10 nM)")
pyplot.title("MHCflurry on test data")
```
%% Output
<matplotlib.text.Text at 0x122e50400>
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
### Calculate AUC, F1, and Kendall's Tau scores
%% Cell type:code id: tags:
``` python
help(mhcflurry.class1_allele_specific.scoring.make_scores)
```
%% Output
Help on function make_scores in module mhcflurry.class1_allele_specific.scoring:
make_scores(ic50_y, ic50_y_pred, sample_weight=None, threshold_nm=500, max_ic50=50000)
Calculate AUC, F1, and Kendall Tau scores.
Parameters
-----------
ic50_y : float list
true IC50s (i.e. affinities)
ic50_y_pred : float list
predicted IC50s
sample_weight : float list [optional]
threshold_nm : float [optional]
max_ic50 : float [optional]
Returns
-----------
dict with entries "auc", "f1", "tau"
%% Cell type:code id: tags:
``` python
mhcflurry.class1_allele_specific.scoring.make_scores(test_data.affinities, predictions)
```
%% Output
{'auc': 0.84099099099099106,
'f1': 0.65531914893617027,
'tau': 0.43387627983717181}
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
## Cross validation for hyperparameter selection
%% Cell type:code id: tags:
``` python
help(mhcflurry.class1_allele_specific.cross_validation.cross_validation_folds)
```
%% Output
Help on function cross_validation_folds in module mhcflurry.class1_allele_specific.cross_validation:
cross_validation_folds(train_data, alleles=None, n_folds=3, drop_similar_peptides=False, imputer=None, impute_kwargs={'min_observations_per_allele': 2, 'min_observations_per_peptide': 2}, parallel_backend=None)
Split a Dataset into n_folds cross validation folds for each allele,
optionally performing imputation.
Parameters
-----------
train_data : mhcflurry.Dataset
alleles : string list, optional
Alleles to run cross validation on. Default: all alleles in
train_data.
n_folds : int, optional
Number of cross validation folds for each allele.
drop_similar_peptides : boolean, optional
For each fold, remove peptides from the test data that are similar
to peptides in the train data. Similarity is defined as in the
similar_peptides function.
imputer : fancyimpute.Solver, optional
Imputer to use. If not specified, no imputation is done.
impute_kwargs : dict, optional
Additional kwargs to pass to mhcflurry.Dataset.impute_missing_values.
parallel_backend : mhcflurry.parallelism.ParallelBackend, optional
Futures implementation to use for running on multiple threads,
processes, or nodes
Returns
-----------
list of AlleleSpecificTrainTestFold of length num alleles * n_folds
%% Cell type:code id: tags:
``` python
folds = mhcflurry.class1_allele_specific.cross_validation.cross_validation_folds(train_data)
folds
```
%% Output
[AlleleSpecificTrainTestFold(allele='HLA-A3301', train=Dataset(n=2026, alleles=['HLA-A3301']), imputed_train=None, test=Dataset(n=1014, alleles=['HLA-A3301'])),
AlleleSpecificTrainTestFold(allele='HLA-A3301', train=Dataset(n=2027, alleles=['HLA-A3301']), imputed_train=None, test=Dataset(n=1013, alleles=['HLA-A3301'])),
AlleleSpecificTrainTestFold(allele='HLA-A3301', train=Dataset(n=2027, alleles=['HLA-A3301']), imputed_train=None, test=Dataset(n=1013, alleles=['HLA-A3301']))]
%% Cell type:code id: tags:
``` python
# Take a look at what hyperparameters are available for searching over.
mhcflurry.class1_allele_specific.train.HYPERPARAMETER_DEFAULTS.defaults
```
%% Output
{'activation': 'tanh',
'batch_normalization': True,
'batch_size': 128,
'dropout_probability': 0.0,
'embedding_output_dim': 32,
'fraction_negative': 0.0,
'impute': False,
'init': 'glorot_uniform',
'kmer_size': 9,
'layer_sizes': [64],
'loss': 'mse',
'max_ic50': 50000.0,
'n_training_epochs': 250,
'optimizer': 'rmsprop',
'output_activation': 'sigmoid',
'pretrain_decay': 'numpy.exp(-epoch)'}
%% Cell type:code id: tags:
``` python
models_to_search = mhcflurry.class1_allele_specific.train.HYPERPARAMETER_DEFAULTS.models_grid(
fraction_negative=[.1],
layer_sizes=[[8], [12]])
print("Searching over %d models." % len(models_to_search))
print("First model: \n%s" % models_to_search[0])
```
%% Output
Searching over 2 models.
First model:
{'output_activation': 'sigmoid', 'pretrain_decay': 'numpy.exp(-epoch)', 'n_training_epochs': 250, 'embedding_output_dim': 32, 'optimizer': 'rmsprop', 'loss': 'mse', 'fraction_negative': 0.1, 'batch_normalization': True, 'dropout_probability': 0.0, 'init': 'glorot_uniform', 'activation': 'tanh', 'batch_size': 128, 'impute': False, 'kmer_size': 9, 'max_ic50': 50000.0, 'layer_sizes': [8]}
%% Cell type:code id: tags:
``` python
help(mhcflurry.class1_allele_specific.train.train_across_models_and_folds)
```
%% Output
Help on function train_across_models_and_folds in module mhcflurry.class1_allele_specific.train:
train_across_models_and_folds(folds, model_descriptions, cartesian_product_of_folds_and_models=True, return_predictors=False, folds_per_task=1, parallel_backend=None)
Train and optionally test any number of models across any number of folds.
Parameters
-----------
folds : list of AlleleSpecificTrainTestFold
model_descriptions : list of dict
Models to test
cartesian_product_of_folds_and_models : boolean, optional
If true, then a predictor is trained for each fold and model
description.
If false, then len(folds) must equal len(model_descriptions), and
the i'th model is trained on the i'th fold.
return_predictors : boolean, optional
Include the trained predictors in the result.
parallel_backend : mhcflurry.parallelism.ParallelBackend, optional
Futures implementation to use for running on multiple threads,
processes, or nodes
Returns
-----------
pandas.DataFrame
%% Cell type:code id: tags:
``` python
results_df = mhcflurry.class1_allele_specific.train.train_across_models_and_folds(
folds,
models_to_search,
return_predictors=True)
results_df
```
%% Output
allele fold_num model_num train_size test_size imputed_train_size \
0 HLA-A3301 0 0 2026 1014 None
1 HLA-A3301 0 1 2026 1014 None
2 HLA-A3301 1 0 2027 1013 None
3 HLA-A3301 1 1 2027 1013 None
4 HLA-A3301 2 0 2027 1013 None
5 HLA-A3301 2 1 2027 1013 None
train_tau train_auc train_f1 test_tau ... \
0 0.710233 0.989589 0.902256 0.429803 ...
1 0.747597 0.993938 0.919708 0.425610 ...
2 0.705507 0.990185 0.882466 0.430678 ...
3 0.745532 0.993875 0.924812 0.395103 ...
4 0.709275 0.992395 0.894531 0.441365 ...
5 0.743498 0.994674 0.873518 0.439221 ...
model_fraction_negative model_batch_normalization \
0 0.1 True
1 0.1 True
2 0.1 True
3 0.1 True
4 0.1 True
5 0.1 True
model_dropout_probability model_init model_activation \
0 0.0 glorot_uniform tanh
1 0.0 glorot_uniform tanh
2 0.0 glorot_uniform tanh
3 0.0 glorot_uniform tanh
4 0.0 glorot_uniform tanh
5 0.0 glorot_uniform tanh
model_batch_size model_impute model_kmer_size model_max_ic50 \
0 128 False 9 50000.0
1 128 False 9 50000.0
2 128 False 9 50000.0
3 128 False 9 50000.0
4 128 False 9 50000.0
5 128 False 9 50000.0
model_layer_sizes
0 [8]
1 [12]
2 [8]
3 [12]
4 [8]
5 [12]
[6 rows x 31 columns]
%% Cell type:code id: tags:
``` python
# The trained predictors are in the 'predictor' column
results_df.predictor
```
%% Output
0 Class1BindingPredictor(name=None, max_ic50=500...
1 Class1BindingPredictor(name=None, max_ic50=500...
2 Class1BindingPredictor(name=None, max_ic50=500...
3 Class1BindingPredictor(name=None, max_ic50=500...
4 Class1BindingPredictor(name=None, max_ic50=500...
5 Class1BindingPredictor(name=None, max_ic50=500...
Name: predictor, dtype: object
%% Cell type:code id: tags:
``` python
# Which model had the best average AUC across folds?
results_df.groupby("model_num").test_auc.mean()
```
%% Output
model_num
0 0.859859
1 0.847004
Name: test_auc, dtype: float64
......
......@@ -21,15 +21,22 @@ class MHCflurryReleased(PresentationComponentModel):
random_peptides_for_percent_rank : list of string
If specified, then percentile rank will be calibrated and emitted
using the given peptides.
predictor : Class1EnsembleMultiAllelePredictor-like object
Predictor to use.
"""
def __init__(
self,
experiment_to_alleles,
random_peptides_for_percent_rank=None,
predictor=None,
predictor_name="mhcflurry_released",
**kwargs):
PresentationComponentModel.__init__(self, **kwargs)
self.experiment_to_alleles = experiment_to_alleles
self.predictor = predictor
self.predictor_name = predictor_name
if random_peptides_for_percent_rank is None:
self.percent_rank_transforms = None
self.random_peptides_for_percent_rank = None
......@@ -39,9 +46,9 @@ class MHCflurryReleased(PresentationComponentModel):
random_peptides_for_percent_rank)
def column_names(self):
columns = ['mhcflurry_released_affinity']
columns = [self.predictor_name + '_affinity']
if self.percent_rank_transforms is not None:
columns.append('mhcflurry_released_percentile_rank')
columns.append(self.predictor_name + '_percentile_rank')
return columns
def requires_fitting(self):
......@@ -63,11 +70,14 @@ class MHCflurryReleased(PresentationComponentModel):
normalize_allele_name(allele)
for allele in alleles
]
df = predict(alleles, numpy.unique(numpy.array(peptides)))
df = predict(
alleles,
numpy.unique(numpy.array(peptides)),
predictor=self.predictor)
pivoted = df.pivot(index='Peptide', columns='Allele')
pivoted.columns = pivoted.columns.droplevel()
result = {
'mhcflurry_released_affinity': (
self.predictor_name + '_affinity': (
pivoted.min(axis=1).ix[peptides].values)
}
if self.percent_rank_transforms is not None:
......@@ -77,7 +87,7 @@ class MHCflurryReleased(PresentationComponentModel):
percentile_ranks[allele] = (
self.percent_rank_transforms[allele]
.transform(pivoted[allele].values))
result['mhcflurry_released_percentile_rank'] = (
result[self.predictor_name + '_percentile_rank'] = (
percentile_ranks.min(axis=1).ix[peptides].values)
return result
......
......@@ -3,7 +3,11 @@ from __future__ import absolute_import
from .class1_binding_predictor import Class1BindingPredictor
from .train import train_across_models_and_folds, AlleleSpecificTrainTestFold
from .cross_validation import cross_validation_folds
from .class1_single_model_multi_allele_predictor import from_allele_name, supported_alleles
from .class1_single_model_multi_allele_predictor import (
from_allele_name,
supported_alleles,
get_downloaded_predictor,
Class1SingleModelMultiAllelePredictor)
__all__ = [
'Class1BindingPredictor',
......@@ -12,4 +16,6 @@ __all__ = [
'train_across_models_and_folds',
'from_allele_name',
'supported_alleles',
'get_downloaded_predictor',
'Class1SingleModelMultiAllelePredictor',
]
......@@ -32,7 +32,7 @@ CACHED_PREDICTOR = None
def from_allele_name(allele_name):
"""
Load a predictor for an allele using the default loader.
Load a single-allele predictor.
Parameters
----------
......@@ -42,7 +42,7 @@ def from_allele_name(allele_name):
----------
Class1BindingPredictor
"""
return get_downloaded_predictor().from_allele_name(allele_name)
return get_downloaded_predictor().predictor_for_allele(allele_name)
def supported_alleles():
......@@ -65,7 +65,9 @@ def get_downloaded_predictor():
# different.
path = get_path("models_class1_allele_specific_single")
if CACHED_PREDICTOR is None or path != CACHED_PREDICTOR.path:
CACHED_PREDICTOR = Class1SingleModelMultiAllelePredictor(path)
CACHED_PREDICTOR = (
Class1SingleModelMultiAllelePredictor
.load_from_download_directory(path))
return CACHED_PREDICTOR
......
from .class1_ensemble_multi_allele_predictor import (
Class1EnsembleMultiAllelePredictor,
get_downloaded_predictor,
HYPERPARAMETER_DEFAULTS)
__all__ = [
"Class1EnsembleMultiAllelePredictor",
"get_downloaded_predictor",
"HYPERPARAMETER_DEFAULTS",
]
......@@ -34,6 +34,7 @@ import pandas
from ..hyperparameters import HyperparameterDefaults
from ..class1_allele_specific import Class1BindingPredictor, scoring
from ..downloads import get_path
from .. import parallelism, common
MEASUREMENT_COLLECTION_HYPERPARAMETER_DEFAULTS = HyperparameterDefaults(
......@@ -56,6 +57,35 @@ HYPERPARAMETER_DEFAULTS = (
.extend(Class1BindingPredictor.hyperparameter_defaults))
CACHED_PREDICTOR = None
def supported_alleles():
    """
    Return the names of the alleles for which there are trained predictors
    in the default downloaded predictor.

    This is the ``supported_alleles`` attribute of the object returned by
    get_downloaded_predictor().
    """
    downloaded_predictor = get_downloaded_predictor()
    return downloaded_predictor.supported_alleles
def get_downloaded_predictor():
    """
    Return the predictor backed by the downloaded ensemble models.

    The predictor is built with
    Class1EnsembleMultiAllelePredictor.load_from_download_directory() from
    the "models_class1_allele_specific_ensemble" download and is cached in
    the module-level CACHED_PREDICTOR, so repeated calls reuse the same
    object as long as the download path is unchanged.
    """
    global CACHED_PREDICTOR

    # Some of the unit tests manipulate the downloads directory configuration
    # so get_path here may return different results in the same Python process.
    # For this reason we re-check the path on every call and rebuild the
    # cached predictor if it's different.
    path = get_path("models_class1_allele_specific_ensemble")
    if CACHED_PREDICTOR is None or path != CACHED_PREDICTOR.path:
        CACHED_PREDICTOR = (
            Class1EnsembleMultiAllelePredictor
            .load_from_download_directory(path))
    return CACHED_PREDICTOR
def call_fit_and_test(args):
    """
    Call fit_and_test with the elements of `args` as positional arguments.

    Parameters
    ----------
    args : sequence
        Positional arguments, unpacked and forwarded to fit_and_test.

    Returns whatever fit_and_test returns.
    """
    # NOTE(review): presumably exists so a whole argument tuple can be handed
    # to a single-argument map-style API -- confirm against the callers.
    return fit_and_test(*args)
......
......@@ -8,7 +8,7 @@
# by name, the downloads with "default=true" are downloaded.
# This should usually be the latest release.
current-release: 0.0.8
current-release: 0.1.0
# An integer indicating what models the current MHCflurry code base is compatible
# with. Increment this integer when changes are made to MHCflurry that would break
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment