Skip to content
Snippets Groups Projects
Commit 7b7075f0 authored by Alex Rubinsteyn
Browse files

added test for Dataset

parent bc0ec2c3
No related branches found
No related tags found
No related merge requests found
......@@ -37,7 +37,8 @@ from .serialization_helpers import (
from .peptide_encoding import check_valid_index_encoding_array
from .feedforward_hyperparameters import LOSS, OPTIMIZER
from .regression_target import MAX_IC50
from .training_helpers import check_training_data_shapes
from .dataset import Dataset
from .training_helpers import combine_training_arrays
_allele_predictor_cache = {}
......@@ -133,70 +134,6 @@ class Class1BindingPredictor(PredictorBase):
kmer_size=peptide_length,
**kwargs)
def _combine_training_data(
        self,
        X,
        Y,
        sample_weights,
        X_pretrain,
        Y_pretrain,
        sample_weights_pretrain,
        verbose=False):
    """
    Validate the training and pre-training arrays, then concatenate them
    with the pre-training samples placed first.

    Parameters
    ----------
    X : array-like
        Training inputs.

    Y : array-like
        Training targets; every value must lie in [0, 1].

    sample_weights : array-like or None
        Per-sample training weights; None means a weight of 1 for each
        element of Y.

    X_pretrain : array-like or None
        Pre-training inputs. If either X_pretrain or Y_pretrain is None
        an empty pre-training set is used instead.

    Y_pretrain : array-like or None
        Pre-training targets; every value must lie in [0, 1].

    sample_weights_pretrain : array-like or None
        Per-sample pre-training weights; None means a weight of 1 for
        each element of Y_pretrain.

    verbose : bool
        Not used by this method.

    Returns (X_combined, Y_combined, weights_combined, n_pretrain_samples)

    Raises ValueError if Y or Y_pretrain contains a value outside [0, 1].
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    if sample_weights is None:
        sample_weights = np.ones_like(Y)
    else:
        sample_weights = np.asarray(sample_weights)
    n_samples, n_dims = check_training_data_shapes(X, Y, sample_weights)

    if X_pretrain is None or Y_pretrain is None:
        # Empty placeholders keep the concatenation below uniform.
        X_pretrain = np.empty((0, n_dims), dtype=X.dtype)
        Y_pretrain = np.empty((0,), dtype=Y.dtype)
    else:
        X_pretrain = np.asarray(X_pretrain)
        Y_pretrain = np.asarray(Y_pretrain)
    if sample_weights_pretrain is None:
        sample_weights_pretrain = np.ones_like(Y_pretrain)
    else:
        sample_weights_pretrain = np.asarray(sample_weights_pretrain)
    n_pretrain_samples, n_pretrain_dims = check_training_data_shapes(
        X_pretrain, Y_pretrain, sample_weights_pretrain)

    y_min = Y.min()
    if y_min < 0:
        raise ValueError(
            "Minimum value of Y can't be negative, got %f" % (y_min,))
    y_max = Y.max()
    if y_max > 1:
        raise ValueError(
            "Maximum value of Y can't be greater than 1, got %f" % (y_max,))

    # min()/max() are undefined for an empty array, so only range-check
    # the pre-training targets when there are any. The messages report
    # the offending Y_pretrain value (previously they reported Y's).
    if len(Y_pretrain) > 0:
        y_pretrain_min = Y_pretrain.min()
        if y_pretrain_min < 0:
            raise ValueError(
                "Minimum value of Y_pretrain can't be negative, got %f" % (
                    y_pretrain_min,))
        y_pretrain_max = Y_pretrain.max()
        if y_pretrain_max > 1:
            raise ValueError(
                "Maximum value of Y_pretrain can't be greater than 1, got %f" % (
                    y_pretrain_max,))

    X_combined = np.vstack([X_pretrain, X])
    Y_combined = np.concatenate([Y_pretrain, Y])
    combined_weights = np.concatenate([
        sample_weights_pretrain,
        sample_weights,
    ])
    return X_combined, Y_combined, combined_weights, n_pretrain_samples
def _extend_with_negative_random_samples(
self, X, Y, weights, n_random_negative_samples):
"""
......@@ -240,7 +177,7 @@ class Class1BindingPredictor(PredictorBase):
assert len(weights_with_negative) == len(weights) + n_random_negative_samples
return X_with_negative, Y_with_negative, weights_with_negative
def fit(
def fit_kmer_encoded_arrays(
self,
X,
Y,
......@@ -294,10 +231,9 @@ class Class1BindingPredictor(PredictorBase):
batch_size : int
"""
X_combined, Y_combined, combined_weights, n_pretrain = \
self._combine_training_data(
combine_training_arrays(
X, Y, sample_weights,
X_pretrain, Y_pretrain, sample_weights_pretrain,
verbose=verbose)
X_pretrain, Y_pretrain, sample_weights_pretrain)
total_pretrain_sample_weight = combined_weights[:n_pretrain].sum()
total_train_sample_weight = combined_weights[n_pretrain:].sum()
......@@ -426,7 +362,7 @@ class Class1BindingPredictor(PredictorBase):
def __str__(self):
    """Return the same text that ``repr()`` produces for this object."""
    text = repr(self)
    return text
def predict(self, X):
def predict_from_kmer_encoding(self, X):
"""
Given an encoded array of amino acid indices, returns a vector
of predicted log IC50 values.
......
......@@ -28,13 +28,16 @@ from .amino_acid import (
common_amino_acids
)
from .regression_target import regression_target_to_ic50, MAX_IC50
from .dataset import Dataset
class PredictorBase(object):
"""
Base class for all mhcflurry predictors (including the Ensemble class)
Base class for all mhcflurry predictors which used fixed-length
k-mer representation of peptides. Eventually will need to move this code
to something like FixedLengthPredictor to fit RNN-based sequence
predictors into the inheritance hierarchy.
"""
def __init__(
self,
name,
......@@ -103,21 +106,7 @@ class PredictorBase(object):
scores = self.predict_kmer_peptides(peptides)
return regression_target_to_ic50(scores, max_ic50=self.max_ic50)
def predict_peptides_ic50(self, peptides):
    """
    Predict IC50 affinities for peptides of any length.

    The values returned by predict_peptides() are converted with
    regression_target_to_ic50(), using this predictor's max_ic50.
    """
    normalized = self.predict_peptides(peptides)
    return regression_target_to_ic50(normalized, max_ic50=self.max_ic50)
def predict(self, X):
    """
    Placeholder implementation: always raises ValueError with a message
    naming the class of the object it was called on.
    """
    class_name = self.__class__.__name__
    message = "Method 'predict' not yet implemented for %s!" % (class_name,)
    raise ValueError(message)
def predict_peptides(
self,
peptides,
combine_fn=np.mean):
def predict_scores(self, peptides, combine_fn=np.mean):
"""
Given a list of peptides of any length, returns an array of predicted
normalized affinity values. Unlike IC50, a higher value here
......@@ -144,3 +133,59 @@ class PredictorBase(object):
for (p, ys) in multiple_predictions_dict.items()
}
return np.array([combined_predictions_dict[p] for p in peptides])
def predict(self, peptides):
    """
    Predict IC50 affinities for peptides of any length.

    Parameters
    ----------
    peptides : list of str
        Peptide sequences, passed unchanged to predict_scores().

    Returns the scores from predict_scores() converted by
    regression_target_to_ic50(), using this predictor's max_ic50.
    """
    # The score method on this class is predict_scores(); calling the
    # old name predict_peptides() would raise AttributeError.
    scores = self.predict_scores(peptides)
    return regression_target_to_ic50(scores, max_ic50=self.max_ic50)
def fit_dictionary(self, peptide_to_ic50_dict, **kwargs):
    """
    Train this model from a peptide->IC50 mapping; every sample is
    given the same weight.

    Parameters
    ----------
    peptide_to_ic50_dict : dict
        Maps each peptide to its IC50 value.

    **kwargs : dict
        Forwarded unchanged to fit_dataset().

    Returns whatever fit_dataset() returns.
    """
    training_data = Dataset.from_peptide_to_affinity_dictionary(
        allele_name=self.name,
        peptide_to_affinity_dict=peptide_to_ic50_dict)
    fit_result = self.fit_dataset(training_data, **kwargs)
    return fit_result
def fit_dataset(self, dataset, pretraining_dataset=None, *args, **kwargs):
    """
    Fit the model parameters on the given training data.

    Parameters
    ----------
    dataset : Dataset
        Training data; its encode() method must return a
        (X, Y, sample_weights) triple.

    pretraining_dataset : Dataset, optional
        Pre-training data, encoded the same way. When omitted, None is
        passed for all three pre-training arrays.

    *args
        Accepted for backward compatibility but not forwarded.

    **kwargs : dict
        Extra keyword arguments are passed on to fit_arrays().

    Returns whatever fit_arrays() returns.
    """
    X, Y, sample_weights = dataset.encode()

    if pretraining_dataset is None:
        # The parameter defaults to None, so there may be nothing to
        # encode; calling None.encode() would raise AttributeError.
        X_pretrain = Y_pretrain = sample_weights_pretrain = None
    else:
        X_pretrain, Y_pretrain, sample_weights_pretrain = \
            pretraining_dataset.encode()

    # NOTE(review): the only array-level fit method visible alongside this
    # class is fit_kmer_encoded_arrays() on Class1BindingPredictor --
    # confirm that fit_arrays() is what subclasses actually implement.
    return self.fit_arrays(
        X=X,
        Y=Y,
        sample_weights=sample_weights,
        X_pretrain=X_pretrain,
        Y_pretrain=Y_pretrain,
        # Pass the pre-training weights, not the training weights.
        sample_weights_pretrain=sample_weights_pretrain,
        **kwargs)
def fit_sequences(self, peptides, affinities, sample_weights=None, **kwargs):
    """
    Fit the model from parallel sequences of peptides and affinities.

    Every peptide is paired with this predictor's name as its allele,
    the samples are wrapped in a Dataset via Dataset.from_sequences(),
    and that dataset is handed to fit_dataset().

    Parameters
    ----------
    peptides : sequence
        Peptides to train on; must support len().

    affinities : sequence
        Affinity values passed to Dataset.from_sequences().

    sample_weights : sequence, optional
        Passed to Dataset.from_sequences() as given (None by default).

    **kwargs : dict
        Forwarded unchanged to fit_dataset().

    Returns whatever fit_dataset() returns.
    """
    n_peptides = len(peptides)
    repeated_allele = [self.name] * n_peptides
    training_data = Dataset.from_sequences(
        alleles=repeated_allele,
        peptides=peptides,
        affinities=affinities,
        sample_weights=sample_weights)
    return self.fit_dataset(training_data, **kwargs)
from nose.tools import eq_
from mhcflurry.data import (
create_allele_data_from_peptide_to_ic50_dict,
AlleleData
)
def test_create_allele_data_from_peptide_to_ic50_dict():
    """
    Building AlleleData from a dictionary holding one 10-mer of A's and
    one 9-mer of C's should yield exactly the peptides "A" * 9 and "C" * 9.
    """
    ten_mer = "A" * 10
    nine_mer = "C" * 9
    peptide_to_ic50_dict = {ten_mer: 1.2, nine_mer: 1000}

    allele_data = create_allele_data_from_peptide_to_ic50_dict(
        peptide_to_ic50_dict,
        max_ic50=50000.0)
    assert isinstance(allele_data, AlleleData)

    observed = set(allele_data.peptides)
    expected = {"A" * 9, nine_mer}
    eq_(expected, observed)
from nose.tools import eq_
from mhcflurry.dataset import Dataset
def test_create_allele_data_from_single_allele_dict():
    """
    A Dataset built from a single-allele peptide->IC50 dictionary should
    hold one entry per dictionary key and expose exactly those peptides
    through both `peptides` and `unique_peptides()`.
    """
    peptide_to_ic50_dict = {
        ("A" * 10): 1.2,
        ("C" * 9): 1000,
    }
    dataset = Dataset.from_peptide_to_affinity_dictionary(
        allele_name="A0201",
        peptide_to_affinity_dict=peptide_to_ic50_dict)
    assert isinstance(dataset, Dataset)

    eq_(len(peptide_to_ic50_dict), len(dataset))

    expected_peptides = sorted([
        "A" * 10,
        "C" * 9,
    ])
    # Compare whole sorted lists rather than zip()-ing element pairs:
    # zip() stops at the shorter input, so a Dataset that dropped
    # peptides would otherwise pass unnoticed.
    eq_(expected_peptides, sorted(dataset.peptides))
    eq_(expected_peptides, sorted(dataset.unique_peptides()))
# Allow this test module to be run directly as a script, in which case
# the Dataset test above is executed without a test runner.
if __name__ == "__main__":
test_create_allele_data_from_single_allele_dict()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment