Skip to content
Snippets Groups Projects
Commit bb35d247 authored by Alex Rubinsteyn's avatar Alex Rubinsteyn
Browse files

debugging imputation with Dataset

parent 14705f3f
No related merge requests found
......@@ -39,14 +39,17 @@ script:
# download training data
- script/download-kim-2013-dataset.sh
- script/download-iedb.sh
# only install data for A*01:01, A*02:01, HLA-A*02:05 for testing
- script/create-iedb-class1-dataset.py --alleles HLA-A*01:01 HLA-A*02:01 HLA-A*02:05
# only install data for A*01:01, A*02:01, HLA-A*02:05, HLA-B*07:02, H-2-KD
# for testing
- >
script/create-iedb-class1-dataset.py
--alleles HLA-A*01:01 HLA-A*02:01 HLA-A*02:05 HLA-B*07:02 H-2-KD
- script/create-combined-class1-dataset.py
# only installing A0101, A0201, A0205 for testing purposes and with very limited
# training
# only installing A0101, A0201, A0205, B0702, H-2-Kd for testing purposes and
# with very limited training
- >
mhcflurry-train-class1-allele-specific-models.py
--alleles HLA-A0101 HLA-A0201 HLA-A0205
--alleles HLA-A0101 HLA-A0201 HLA-A0205 HLA-B0702 H-2-KD
--embedding-size 10
--hidden-layer-size 10
--training-epochs 100
......
......@@ -124,10 +124,11 @@ class Dataset(object):
Two datasets are equal if they contain the same number of samples
with the same properties and values.
"""
if len(self) != len(other):
if type(other) is not Dataset:
return False
if len(self.columns) != len(other.columns):
elif len(self) != len(other):
return False
elif len(self.columns) != len(other.columns):
return False
for ci, cj in zip(self.columns, other.columns):
......@@ -161,14 +162,18 @@ class Dataset(object):
def groupby_allele(self):
"""
Returns a dictionary mapping each allele name to a Dataset containing
just that allele's data.
Yields a sequence of tuples of allele names with Datasets containing
entries just for that allele.
"""
for (allele_name, group_df) in self.to_dataframe().groupby("allele"):
yield (allele_name, Dataset(group_df))
def groupby_allele_dictionary(self):
"""
Returns dictionary mapping each allele name to a Dataset containing
only entries from that allele.
"""
return {
allele_name: Dataset(group_df)
for (allele_name, group_df)
in self.to_dataframe().groupby("allele")
}
return dict(self.groupby_allele())
def to_nested_dictionary(self, combine_fn=geometric_mean):
"""
......@@ -187,7 +192,9 @@ class Dataset(object):
# dictionary mapping each peptide to a list of affinities
peptide_to_affinity_dict = defaultdict(list)
peptide_to_weight_dict = defaultdict(list)
for _, peptide, affinity, sample_weight in allele_dataset.itertuples():
for (allele, peptide), row in allele_dataset.iterrows():
affinity = row["affinity"]
sample_weight = row["sample_weight"]
peptide_to_affinity_dict[peptide].append(affinity)
peptide_to_weight_dict[peptide].append(sample_weight)
allele_to_peptide_to_affinity_dict[allele] = {
......@@ -196,6 +203,7 @@ class Dataset(object):
peptide_to_weight_dict[peptide])
for peptide in peptide_to_affinity_dict.keys()
}
return allele_to_peptide_to_affinity_dict
@classmethod
def from_sequences(
......@@ -254,7 +262,8 @@ class Dataset(object):
@classmethod
def from_nested_dictionary(
cls, allele_to_peptide_to_affinity_dict):
cls,
allele_to_peptide_to_affinity_dict):
"""
Given nested dictionaries mapping allele -> peptide -> affinity,
construct a Dataset with uniform sample weights.
......@@ -272,6 +281,13 @@ class Dataset(object):
peptides=peptides,
affinities=affinities)
@classmethod
def create_empty(cls):
"""
Returns an empty Dataset containing no pMHC entries.
"""
return cls.from_nested_dictionary({})
@classmethod
def from_single_allele_dictionary(
cls,
......@@ -281,8 +297,7 @@ class Dataset(object):
Given a peptide->affinity dictionary for a single allele,
create a Dataset.
"""
return cls.from_allele_to_peptide_to_affinity_dictionary(
{allele_name: peptide_to_affinity_dict})
return cls.from_nested_dictionary({allele_name: peptide_to_affinity_dict})
@classmethod
def from_csv(
......@@ -424,11 +439,18 @@ class Dataset(object):
allow_unknown_amino_acids=allow_unknown_amino_acids)
original_peptide_indices = np.asarray(original_peptide_indices)
counts = np.asarray(counts)
assert len(original_peptide_indices) == len(self.affinities)
assert len(counts) == len(self.affinities)
kmer_affinities = self.affinities[original_peptide_indices]
sample_weights = 1.0 / counts
return X_index, kmer_affinities, sample_weights, original_peptide_indices
kmer_sample_weights = self.sample_weights[original_peptide_indices]
assert len(original_peptide_indices) == len(kmer_affinities)
assert len(counts) == len(kmer_affinities)
assert len(kmer_sample_weights) == len(kmer_affinities)
# combine the original sample weights of varying length peptides
# with a 1/n_kmers factor for the number of kmers pulled out of each
# original peptide
combined_sample_weights = kmer_sample_weights * (1.0 / counts)
return X_index, kmer_affinities, combined_sample_weights, original_peptide_indices
def to_dense_pMHC_affinity_matrix(self):
"""
......@@ -442,11 +464,14 @@ class Dataset(object):
n_peptides = len(peptides_list)
alleles_list = list(sorted(self.unique_alleles()))
allele_order = {a: i for (i, a) in enumerate(alleles_list)}
n_alleles = alleles_list
X = np.ones((n_peptides, n_alleles))
n_alleles = len(alleles_list)
shape = (n_peptides, n_alleles)
X = np.ones(shape, dtype=float)
for (allele, allele_dict) in allele_to_peptide_to_affinity_dict.items():
print(allele, allele_dict)
column_index = allele_order[allele]
for (peptide, affinity) in allele_dict.items():
print(peptide, affinity)
row_index = peptide_order[peptide]
X[row_index, column_index] = affinity
return X, peptides_list, alleles_list
......
......@@ -138,6 +138,7 @@ def create_imputed_datasets(
Returns dictionary mapping allele names to AlleleData objects containing
imputed affinity values.
"""
assert isinstance(dataset, Dataset), "Expected Dataset, got %s" % (type(dataset),)
X_incomplete, peptide_list, allele_list = dataset.to_dense_pMHC_affinity_matrix()
_check_dense_pMHC_array(X_incomplete, peptide_list, allele_list)
......
......@@ -6,7 +6,7 @@ def test_create_allele_data_from_single_allele_dict():
("A" * 10): 1.2,
("C" * 9): 1000,
}
dataset = Dataset.from_peptide_to_affinity_dictionary(
dataset = Dataset.from_single_allele_dictionary(
allele_name="A0201",
peptide_to_affinity_dict=peptide_to_ic50_dict)
assert isinstance(dataset, Dataset)
......
from mhcflurry.imputation import (
create_imputed_datasets,
imputer_from_name,
......@@ -8,14 +9,17 @@ from mhcflurry import Class1BindingPredictor
from fancyimpute import MICE, KNN, SoftImpute, IterativeSVD
from nose.tools import eq_
import numpy as np
def test_create_imputed_datasets_empty():
result = create_imputed_datasets({}, imputer=MICE(n_imputations=25))
eq_(result, {})
empty_dataset = Dataset.create_empty()
result = create_imputed_datasets(
empty_dataset,
imputer=MICE(n_imputations=25))
eq_(result, empty_dataset)
def test_create_imputed_datasets_two_alleles():
allele_to_peptide_to_affinity_dict = {
dataset = Dataset.from_nested_dictionary({
"HLA-A*02:01": {
"A" * 9: 20.0,
"C" * 9: 40000.0,
......@@ -24,13 +28,11 @@ def test_create_imputed_datasets_two_alleles():
"S" * 9: 500.0,
"A" * 9: 25.0,
},
}
dataset = Dataset.from_nested_dictionary(allele_to_peptide_to_affinity_dict)
result = create_imputed_datasets(dataset, imputer=MICE(n_imputations=25))
eq_(set(result.keys()), {"HLA-A*02:01", "HLA-A*02:05"})
})
imputed_dataset = create_imputed_datasets(dataset, imputer=MICE(n_imputations=25))
eq_(imputed_dataset.unique_alleles(), {"HLA-A*02:01", "HLA-A*02:05"})
expected_peptides = {"A" * 9, "C" * 9, "S" * 9}
for allele_name, allele_data in result.items():
print(allele_name, allele_data.peptides)
for allele_name, allele_data in imputed_dataset.groupby_allele():
eq_(set(allele_data.peptides), expected_peptides)
def test_performance_improves_for_A0205_with_pretraining():
......@@ -39,23 +41,14 @@ def test_performance_improves_for_A0205_with_pretraining():
dataset = Dataset.from_csv(CLASS1_DATA_CSV_PATH)
print("Available alleles: %s" % (set(dataset.unique_alleles()),))
# restrict to just three alleles
dataset = dataset.get_alleles(["HLA-A0205", "HLA-A0201", "HLA-A0101"])
# restrict to just five alleles
dataset = dataset.get_alleles([
"HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"])
a0205_data_without_imputation = dataset.get_allele("HLA-A0205")
predictor_without_imputation = \
Class1BindingPredictor.from_hyperparameters(name="A0205-no-impute")
print("Without imputation, # samples = %d, # original peptides = %d" % (
len(a0205_data_without_imputation.peptides),
len(set(a0205_data_without_imputation.original_peptides))))
print(set(a0205_data_without_imputation.original_peptides))
X_index = a0205_data_without_imputation.X_index
Y_true = a0205_data_without_imputation.Y
sample_weights = a0205_data_without_imputation.weights
X_index, Y_true, sample_weights, _ = \
a0205_data_without_imputation.kmer_index_encoding()
predictor_without_imputation.fit_kmer_encoded_arrays(
X=X_index,
Y=Y_true,
......@@ -64,20 +57,14 @@ def test_performance_improves_for_A0205_with_pretraining():
Y_pred_without_imputation = \
predictor_without_imputation.predict_scores_for_kmer_array(X_index)
print("Y_pred without imputation: %s" % (Y_pred_without_imputation,))
mse_without_imputation = np.mean((Y_true - Y_pred_without_imputation) ** 2)
print("MSE w/out imputation: %f" % mse_without_imputation)
diff_squared = (Y_true - Y_pred_without_imputation) ** 2
mse_without_imputation = (diff_squared * sample_weights) / sample_weights.sum()
imputed_data_dict = create_imputed_datasets(
allele_data_dict, MICE(n_imputations=25))
a0205_data_with_imputation = imputed_data_dict["HLA-A0205"]
print("Imputed data, # samples = %d, # original peptides = %d" % (
len(a0205_data_with_imputation.peptides),
len(set(a0205_data_with_imputation.original_peptides))))
imputed_datset = dataset.impute(MICE(n_imputations=25))
a0205_data_with_imputation = imputed_datset.get_allele("HLA-A0205")
X_index_imputed = a0205_data_with_imputation.X_index
Y_imputed = a0205_data_with_imputation.Y
sample_weights_imputed = a0205_data_with_imputation.weights
X_index_imputed, Y_imputed, sample_weights_imputed, _ = \
a0205_data_with_imputation.kmer_index_encoding()
predictor_with_imputation = \
Class1BindingPredictor.from_hyperparameters(name="A0205-impute")
......@@ -93,7 +80,9 @@ def test_performance_improves_for_A0205_with_pretraining():
Y_pred_with_imputation = \
predictor_with_imputation.predict_scores_for_kmer_array(X_index)
mse_with_imputation = np.mean((Y_true - Y_pred_with_imputation) ** 2)
diff_squared = (Y_true - Y_pred_with_imputation) ** 2
mse_with_imputation = (diff_squared * sample_weights_imputed) / sample_weights_imputed.sum()
print("MSE w/ imputation: %f" % (mse_with_imputation,))
assert mse_with_imputation < mse_without_imputation, \
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment