Skip to content
Snippets Groups Projects
Commit bb35d247 authored by Alex Rubinsteyn's avatar Alex Rubinsteyn
Browse files

debugging imputation with Dataset

parent 14705f3f
No related branches found
No related tags found
No related merge requests found
......@@ -39,14 +39,17 @@ script:
# download training data
- script/download-kim-2013-dataset.sh
- script/download-iedb.sh
# only install data for A*01:01, A*02:01, HLA-A*02:05 for testing
- script/create-iedb-class1-dataset.py --alleles HLA-A*01:01 HLA-A*02:01 HLA-A*02:05
# only install data for A*01:01, A*02:01, HLA-A*02:05, HLA-B*07:02, H-2-KD
# for testing
- >
script/create-iedb-class1-dataset.py
--alleles HLA-A*01:01 HLA-A*02:01 HLA-A*02:05 HLA-B*07:02 H-2-KD
- script/create-combined-class1-dataset.py
# only installing A0101, A0201, A0205 for testing purposes and with very limited
# training
# only installing A0101, A0201, A0205, B0702, H-2-Kd for testing purposes and
# with very limited training
- >
mhcflurry-train-class1-allele-specific-models.py
--alleles HLA-A0101 HLA-A0201 HLA-A0205
--alleles HLA-A0101 HLA-A0201 HLA-A0205 HLA-B0702 H-2-KD
--embedding-size 10
--hidden-layer-size 10
--training-epochs 100
......
......@@ -124,10 +124,11 @@ class Dataset(object):
Two datasets are equal if they contain the same number of samples
with the same properties and values.
"""
if len(self) != len(other):
if type(other) is not Dataset:
return False
if len(self.columns) != len(other.columns):
elif len(self) != len(other):
return False
elif len(self.columns) != len(other.columns):
return False
for ci, cj in zip(self.columns, other.columns):
......@@ -161,14 +162,18 @@ class Dataset(object):
def groupby_allele(self):
"""
Returns a dictionary mapping each allele name to a Dataset containing
just that allele's data.
Yields a sequence of tuples of allele names with Datasets containing
entries just for that allele.
"""
for (allele_name, group_df) in self.to_dataframe().groupby("allele"):
yield (allele_name, Dataset(group_df))
def groupby_allele_dictionary(self):
"""
Returns dictionary mapping each allele name to a Dataset containing
only entries from that allele.
"""
return {
allele_name: Dataset(group_df)
for (allele_name, group_df)
in self.to_dataframe().groupby("allele")
}
return dict(self.groupby_allele())
def to_nested_dictionary(self, combine_fn=geometric_mean):
"""
......@@ -187,7 +192,9 @@ class Dataset(object):
# dictionary mapping each peptide to a list of affinities
peptide_to_affinity_dict = defaultdict(list)
peptide_to_weight_dict = defaultdict(list)
for _, peptide, affinity, sample_weight in allele_dataset.itertuples():
for (allele, peptide), row in allele_dataset.iterrows():
affinity = row["affinity"]
sample_weight = row["sample_weight"]
peptide_to_affinity_dict[peptide].append(affinity)
peptide_to_weight_dict[peptide].append(sample_weight)
allele_to_peptide_to_affinity_dict[allele] = {
......@@ -196,6 +203,7 @@ class Dataset(object):
peptide_to_weight_dict[peptide])
for peptide in peptide_to_affinity_dict.keys()
}
return allele_to_peptide_to_affinity_dict
@classmethod
def from_sequences(
......@@ -254,7 +262,8 @@ class Dataset(object):
@classmethod
def from_nested_dictionary(
cls, allele_to_peptide_to_affinity_dict):
cls,
allele_to_peptide_to_affinity_dict):
"""
Given nested dictionaries mapping allele -> peptide -> affinity,
construct a Dataset with uniform sample weights.
......@@ -272,6 +281,13 @@ class Dataset(object):
peptides=peptides,
affinities=affinities)
@classmethod
def create_empty(cls):
"""
Returns an empty Dataset containing no pMHC entries.
"""
return cls.from_nested_dictionary({})
@classmethod
def from_single_allele_dictionary(
cls,
......@@ -281,8 +297,7 @@ class Dataset(object):
Given a peptide->affinity dictionary for a single allele,
create a Dataset.
"""
return cls.from_allele_to_peptide_to_affinity_dictionary(
{allele_name: peptide_to_affinity_dict})
return cls.from_nested_dictionary({allele_name: peptide_to_affinity_dict})
@classmethod
def from_csv(
......@@ -424,11 +439,18 @@ class Dataset(object):
allow_unknown_amino_acids=allow_unknown_amino_acids)
original_peptide_indices = np.asarray(original_peptide_indices)
counts = np.asarray(counts)
assert len(original_peptide_indices) == len(self.affinities)
assert len(counts) == len(self.affinities)
kmer_affinities = self.affinities[original_peptide_indices]
sample_weights = 1.0 / counts
return X_index, kmer_affinities, sample_weights, original_peptide_indices
kmer_sample_weights = self.sample_weights[original_peptide_indices]
assert len(original_peptide_indices) == len(kmer_affinities)
assert len(counts) == len(kmer_affinities)
assert len(kmer_sample_weights) == len(kmer_affinities)
# combine the original sample weights of varying length peptides
# with a 1/n_kmers factor for the number of kmers pulled out of each
# original peptide
combined_sample_weights = kmer_sample_weights * (1.0 / counts)
return X_index, kmer_affinities, combined_sample_weights, original_peptide_indices
def to_dense_pMHC_affinity_matrix(self):
"""
......@@ -442,11 +464,14 @@ class Dataset(object):
n_peptides = len(peptides_list)
alleles_list = list(sorted(self.unique_alleles()))
allele_order = {a: i for (i, a) in enumerate(alleles_list)}
n_alleles = alleles_list
X = np.ones((n_peptides, n_alleles))
n_alleles = len(alleles_list)
shape = (n_peptides, n_alleles)
X = np.ones(shape, dtype=float)
for (allele, allele_dict) in allele_to_peptide_to_affinity_dict.items():
print(allele, allele_dict)
column_index = allele_order[allele]
for (peptide, affinity) in allele_dict.items():
print(peptide, affinity)
row_index = peptide_order[peptide]
X[row_index, column_index] = affinity
return X, peptides_list, alleles_list
......
......@@ -138,6 +138,7 @@ def create_imputed_datasets(
Returns dictionary mapping allele names to AlleleData objects containing
imputed affinity values.
"""
assert isinstance(dataset, Dataset), "Expected Dataset, got %s" % (type(dataset),)
X_incomplete, peptide_list, allele_list = dataset.to_dense_pMHC_affinity_matrix()
_check_dense_pMHC_array(X_incomplete, peptide_list, allele_list)
......
......@@ -6,7 +6,7 @@ def test_create_allele_data_from_single_allele_dict():
("A" * 10): 1.2,
("C" * 9): 1000,
}
dataset = Dataset.from_peptide_to_affinity_dictionary(
dataset = Dataset.from_single_allele_dictionary(
allele_name="A0201",
peptide_to_affinity_dict=peptide_to_ic50_dict)
assert isinstance(dataset, Dataset)
......
from mhcflurry.imputation import (
create_imputed_datasets,
imputer_from_name,
......@@ -8,14 +9,17 @@ from mhcflurry import Class1BindingPredictor
from fancyimpute import MICE, KNN, SoftImpute, IterativeSVD
from nose.tools import eq_
import numpy as np
def test_create_imputed_datasets_empty():
result = create_imputed_datasets({}, imputer=MICE(n_imputations=25))
eq_(result, {})
empty_dataset = Dataset.create_empty()
result = create_imputed_datasets(
empty_dataset,
imputer=MICE(n_imputations=25))
eq_(result, empty_dataset)
def test_create_imputed_datasets_two_alleles():
allele_to_peptide_to_affinity_dict = {
dataset = Dataset.from_nested_dictionary({
"HLA-A*02:01": {
"A" * 9: 20.0,
"C" * 9: 40000.0,
......@@ -24,13 +28,11 @@ def test_create_imputed_datasets_two_alleles():
"S" * 9: 500.0,
"A" * 9: 25.0,
},
}
dataset = Dataset.from_nested_dictionary(allele_to_peptide_to_affinity_dict)
result = create_imputed_datasets(dataset, imputer=MICE(n_imputations=25))
eq_(set(result.keys()), {"HLA-A*02:01", "HLA-A*02:05"})
})
imputed_dataset = create_imputed_datasets(dataset, imputer=MICE(n_imputations=25))
eq_(imputed_dataset.unique_alleles(), {"HLA-A*02:01", "HLA-A*02:05"})
expected_peptides = {"A" * 9, "C" * 9, "S" * 9}
for allele_name, allele_data in result.items():
print(allele_name, allele_data.peptides)
for allele_name, allele_data in imputed_dataset.groupby_allele():
eq_(set(allele_data.peptides), expected_peptides)
def test_performance_improves_for_A0205_with_pretraining():
......@@ -39,23 +41,14 @@ def test_performance_improves_for_A0205_with_pretraining():
dataset = Dataset.from_csv(CLASS1_DATA_CSV_PATH)
print("Available alleles: %s" % (set(dataset.unique_alleles()),))
# restrict to just three alleles
dataset = dataset.get_alleles(["HLA-A0205", "HLA-A0201", "HLA-A0101"])
# restrict to just five alleles
dataset = dataset.get_alleles([
"HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"])
a0205_data_without_imputation = dataset.get_allele("HLA-A0205")
predictor_without_imputation = \
Class1BindingPredictor.from_hyperparameters(name="A0205-no-impute")
print("Without imputation, # samples = %d, # original peptides = %d" % (
len(a0205_data_without_imputation.peptides),
len(set(a0205_data_without_imputation.original_peptides))))
print(set(a0205_data_without_imputation.original_peptides))
X_index = a0205_data_without_imputation.X_index
Y_true = a0205_data_without_imputation.Y
sample_weights = a0205_data_without_imputation.weights
X_index, Y_true, sample_weights, _ = \
a0205_data_without_imputation.kmer_index_encoding()
predictor_without_imputation.fit_kmer_encoded_arrays(
X=X_index,
Y=Y_true,
......@@ -64,20 +57,14 @@ def test_performance_improves_for_A0205_with_pretraining():
Y_pred_without_imputation = \
predictor_without_imputation.predict_scores_for_kmer_array(X_index)
print("Y_pred without imputation: %s" % (Y_pred_without_imputation,))
mse_without_imputation = np.mean((Y_true - Y_pred_without_imputation) ** 2)
print("MSE w/out imputation: %f" % mse_without_imputation)
diff_squared = (Y_true - Y_pred_without_imputation) ** 2
mse_without_imputation = (diff_squared * sample_weights) / sample_weights.sum()
imputed_data_dict = create_imputed_datasets(
allele_data_dict, MICE(n_imputations=25))
a0205_data_with_imputation = imputed_data_dict["HLA-A0205"]
print("Imputed data, # samples = %d, # original peptides = %d" % (
len(a0205_data_with_imputation.peptides),
len(set(a0205_data_with_imputation.original_peptides))))
imputed_datset = dataset.impute(MICE(n_imputations=25))
a0205_data_with_imputation = imputed_datset.get_allele("HLA-A0205")
X_index_imputed = a0205_data_with_imputation.X_index
Y_imputed = a0205_data_with_imputation.Y
sample_weights_imputed = a0205_data_with_imputation.weights
X_index_imputed, Y_imputed, sample_weights_imputed, _ = \
a0205_data_with_imputation.kmer_index_encoding()
predictor_with_imputation = \
Class1BindingPredictor.from_hyperparameters(name="A0205-impute")
......@@ -93,7 +80,9 @@ def test_performance_improves_for_A0205_with_pretraining():
Y_pred_with_imputation = \
predictor_with_imputation.predict_scores_for_kmer_array(X_index)
mse_with_imputation = np.mean((Y_true - Y_pred_with_imputation) ** 2)
diff_squared = (Y_true - Y_pred_with_imputation) ** 2
mse_with_imputation = (diff_squared * sample_weights_imputed) / sample_weights_imputed.sum()
print("MSE w/ imputation: %f" % (mse_with_imputation,))
assert mse_with_imputation < mse_without_imputation, \
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment