diff --git a/.travis.yml b/.travis.yml
index 4a81675a84c60db8eb5fbebf3a24d893b18c021f..5e15afcd37249cb69df8e3aa2408c3c69085289a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -39,14 +39,17 @@ script:
   # download training data
   - script/download-kim-2013-dataset.sh
   - script/download-iedb.sh
-  # only install data for A*01:01, A*02:01, HLA-A*02:05 for testing
-  - script/create-iedb-class1-dataset.py --alleles HLA-A*01:01 HLA-A*02:01 HLA-A*02:05
+  # only install data for A*01:01, A*02:01, HLA-A*02:05, HLA-B*07:02, H-2-KD
+  # for testing
+  - >
+    script/create-iedb-class1-dataset.py
+    --alleles HLA-A*01:01 HLA-A*02:01 HLA-A*02:05 HLA-B*07:02 H-2-KD
   - script/create-combined-class1-dataset.py
-  # only installing A0101, A0201, A0205 for testing purposes and with very limited
-  # training
+  # only installing A0101, A0201, A0205, B0702, H-2-Kd for testing purposes and
+  # with very limited training
   - >
     mhcflurry-train-class1-allele-specific-models.py
-    --alleles HLA-A0101 HLA-A0201 HLA-A0205
+    --alleles HLA-A0101 HLA-A0201 HLA-A0205 HLA-B0702 H-2-KD
     --embedding-size 10
     --hidden-layer-size 10
     --training-epochs 100
""" - return { - allele_name: Dataset(group_df) - for (allele_name, group_df) - in self.to_dataframe().groupby("allele") - } + return dict(self.groupby_allele()) def to_nested_dictionary(self, combine_fn=geometric_mean): """ @@ -187,7 +192,9 @@ class Dataset(object): # dictionary mapping each peptide to a list of affinities peptide_to_affinity_dict = defaultdict(list) peptide_to_weight_dict = defaultdict(list) - for _, peptide, affinity, sample_weight in allele_dataset.itertuples(): + for (allele, peptide), row in allele_dataset.iterrows(): + affinity = row["affinity"] + sample_weight = row["sample_weight"] peptide_to_affinity_dict[peptide].append(affinity) peptide_to_weight_dict[peptide].append(sample_weight) allele_to_peptide_to_affinity_dict[allele] = { @@ -196,6 +203,7 @@ class Dataset(object): peptide_to_weight_dict[peptide]) for peptide in peptide_to_affinity_dict.keys() } + return allele_to_peptide_to_affinity_dict @classmethod def from_sequences( @@ -254,7 +262,8 @@ class Dataset(object): @classmethod def from_nested_dictionary( - cls, allele_to_peptide_to_affinity_dict): + cls, + allele_to_peptide_to_affinity_dict): """ Given nested dictionaries mapping allele -> peptide -> affinity, construct a Dataset with uniform sample weights. @@ -272,6 +281,13 @@ class Dataset(object): peptides=peptides, affinities=affinities) + @classmethod + def create_empty(cls): + """ + Returns an empty Dataset containing no pMHC entries. + """ + return cls.from_nested_dictionary({}) + @classmethod def from_single_allele_dictionary( cls, @@ -281,8 +297,7 @@ class Dataset(object): Given a peptide->affinity dictionary for a single allele, create a Dataset. """ - return cls.from_allele_to_peptide_to_affinity_dictionary( - {allele_name: peptide_to_affinity_dict}) + return cls.from_nested_dictionary({allele_name: peptide_to_affinity_dict}) @classmethod def from_csv( @@ -424,11 +439,18 @@ class Dataset(object): allow_unknown_amino_acids=allow_unknown_amino_acids) original_peptide_indices = np.asarray(original_peptide_indices) counts = np.asarray(counts) - assert len(original_peptide_indices) == len(self.affinities) - assert len(counts) == len(self.affinities) kmer_affinities = self.affinities[original_peptide_indices] - sample_weights = 1.0 / counts - return X_index, kmer_affinities, sample_weights, original_peptide_indices + kmer_sample_weights = self.sample_weights[original_peptide_indices] + + assert len(original_peptide_indices) == len(kmer_affinities) + assert len(counts) == len(kmer_affinities) + assert len(kmer_sample_weights) == len(kmer_affinities) + + # combine the original sample weights of varying length peptides + # with a 1/n_kmers factor for the number of kmers pulled out of each + # original peptide + combined_sample_weights = kmer_sample_weights * (1.0 / counts) + return X_index, kmer_affinities, combined_sample_weights, original_peptide_indices def to_dense_pMHC_affinity_matrix(self): """ @@ -442,11 +464,14 @@ class Dataset(object): n_peptides = len(peptides_list) alleles_list = list(sorted(self.unique_alleles())) allele_order = {a: i for (i, a) in enumerate(alleles_list)} - n_alleles = alleles_list - X = np.ones((n_peptides, n_alleles)) + n_alleles = len(alleles_list) + shape = (n_peptides, n_alleles) + X = np.ones(shape, dtype=float) for (allele, allele_dict) in allele_to_peptide_to_affinity_dict.items(): + print(allele, allele_dict) column_index = allele_order[allele] for (peptide, affinity) in allele_dict.items(): + print(peptide, affinity) row_index = peptide_order[peptide] X[row_index, 
diff --git a/mhcflurry/imputation.py b/mhcflurry/imputation.py
index bbccb6812b82f18ea5152d953ec88d6b7b472302..c9bf9c6e95243824b3b055e1b65a52634476fd98 100644
--- a/mhcflurry/imputation.py
+++ b/mhcflurry/imputation.py
@@ -138,6 +138,7 @@ def create_imputed_datasets(
     Returns dictionary mapping allele names to AlleleData objects containing
     imputed affinity values.
     """
+    assert isinstance(dataset, Dataset), "Expected Dataset, got %s" % (type(dataset),)
     X_incomplete, peptide_list, allele_list = dataset.to_dense_pMHC_affinity_matrix()
     _check_dense_pMHC_array(X_incomplete, peptide_list, allele_list)
 
diff --git a/test/test_dataset.py b/test/test_dataset.py
index da551d20cb954268a6072872aada54bfefb18908..29dd612151494ba20780725ea132cb7753f9541a 100644
--- a/test/test_dataset.py
+++ b/test/test_dataset.py
@@ -6,7 +6,7 @@ def test_create_allele_data_from_single_allele_dict():
         ("A" * 10): 1.2,
         ("C" * 9): 1000,
     }
-    dataset = Dataset.from_peptide_to_affinity_dictionary(
+    dataset = Dataset.from_single_allele_dictionary(
         allele_name="A0201",
         peptide_to_affinity_dict=peptide_to_ic50_dict)
     assert isinstance(dataset, Dataset)
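
The new assert in create_imputed_datasets makes its Dataset-only contract explicit, and the test rename
matches the from_single_allele_dictionary constructor kept above. A hedged sketch of the imputation entry
point that the tests in the next file exercise; the alleles and affinities are copied from those tests,
and MICE comes from the fancyimpute package already imported there:

    from fancyimpute import MICE
    from mhcflurry.dataset import Dataset
    from mhcflurry.imputation import create_imputed_datasets

    dataset = Dataset.from_nested_dictionary({
        "HLA-A*02:01": {"A" * 9: 20.0, "C" * 9: 40000.0, "S" * 9: 500.0},
        "HLA-A*02:05": {"S" * 9: 500.0, "A" * 9: 25.0},
    })

    # create_imputed_datasets rejects anything that is not a Dataset (see the
    # assert above) and imputes the missing peptide/allele affinities with the
    # supplied fancyimpute imputer.
    imputed = create_imputed_datasets(dataset, imputer=MICE(n_imputations=25))
    for allele_name, allele_dataset in imputed.groupby_allele():
        print(allele_name, set(allele_dataset.peptides))
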
print("Without imputation, # samples = %d, # original peptides = %d" % ( - len(a0205_data_without_imputation.peptides), - len(set(a0205_data_without_imputation.original_peptides)))) - - print(set(a0205_data_without_imputation.original_peptides)) - X_index = a0205_data_without_imputation.X_index - Y_true = a0205_data_without_imputation.Y - sample_weights = a0205_data_without_imputation.weights - + X_index, Y_true, sample_weights, _ = \ + a0205_data_without_imputation.kmer_index_encoding() predictor_without_imputation.fit_kmer_encoded_arrays( X=X_index, Y=Y_true, @@ -64,20 +57,14 @@ def test_performance_improves_for_A0205_with_pretraining(): Y_pred_without_imputation = \ predictor_without_imputation.predict_scores_for_kmer_array(X_index) - print("Y_pred without imputation: %s" % (Y_pred_without_imputation,)) - mse_without_imputation = np.mean((Y_true - Y_pred_without_imputation) ** 2) - print("MSE w/out imputation: %f" % mse_without_imputation) + diff_squared = (Y_true - Y_pred_without_imputation) ** 2 + mse_without_imputation = (diff_squared * sample_weights) / sample_weights.sum() - imputed_data_dict = create_imputed_datasets( - allele_data_dict, MICE(n_imputations=25)) - a0205_data_with_imputation = imputed_data_dict["HLA-A0205"] - print("Imputed data, # samples = %d, # original peptides = %d" % ( - len(a0205_data_with_imputation.peptides), - len(set(a0205_data_with_imputation.original_peptides)))) + imputed_datset = dataset.impute(MICE(n_imputations=25)) + a0205_data_with_imputation = imputed_datset.get_allele("HLA-A0205") - X_index_imputed = a0205_data_with_imputation.X_index - Y_imputed = a0205_data_with_imputation.Y - sample_weights_imputed = a0205_data_with_imputation.weights + X_index_imputed, Y_imputed, sample_weights_imputed, _ = \ + a0205_data_with_imputation.kmer_index_encoding() predictor_with_imputation = \ Class1BindingPredictor.from_hyperparameters(name="A0205-impute") @@ -93,7 +80,9 @@ def test_performance_improves_for_A0205_with_pretraining(): Y_pred_with_imputation = \ predictor_with_imputation.predict_scores_for_kmer_array(X_index) - mse_with_imputation = np.mean((Y_true - Y_pred_with_imputation) ** 2) + + diff_squared = (Y_true - Y_pred_with_imputation) ** 2 + mse_with_imputation = (diff_squared * sample_weights_imputed) / sample_weights_imputed.sum() print("MSE w/ imputation: %f" % (mse_with_imputation,)) assert mse_with_imputation < mse_without_imputation, \