diff --git a/downloads-generation/data_curated/curate.py b/downloads-generation/data_curated/curate.py index 8f03c2ccc279ac7fe0baa2714eec043744ab05b1..49eb75c50cd1827c40ad49c2d38d960101c6c43c 100755 --- a/downloads-generation/data_curated/curate.py +++ b/downloads-generation/data_curated/curate.py @@ -75,6 +75,12 @@ def load_data_iedb(iedb_csv, include_qualitative=True): iedb_df = pandas.read_csv(iedb_csv, skiprows=1, low_memory=False) print("Loaded iedb data: %s" % str(iedb_df.shape)) + print("Selecting only class I") + iedb_df = iedb_df.ix[ + iedb_df["MHC allele class"].str.strip().str.upper() == "I" + ] + print("New shape: %s" % str(iedb_df.shape)) + print("Dropping known unusuable alleles") iedb_df = iedb_df.ix[ ~iedb_df["Allele Name"].isin(EXCLUDE_IEDB_ALLELES) diff --git a/test/test_affinity_measurement_dataset.py b/test/test_affinity_measurement_dataset.py deleted file mode 100644 index 4ecc71d4372d6010ae68c36e4e92c8b762dba0a2..0000000000000000000000000000000000000000 --- a/test/test_affinity_measurement_dataset.py +++ /dev/null @@ -1,87 +0,0 @@ -from nose.tools import eq_ -from mhcflurry.affinity_measurement_dataset import AffinityMeasurementDataset - -def test_create_allele_data_from_single_allele_dict(): - peptide_to_ic50_dict = { - ("A" * 10): 1.2, - ("C" * 9): 1000, - } - dataset = AffinityMeasurementDataset.from_single_allele_dictionary( - allele_name="A0201", - peptide_to_affinity_dict=peptide_to_ic50_dict) - assert isinstance(dataset, AffinityMeasurementDataset) - - eq_(len(peptide_to_ic50_dict), len(dataset)) - expected_peptides = set([ - "A" * 10, - "C" * 9, - ]) - for pi, pj in zip(sorted(expected_peptides), sorted(dataset.peptides)): - eq_(pi, pj) - for pi, pj in zip(sorted(expected_peptides), sorted(dataset.unique_peptides())): - eq_(pi, pj) - -def test_dataset_random_split(): - dataset = AffinityMeasurementDataset.from_nested_dictionary({ - "H-2-Kb": { - "SIINFEKL": 10.0, - "FEKLSIIN": 20000.0, - "SIFEKLIN": 50000.0, - }}) - left, right = dataset.random_split(n=2) - assert len(left) == 2 - assert len(right) == 1 - -def test_dataset_difference(): - dataset1 = AffinityMeasurementDataset.from_nested_dictionary({ - "H-2-Kb": { - "SIINFEKL": 10.0, - "FEKLSIIN": 20000.0, - "SIFEKLIN": 50000.0, - }}) - dataset2 = AffinityMeasurementDataset.from_nested_dictionary({"H-2-Kb": {"SIINFEKL": 10.0}}) - dataset_diff = dataset1.difference(dataset2) - expected_result = AffinityMeasurementDataset.from_nested_dictionary({ - "H-2-Kb": { - "FEKLSIIN": 20000.0, - "SIFEKLIN": 50000.0, - }}) - eq_(dataset_diff, expected_result) - - -def test_dataset_intersection(): - dataset1 = AffinityMeasurementDataset.from_nested_dictionary({ - "H-2-Kb": { - "SIINFEKL": 10.0, - "FEKLSIIN": 20000.0, - "SIFEKLIN": 50000.0, - }}) - dataset2 = AffinityMeasurementDataset.from_nested_dictionary({"H-2-Kb": {"SIINFEKL": 30.0}}) - dataset_intersection = dataset1.intersection(dataset2) - expected_result = AffinityMeasurementDataset.from_nested_dictionary({ - "H-2-Kb": {"SIINFEKL": 10.0}}) - eq_(dataset_intersection, expected_result) - -def test_dataset_cross_validation(): - dataset = AffinityMeasurementDataset.from_nested_dictionary({ - "H-2-Kb": { - "SIINFEKL": 10.0, - "FEKLSIIN": 20000.0, - "SIFEKLIN": 50000.0, - }, - "HLA-A*02:01": {"ASASAS": 1.0, "CCC": 0.0}}) - - fold_count = 0 - for train_dataset, test_dataset in dataset.cross_validation_iterator( - test_allele="HLA-A*02:01", - n_folds=2): - assert train_dataset.unique_alleles() == {"H-2-Kb", "HLA-A*02:01"} - assert test_dataset.unique_alleles() == {"HLA-A*02:01"} - assert len(test_dataset) == 1 - fold_count += 1 - assert fold_count == 2 - -if __name__ == "__main__": - test_create_allele_data_from_single_allele_dict() - test_dataset_random_split() - test_dataset_difference()