keep only class I data in curate.py

32ce51d3 · Tim O'Donnell · b651138e · 32ce51d3 · b651138e
Commit 32ce51d3 authored 7 years ago by Tim O'Donnell
--- a/downloads-generation/data_curated/curate.py
+++ b/downloads-generation/data_curated/curate.py
@@ -75,6 +75,12 @@ def load_data_iedb(iedb_csv, include_qualitative=True):
    iedb_df = pandas.read_csv(iedb_csv, skiprows=1, low_memory=False)
    print("Loaded iedb data: %s" % str(iedb_df.shape))

+    print("Selecting only class I")
+    iedb_df = iedb_df.ix[
+        iedb_df["MHC allele class"].str.strip().str.upper() == "I"
+    ]
+    print("New shape: %s" % str(iedb_df.shape))
+
    print("Dropping known unusuable alleles")
    iedb_df = iedb_df.ix[
        ~iedb_df["Allele Name"].isin(EXCLUDE_IEDB_ALLELES)

--- a/test/test_affinity_measurement_dataset.py
+++ b/test/test_affinity_measurement_dataset.py
-from nose.tools import eq_
-from mhcflurry.affinity_measurement_dataset import AffinityMeasurementDataset
-
-def test_create_allele_data_from_single_allele_dict():
-    peptide_to_ic50_dict = {
-        ("A" * 10): 1.2,
-        ("C" * 9): 1000,
-    }
-    dataset = AffinityMeasurementDataset.from_single_allele_dictionary(
-        allele_name="A0201",
-        peptide_to_affinity_dict=peptide_to_ic50_dict)
-    assert isinstance(dataset, AffinityMeasurementDataset)
-
-    eq_(len(peptide_to_ic50_dict), len(dataset))
-    expected_peptides = set([
-        "A" * 10,
-        "C" * 9,
-    ])
-    for pi, pj in zip(sorted(expected_peptides), sorted(dataset.peptides)):
-        eq_(pi, pj)
-    for pi, pj in zip(sorted(expected_peptides), sorted(dataset.unique_peptides())):
-        eq_(pi, pj)
-
-def test_dataset_random_split():
-    dataset = AffinityMeasurementDataset.from_nested_dictionary({
-        "H-2-Kb": {
-            "SIINFEKL": 10.0,
-            "FEKLSIIN": 20000.0,
-            "SIFEKLIN": 50000.0,
-        }})
-    left, right = dataset.random_split(n=2)
-    assert len(left) == 2
-    assert len(right) == 1
-
-def test_dataset_difference():
-    dataset1 = AffinityMeasurementDataset.from_nested_dictionary({
-        "H-2-Kb": {
-            "SIINFEKL": 10.0,
-            "FEKLSIIN": 20000.0,
-            "SIFEKLIN": 50000.0,
-        }})
-    dataset2 = AffinityMeasurementDataset.from_nested_dictionary({"H-2-Kb": {"SIINFEKL": 10.0}})
-    dataset_diff = dataset1.difference(dataset2)
-    expected_result = AffinityMeasurementDataset.from_nested_dictionary({
-        "H-2-Kb": {
-            "FEKLSIIN": 20000.0,
-            "SIFEKLIN": 50000.0,
-        }})
-    eq_(dataset_diff, expected_result)
-
-
-def test_dataset_intersection():
-    dataset1 = AffinityMeasurementDataset.from_nested_dictionary({
-        "H-2-Kb": {
-            "SIINFEKL": 10.0,
-            "FEKLSIIN": 20000.0,
-            "SIFEKLIN": 50000.0,
-        }})
-    dataset2 = AffinityMeasurementDataset.from_nested_dictionary({"H-2-Kb": {"SIINFEKL": 30.0}})
-    dataset_intersection = dataset1.intersection(dataset2)
-    expected_result = AffinityMeasurementDataset.from_nested_dictionary({
-        "H-2-Kb": {"SIINFEKL": 10.0}})
-    eq_(dataset_intersection, expected_result)
-
-def test_dataset_cross_validation():
-    dataset = AffinityMeasurementDataset.from_nested_dictionary({
-        "H-2-Kb": {
-            "SIINFEKL": 10.0,
-            "FEKLSIIN": 20000.0,
-            "SIFEKLIN": 50000.0,
-        },
-        "HLA-A*02:01": {"ASASAS": 1.0, "CCC": 0.0}})
-
-    fold_count = 0
-    for train_dataset, test_dataset in dataset.cross_validation_iterator(
-            test_allele="HLA-A*02:01",
-            n_folds=2):
-        assert train_dataset.unique_alleles() == {"H-2-Kb", "HLA-A*02:01"}
-        assert test_dataset.unique_alleles() == {"HLA-A*02:01"}
-        assert len(test_dataset) == 1
-        fold_count += 1
-    assert fold_count == 2
-
-if __name__ == "__main__":
-    test_create_allele_data_from_single_allele_dict()
-    test_dataset_random_split()
-    test_dataset_difference()