Skip to content
Snippets Groups Projects
Commit 32ce51d3 authored by Tim O'Donnell
Browse files

keep only class I data in curate.py

parent b651138e
No related merge requests found
......@@ -75,6 +75,12 @@ def load_data_iedb(iedb_csv, include_qualitative=True):
iedb_df = pandas.read_csv(iedb_csv, skiprows=1, low_memory=False)
print("Loaded iedb data: %s" % str(iedb_df.shape))
print("Selecting only class I")
iedb_df = iedb_df.ix[
iedb_df["MHC allele class"].str.strip().str.upper() == "I"
]
print("New shape: %s" % str(iedb_df.shape))
print("Dropping known unusuable alleles")
iedb_df = iedb_df.ix[
~iedb_df["Allele Name"].isin(EXCLUDE_IEDB_ALLELES)
......
from nose.tools import eq_
from mhcflurry.affinity_measurement_dataset import AffinityMeasurementDataset
def test_create_allele_data_from_single_allele_dict():
    # Build a dataset for one allele from a peptide -> IC50 mapping and check
    # that its type, its length and its peptides match what was supplied.
    peptide_to_ic50_dict = {
        ("A" * 10): 1.2,
        ("C" * 9): 1000,
    }
    dataset = AffinityMeasurementDataset.from_single_allele_dictionary(
        allele_name="A0201",
        peptide_to_affinity_dict=peptide_to_ic50_dict)
    assert isinstance(dataset, AffinityMeasurementDataset)
    eq_(len(peptide_to_ic50_dict), len(dataset))

    expected_peptides = sorted(["A" * 10, "C" * 9])
    # Compare the complete sorted lists rather than zip()-ing them pairwise:
    # zip() stops at the shorter input, so a missing or extra peptide would
    # otherwise pass unnoticed.
    eq_(expected_peptides, sorted(dataset.peptides))
    eq_(expected_peptides, sorted(dataset.unique_peptides()))
def test_dataset_random_split():
    # Splitting a three-entry dataset with n=2 should give parts of
    # length 2 and 1 respectively.
    kb_affinities = {
        "SIINFEKL": 10.0,
        "FEKLSIIN": 20000.0,
        "SIFEKLIN": 50000.0,
    }
    dataset = AffinityMeasurementDataset.from_nested_dictionary(
        {"H-2-Kb": kb_affinities})
    parts = dataset.random_split(n=2)
    left, right = parts
    assert len(left) == 2
    assert len(right) == 1
def test_dataset_difference():
    # dataset1.difference(dataset2) should equal a dataset built from the
    # entries of dataset1 whose peptide is not present in dataset2.
    build = AffinityMeasurementDataset.from_nested_dictionary
    dataset1 = build({
        "H-2-Kb": {
            "SIINFEKL": 10.0,
            "FEKLSIIN": 20000.0,
            "SIFEKLIN": 50000.0,
        },
    })
    dataset2 = build({"H-2-Kb": {"SIINFEKL": 10.0}})
    dataset_diff = dataset1.difference(dataset2)
    expected_result = build({
        "H-2-Kb": {
            "FEKLSIIN": 20000.0,
            "SIFEKLIN": 50000.0,
        },
    })
    eq_(dataset_diff, expected_result)
def test_dataset_intersection():
    # The two datasets share the peptide SIINFEKL but with different
    # affinities (10.0 vs 30.0); the expected intersection carries the value
    # from dataset1, the dataset the method is called on.
    build = AffinityMeasurementDataset.from_nested_dictionary
    dataset1 = build({
        "H-2-Kb": {
            "SIINFEKL": 10.0,
            "FEKLSIIN": 20000.0,
            "SIFEKLIN": 50000.0,
        },
    })
    dataset2 = build({"H-2-Kb": {"SIINFEKL": 30.0}})
    dataset_intersection = dataset1.intersection(dataset2)
    expected_result = build({"H-2-Kb": {"SIINFEKL": 10.0}})
    eq_(dataset_intersection, expected_result)
def test_dataset_cross_validation():
    # Two-fold cross-validation over the HLA-A*02:01 entries: every fold must
    # hold out exactly one HLA-A*02:01 measurement while the training set
    # still contains both alleles, and exactly two folds must be produced.
    nested_affinities = {
        "H-2-Kb": {
            "SIINFEKL": 10.0,
            "FEKLSIIN": 20000.0,
            "SIFEKLIN": 50000.0,
        },
        "HLA-A*02:01": {"ASASAS": 1.0, "CCC": 0.0},
    }
    dataset = AffinityMeasurementDataset.from_nested_dictionary(
        nested_affinities)
    folds = dataset.cross_validation_iterator(
        test_allele="HLA-A*02:01",
        n_folds=2)
    # Stays 0 if the iterator yields nothing, so the final check still fails.
    fold_count = 0
    for fold_count, (train_dataset, test_dataset) in enumerate(folds, start=1):
        assert train_dataset.unique_alleles() == {"H-2-Kb", "HLA-A*02:01"}
        assert test_dataset.unique_alleles() == {"HLA-A*02:01"}
        assert len(test_dataset) == 1
    assert fold_count == 2
if __name__ == "__main__":
    # Run every test defined in this module when executed as a script.
    test_create_allele_data_from_single_allele_dict()
    test_dataset_random_split()
    test_dataset_difference()
    test_dataset_intersection()
    test_dataset_cross_validation()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment