From b50dc80c6ec0874a0b4a61873506847ae9e3471b Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Sun, 15 Sep 2019 21:40:41 -0400 Subject: [PATCH] Drop insufficiently specific allele names like 'HLA-A03' in data curation script. Fixes #119 --- downloads-generation/data_curated/curate.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/downloads-generation/data_curated/curate.py b/downloads-generation/data_curated/curate.py index 08994ee3..5f99ccd5 100755 --- a/downloads-generation/data_curated/curate.py +++ b/downloads-generation/data_curated/curate.py @@ -134,6 +134,18 @@ def load_data_iedb(iedb_csv, include_qualitative=True, include_mass_spec=False): (~iedb_df["Allele Name"].str.contains("CD1")) ] + # Drop insufficiently specific allele names like "HLA-A03": + insuffient_mask = ( + (~iedb_df["Allele Name"].str.upper().str.startswith("H2-")) & + (~iedb_df["Allele Name"].str.upper().str.startswith("H-2-")) & + (~iedb_df["Allele Name"].str.upper().str.startswith("MAMU")) & + (iedb_df["Allele Name"].str.findall("[0-9]").str.len() < 4) + ) + print("Dropping %d records with insufficiently-specific allele names:" % + insuffient_mask.sum()) + print(iedb_df.loc[insuffient_mask]["Allele Name"].value_counts()) + iedb_df = iedb_df.loc[~insuffient_mask] + iedb_df["allele"] = iedb_df["Allele Name"].map(normalize_allele_name) print("Dropping un-parseable alleles: %s" % ", ".join( iedb_df.loc[iedb_df.allele == "UNKNOWN"]["Allele Name"].unique())) -- GitLab