Skip to content
Snippets Groups Projects
Commit b50dc80c authored by Tim O'Donnell
Browse files

Drop insufficiently specific allele names like 'HLA-A03' in data curation script. Fixes #119

parent b910d1e7
No related branches found
No related tags found
No related merge requests found
...@@ -134,6 +134,18 @@ def load_data_iedb(iedb_csv, include_qualitative=True, include_mass_spec=False): ...@@ -134,6 +134,18 @@ def load_data_iedb(iedb_csv, include_qualitative=True, include_mass_spec=False):
(~iedb_df["Allele Name"].str.contains("CD1")) (~iedb_df["Allele Name"].str.contains("CD1"))
] ]
# Drop insufficiently specific allele names like "HLA-A03":
insuffient_mask = (
(~iedb_df["Allele Name"].str.upper().str.startswith("H2-")) &
(~iedb_df["Allele Name"].str.upper().str.startswith("H-2-")) &
(~iedb_df["Allele Name"].str.upper().str.startswith("MAMU")) &
(iedb_df["Allele Name"].str.findall("[0-9]").str.len() < 4)
)
print("Dropping %d records with insufficiently-specific allele names:" %
insuffient_mask.sum())
print(iedb_df.loc[insuffient_mask]["Allele Name"].value_counts())
iedb_df = iedb_df.loc[~insuffient_mask]
iedb_df["allele"] = iedb_df["Allele Name"].map(normalize_allele_name) iedb_df["allele"] = iedb_df["Allele Name"].map(normalize_allele_name)
print("Dropping un-parseable alleles: %s" % ", ".join( print("Dropping un-parseable alleles: %s" % ", ".join(
iedb_df.loc[iedb_df.allele == "UNKNOWN"]["Allele Name"].unique())) iedb_df.loc[iedb_df.allele == "UNKNOWN"]["Allele Name"].unique()))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment