From 3b83c2bdc90e64d8b6e311cc0ffcedd876f71cee Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Thu, 23 Jun 2016 18:36:24 -0400
Subject: [PATCH] filter modified peptides from generated dataset

---
 mhcflurry/package_metadata.py            |  2 +-
 script/create-combined-class1-dataset.py | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/mhcflurry/package_metadata.py b/mhcflurry/package_metadata.py
index ebb027f3..b6154148 100644
--- a/mhcflurry/package_metadata.py
+++ b/mhcflurry/package_metadata.py
@@ -1,2 +1,2 @@
-__version__ = "0.0.6"
+__version__ = "0.0.7"
 
diff --git a/script/create-combined-class1-dataset.py b/script/create-combined-class1-dataset.py
index fbfad25a..b75fbca6 100755
--- a/script/create-combined-class1-dataset.py
+++ b/script/create-combined-class1-dataset.py
@@ -168,6 +168,18 @@ if __name__ == "__main__":
     combined_df = pd.DataFrame(
         combined_columns,
         columns=["species", "mhc", "peptide", "peptide_length", "meas"])
+
+    # filter out post-translation modifications and peptides with unknown
+    # residues; "+" must be matched literally (regex=False) because
+    # str.contains treats its pattern as a regular expression by default
+    # and a bare "+" is not a valid regex (re.error: nothing to repeat)
+    modified_peptide_mask = combined_df.peptide.str.contains(
+        "+", regex=False)
+    n_modified = modified_peptide_mask.sum()
+    if n_modified > 0:
+        print("Dropping %d modified peptides" % n_modified)
+        combined_df = combined_df[~modified_peptide_mask]
+
     print("New entry allele distribution")
     for (allele, count) in new_allele_counts.most_common():
         print("%s: %d" % (allele, count))
-- 
GitLab