From 3b83c2bdc90e64d8b6e311cc0ffcedd876f71cee Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Thu, 23 Jun 2016 18:36:24 -0400
Subject: [PATCH] filter modified peptides from generated dataset

---
 mhcflurry/package_metadata.py            | 2 +-
 script/create-combined-class1-dataset.py | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/mhcflurry/package_metadata.py b/mhcflurry/package_metadata.py
index ebb027f3..b6154148 100644
--- a/mhcflurry/package_metadata.py
+++ b/mhcflurry/package_metadata.py
@@ -1,2 +1,2 @@
 
-__version__ = "0.0.6"
+__version__ = "0.0.7"
diff --git a/script/create-combined-class1-dataset.py b/script/create-combined-class1-dataset.py
index fbfad25a..b75fbca6 100755
--- a/script/create-combined-class1-dataset.py
+++ b/script/create-combined-class1-dataset.py
@@ -168,6 +168,15 @@ if __name__ == "__main__":
     combined_df = pd.DataFrame(
         combined_columns,
         columns=["species", "mhc", "peptide", "peptide_length", "meas"])
+
+    # Drop peptides annotated with modifications, marked by a literal "+".
+    # regex=False: as a regex, a bare "+" raises "nothing to repeat".
+    modified_peptide_mask = combined_df.peptide.str.contains("+", regex=False)
+    n_modified = modified_peptide_mask.sum()
+    if n_modified > 0:
+        print("Dropping %d modified peptides" % n_modified)
+        combined_df = combined_df[~modified_peptide_mask]
+
     print("New entry allele distribution")
     for (allele, count) in new_allele_counts.most_common():
         print("%s: %d" % (allele, count))
-- 
GitLab