From ba9360750a7481cf8345c663cdb7c22b944da7f6 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sat, 18 Jan 2020 17:09:03 -0500
Subject: [PATCH] Cleavage train data: keep only HLA-A/B/C hits, take 2x hit multiplier

---
 downloads-generation/models_class1_cleavage/GENERATE.sh    | 2 +-
 .../models_class1_cleavage/make_train_data.py              | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/downloads-generation/models_class1_cleavage/GENERATE.sh b/downloads-generation/models_class1_cleavage/GENERATE.sh
index 730ed339..2a27b160 100755
--- a/downloads-generation/models_class1_cleavage/GENERATE.sh
+++ b/downloads-generation/models_class1_cleavage/GENERATE.sh
@@ -95,7 +95,7 @@ else
         --predictions "$(mhcflurry-downloads path data_mass_spec_benchmark)/predictions/all.mhcflurry.combined" \
         --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
         --ppv-multiplier 100 \
-        --hit-multiplier-to-take 1 \
+        --hit-multiplier-to-take 2 \
         --out "$(pwd)/train_data.csv"
     bzip2 -f train_data.csv
 fi
diff --git a/downloads-generation/models_class1_cleavage/make_train_data.py b/downloads-generation/models_class1_cleavage/make_train_data.py
index 0b3e361c..9200f824 100644
--- a/downloads-generation/models_class1_cleavage/make_train_data.py
+++ b/downloads-generation/models_class1_cleavage/make_train_data.py
@@ -80,9 +80,7 @@ def load_predictions(dirname, result_df=None, columns=None):
 
     manifest_df = manifest_df.loc[manifest_df.col.isin(result_df.columns)]
 
-    print("Will load", len(peptides), "peptides and", len(manifest_df), "cols")
-
-    for _, row in tqdm.tqdm(manifest_df.iterrows(), total=len(manifest_df)):
+    for _, row in manifest_df.iterrows():
         with open(os.path.join(dirname, row.path), "rb") as fd:
             value = numpy.load(fd)['arr_0']
             if mask is not None:
@@ -109,6 +107,9 @@ def run():
     print("Subselected to %d monoallelic samples" % hit_df.sample_id.nunique())
     hit_df["allele"] = hit_df.hla
 
+    hit_df = hit_df.loc[hit_df.allele.str.match("^HLA-[ABC]")]
+    print("Subselected to %d HLA-A/B/C samples" % hit_df.sample_id.nunique())
+
     if args.exclude_contig:
         new_hit_df = hit_df.loc[
             hit_df.protein_primary_ensembl_contig.astype(str) !=
-- 
GitLab