From 13adffc644560b66f763905bce7d45a0537878b7 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Tue, 1 Oct 2019 17:14:18 -0400
Subject: [PATCH] switch mass spec benchmark generation to chr1 proteins only

---
 .../GENERATE.WITH_HPC_CLUSTER.sh                    | 13 +++++++------
 .../write_proteome_peptides.py                      | 11 +++++++++++
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/downloads-generation/data_mass_spec_benchmark/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/data_mass_spec_benchmark/GENERATE.WITH_HPC_CLUSTER.sh
index 74de3546..d1fe18a9 100755
--- a/downloads-generation/data_mass_spec_benchmark/GENERATE.WITH_HPC_CLUSTER.sh
+++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.WITH_HPC_CLUSTER.sh
@@ -35,9 +35,8 @@ REFERENCES_DIR=$(mhcflurry-downloads path data_references)
 python write_proteome_peptides.py \
     "$PEPTIDES" \
     "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
-    --out proteome_peptides.csv
-ls -lh proteome_peptides.csv
-bzip2 proteome_peptides.csv
+    --chromosome 1 \
+    --out proteome_peptides.chr1.csv
 
 python write_allele_list.py "$PEPTIDES" --out alleles.txt
 
@@ -46,12 +45,12 @@ mkdir predictions
 for kind in with_mass_spec no_mass_spec
 do
     python run_mhcflurry.py \
-        proteome_peptides.csv.bz2 \
-        --chunk-size 1000000 \
+        proteome_peptides.chr1.csv \
+        --chunk-size 100000 \
         --batch-size 65536 \
         --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
         --allele $(cat alleles.txt) \
-        --out "predictions/mhcflurry.$kind" \
+        --out "predictions/chr1.mhcflurry.$kind" \
         --verbosity 1 \
         --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
         --cluster-parallelism \
@@ -61,6 +60,8 @@ do
         --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf
 done
 
+bzip2 proteome_peptides.chr1.csv
+
 cp $SCRIPT_ABSOLUTE_PATH .
 bzip2 LOG.txt
 RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
diff --git a/downloads-generation/data_mass_spec_benchmark/write_proteome_peptides.py b/downloads-generation/data_mass_spec_benchmark/write_proteome_peptides.py
index e18daf9e..29f9f728 100644
--- a/downloads-generation/data_mass_spec_benchmark/write_proteome_peptides.py
+++ b/downloads-generation/data_mass_spec_benchmark/write_proteome_peptides.py
@@ -27,6 +27,11 @@ parser.add_argument(
     "--out",
     metavar="OUT.csv",
     help="Out file path")
+parser.add_argument(
+    "--chromosome",
+    metavar="CHR",
+    nargs="+",
+    help="Use only proteins from the specified chromosome(s)")
 parser.add_argument(
     "--debug-max-rows",
     metavar="N",
@@ -62,6 +67,12 @@ def run():
     df = df.loc[~df.protein_ensembl_primary.isnull()]
     print("After: ", len(df))
 
+    if args.chromosome:
+        print("Subselecting to chromosome(s): ", *args.chromosome)
+        print("Before: ", len(df))
+        df = df.loc[df.protein_primary_ensembl_contig.isin(args.chromosome)]
+        print("After: ", len(df))
+
     (flanking_length,) = list(
         set(df.n_flank.str.len().unique()).union(
             set(df.n_flank.str.len().unique())))
-- 
GitLab