diff --git a/downloads-generation/data_mass_spec_benchmark/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/data_mass_spec_benchmark/GENERATE.WITH_HPC_CLUSTER.sh index 74de35464282d94610f2fc19190a05ada11aa4ae..d1fe18a9e936ae6ee6698e935832bc7c8c366dae 100755 --- a/downloads-generation/data_mass_spec_benchmark/GENERATE.WITH_HPC_CLUSTER.sh +++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.WITH_HPC_CLUSTER.sh @@ -35,9 +35,8 @@ REFERENCES_DIR=$(mhcflurry-downloads path data_references) python write_proteome_peptides.py \ "$PEPTIDES" \ "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \ - --out proteome_peptides.csv -ls -lh proteome_peptides.csv -bzip2 proteome_peptides.csv + --chromosome 1 \ + --out proteome_peptides.chr1.csv python write_allele_list.py "$PEPTIDES" --out alleles.txt @@ -46,12 +45,12 @@ mkdir predictions for kind in with_mass_spec no_mass_spec do python run_mhcflurry.py \ - proteome_peptides.csv.bz2 \ - --chunk-size 1000000 \ + proteome_peptides.chr1.csv \ + --chunk-size 100000 \ --batch-size 65536 \ --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \ --allele $(cat alleles.txt) \ - --out "predictions/mhcflurry.$kind" \ + --out "predictions/chr1.mhcflurry.$kind" \ --verbosity 1 \ --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \ --cluster-parallelism \ @@ -61,6 +60,8 @@ do --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf done +bzip2 proteome_peptides.chr1.csv + cp $SCRIPT_ABSOLUTE_PATH . bzip2 LOG.txt RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2" diff --git a/downloads-generation/data_mass_spec_benchmark/write_proteome_peptides.py b/downloads-generation/data_mass_spec_benchmark/write_proteome_peptides.py index e18daf9ee5d2dd70779e1aed166344c2cc9aa543..29f9f728f58960846c42797f37bb3e7890bb26b5 100644 --- a/downloads-generation/data_mass_spec_benchmark/write_proteome_peptides.py +++ b/downloads-generation/data_mass_spec_benchmark/write_proteome_peptides.py @@ -27,6 +27,11 @@ parser.add_argument( "--out", metavar="OUT.csv", help="Out file path") +parser.add_argument( + "--chromosome", + metavar="CHR", + nargs="+", + help="Use only proteins from the specified chromosome(s)") parser.add_argument( "--debug-max-rows", metavar="N", @@ -62,6 +67,12 @@ def run(): df = df.loc[~df.protein_ensembl_primary.isnull()] print("After: ", len(df)) + if args.chromosome: + print("Subselecting to chromosome(s): ", *args.chromosome) + print("Before: ", len(df)) + df = df.loc[df.protein_primary_ensembl_contig.isin(args.chromosome)] + print("After: ", len(df)) + (flanking_length,) = list( set(df.n_flank.str.len().unique()).union( set(df.n_flank.str.len().unique())))