diff --git a/downloads-generation/data_mass_spec_annotated/GENERATE.sh b/downloads-generation/data_mass_spec_annotated/GENERATE.sh index e5851c1ea15f6e106a121edd26e115807bc37199..6fd9539030e657a7ead2ebf2c6e46480f868c3c4 100755 --- a/downloads-generation/data_mass_spec_annotated/GENERATE.sh +++ b/downloads-generation/data_mass_spec_annotated/GENERATE.sh @@ -35,7 +35,6 @@ python annotate.py \ "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \ "${REFERENCES_DIR}/uniprot_proteins.fm" \ --out annotated_ms.csv - bzip2 annotated_ms.csv cp $SCRIPT_ABSOLUTE_PATH . diff --git a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh new file mode 100755 index 0000000000000000000000000000000000000000..46e3804e84e1b8c463cebc8f9f0edb930e6137a0 --- /dev/null +++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# +# +set -e +set -x + +DOWNLOAD_NAME=data_mass_spec_benchmark +SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation +SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" +SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH") +export PYTHONUNBUFFERED=1 + +mkdir -p "$SCRATCH_DIR" +rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" +mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" + +# Send stdout and stderr to a logfile included with the archive. +#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") +#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) + +# Log some environment info +date +pip freeze +git status + +cd $SCRATCH_DIR/$DOWNLOAD_NAME + +cp $SCRIPT_DIR/write_proteome_peptides.py . + +PEPTIDES=$(mhcflurry-downloads path data_mass_spec_annotated)/all_sequences.csv.bz2 +REFERENCES_DIR=$(mhcflurry-downloads path data_references) + +python write_proteome_peptides.py \ + "$PEPTIDES" \ + "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \ + --out proteome_peptides.csv +ls -lh proteome_peptides.csv +bzip2 proteome_peptides.csv + +cp $SCRIPT_ABSOLUTE_PATH . +bzip2 LOG.txt +RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2" +tar -cjf "$RESULT" * +echo "Created archive: $RESULT" diff --git a/downloads-generation/data_mass_spec_benchmark/write_proteome_peptides.py b/downloads-generation/data_mass_spec_benchmark/write_proteome_peptides.py new file mode 100644 index 0000000000000000000000000000000000000000..76fa9bb2e63c96a7ad8fb3e2f863818e1225e515 --- /dev/null +++ b/downloads-generation/data_mass_spec_benchmark/write_proteome_peptides.py @@ -0,0 +1,127 @@ +""" +""" +import sys +import argparse +import os +import time +import collections +import re +from six.moves import StringIO + +import pandas +import tqdm # progress bar +tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481 + +import shellinford + + +parser = argparse.ArgumentParser(usage=__doc__) + +parser.add_argument( + "input", + metavar="FILE.csv", + help="CSV of annotated mass spec hits") +parser.add_argument( + "reference_csv", + metavar="FILE.csv", + help="CSV of protein sequences") +parser.add_argument( + "--out", + metavar="OUT.csv", + help="Out file path") +parser.add_argument( + "--debug-max-rows", + metavar="N", + type=int, + default=10, + help="Max rows to process. Useful for debugging. If specified an ipdb " + "debugging session is also opened at the end of the script") +parser.add_argument( + "--lengths", + metavar="N", + type=int, + nargs="+", + default=[8,9,10,11], + help="Peptide lengths") + + +def run(): + args = parser.parse_args(sys.argv[1:]) + + df_original = pandas.read_csv(args.input) + df = df_original + print("Read peptides", df.shape, *df.columns.tolist()) + + reference_df = pandas.read_csv(args.reference_csv, index_col=0) + reference_df = reference_df.set_index("accession") + print("Read proteins", reference_df.shape, *reference_df.columns.tolist()) + + print("Subselecting to MHC I hits. Before: ", len(df)) + df = df.loc[df.mhc_class == "I"] + print("After: ", len(df)) + + print("Subselecting to gene-associated hits. Before: ", len(df)) + df = df.loc[~df.protein_ensembl_primary.isnull()] + print("After: ", len(df)) + + (flanking_length,) = list( + set(df.n_flank.str.len().unique()).union( + set(df.n_flank.str.len().unique()))) + print("Flanking length", flanking_length) + + proteins = df.protein_accession.unique() + + if args.debug_max_rows: + proteins = proteins[:args.debug_max_rows] + + print("Writing decoys for %d proteins" % len(proteins)) + + reference_df = reference_df.loc[proteins] + + lengths = sorted(args.lengths) + rows = [] + total = len(reference_df) + for (accession, info) in tqdm.tqdm(reference_df.iterrows(), total=total): + seq = info.seq + for start in range(0, len(seq) - min(args.lengths)): + for length in lengths: + end_pos = start + length + if end_pos > len(seq): + break + n_flank = seq[ + max(start - flanking_length, 0) : start + ].rjust(flanking_length, 'X') + c_flank = seq[ + end_pos : (end_pos + flanking_length) + ].ljust(flanking_length, 'X') + peptide = seq[start : start + length] + + rows.append(( + accession, + peptide, + n_flank, + c_flank, + start + )) + + result_df = pandas.DataFrame( + rows, + columns=[ + "protein_accession", + "peptide", + "n_flank", + "c_flank", + "start_position", + ]) + + result_df.to_csv(args.out, index=False) + print("Wrote: %s" % os.path.abspath(args.out)) + + if args.debug_max_rows: + # Leave user in a debugger + import ipdb + ipdb.set_trace() + + +if __name__ == '__main__': + run() diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml index e8499087b796430ae08f0b73d846e2a6886ec0ac..6ae2b54971bc7bda301e4e59f2576178cbc2500c 100644 --- a/mhcflurry/downloads.yml +++ b/mhcflurry/downloads.yml @@ -29,6 +29,10 @@ releases: - https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/models_class1_pan_unselected.20190924.tar.bz2.part.aa default: false + - name: data_mass_spec_annotated + url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_mass_spec_annotated.20190930.tar.bz2 + default: false + - name: data_references url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2 default: false