Skip to content
Snippets Groups Projects
Commit 1687def1 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

Add data_mass_spec_benchmark download

parent a3325b8f
No related branches found
No related tags found
No related merge requests found
......@@ -35,7 +35,6 @@ python annotate.py \
"${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
"${REFERENCES_DIR}/uniprot_proteins.fm" \
--out annotated_ms.csv
bzip2 annotated_ms.csv
cp $SCRIPT_ABSOLUTE_PATH .
......
#!/bin/bash
#
#
set -e
set -x
DOWNLOAD_NAME=data_mass_spec_benchmark
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_DIR/write_proteome_peptides.py .
PEPTIDES=$(mhcflurry-downloads path data_mass_spec_annotated)/all_sequences.csv.bz2
REFERENCES_DIR=$(mhcflurry-downloads path data_references)
python write_proteome_peptides.py \
"$PEPTIDES" \
"${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
--out proteome_peptides.csv
ls -lh proteome_peptides.csv
bzip2 proteome_peptides.csv
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $RESULT"
"""
"""
import sys
import argparse
import os
import time
import collections
import re
from six.moves import StringIO
import pandas
import tqdm # progress bar
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
import shellinford
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"input",
metavar="FILE.csv",
help="CSV of annotated mass spec hits")
parser.add_argument(
"reference_csv",
metavar="FILE.csv",
help="CSV of protein sequences")
parser.add_argument(
"--out",
metavar="OUT.csv",
help="Out file path")
parser.add_argument(
"--debug-max-rows",
metavar="N",
type=int,
default=10,
help="Max rows to process. Useful for debugging. If specified an ipdb "
"debugging session is also opened at the end of the script")
parser.add_argument(
"--lengths",
metavar="N",
type=int,
nargs="+",
default=[8,9,10,11],
help="Peptide lengths")
def run():
args = parser.parse_args(sys.argv[1:])
df_original = pandas.read_csv(args.input)
df = df_original
print("Read peptides", df.shape, *df.columns.tolist())
reference_df = pandas.read_csv(args.reference_csv, index_col=0)
reference_df = reference_df.set_index("accession")
print("Read proteins", reference_df.shape, *reference_df.columns.tolist())
print("Subselecting to MHC I hits. Before: ", len(df))
df = df.loc[df.mhc_class == "I"]
print("After: ", len(df))
print("Subselecting to gene-associated hits. Before: ", len(df))
df = df.loc[~df.protein_ensembl_primary.isnull()]
print("After: ", len(df))
(flanking_length,) = list(
set(df.n_flank.str.len().unique()).union(
set(df.n_flank.str.len().unique())))
print("Flanking length", flanking_length)
proteins = df.protein_accession.unique()
if args.debug_max_rows:
proteins = proteins[:args.debug_max_rows]
print("Writing decoys for %d proteins" % len(proteins))
reference_df = reference_df.loc[proteins]
lengths = sorted(args.lengths)
rows = []
total = len(reference_df)
for (accession, info) in tqdm.tqdm(reference_df.iterrows(), total=total):
seq = info.seq
for start in range(0, len(seq) - min(args.lengths)):
for length in lengths:
end_pos = start + length
if end_pos > len(seq):
break
n_flank = seq[
max(start - flanking_length, 0) : start
].rjust(flanking_length, 'X')
c_flank = seq[
end_pos : (end_pos + flanking_length)
].ljust(flanking_length, 'X')
peptide = seq[start : start + length]
rows.append((
accession,
peptide,
n_flank,
c_flank,
start
))
result_df = pandas.DataFrame(
rows,
columns=[
"protein_accession",
"peptide",
"n_flank",
"c_flank",
"start_position",
])
result_df.to_csv(args.out, index=False)
print("Wrote: %s" % os.path.abspath(args.out))
if args.debug_max_rows:
# Leave user in a debugger
import ipdb
ipdb.set_trace()
if __name__ == '__main__':
run()
......@@ -29,6 +29,10 @@ releases:
- https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/models_class1_pan_unselected.20190924.tar.bz2.part.aa
default: false
- name: data_mass_spec_annotated
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_mass_spec_annotated.20190930.tar.bz2
default: false
- name: data_references
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2
default: false
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment