From 9192933adbaf5b6babad633a0d46b2bb2e35130b Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Fri, 27 Sep 2019 14:39:27 -0400
Subject: [PATCH] First cut on data_references and data_mass_spec_annotated
 downloads

---
 .../data_mass_spec_annotated/GENERATE.sh |  40 ++++++++
 .../data_mass_spec_annotated/annotate.py | 100 +++++++++++++++++++
 .../data_references/GENERATE.sh          |  51 ++++++++++
 .../data_references/README.md            |   7 ++
 .../data_references/process.py           |  58 +++++++++++
 .../data_references/requirements.txt     |   2 +
 6 files changed, 258 insertions(+)
 create mode 100755 downloads-generation/data_mass_spec_annotated/GENERATE.sh
 create mode 100755 downloads-generation/data_mass_spec_annotated/annotate.py
 create mode 100755 downloads-generation/data_references/GENERATE.sh
 create mode 100644 downloads-generation/data_references/README.md
 create mode 100755 downloads-generation/data_references/process.py
 create mode 100644 downloads-generation/data_references/requirements.txt

diff --git a/downloads-generation/data_mass_spec_annotated/GENERATE.sh b/downloads-generation/data_mass_spec_annotated/GENERATE.sh
new file mode 100755
index 00000000..96d11498
--- /dev/null
+++ b/downloads-generation/data_mass_spec_annotated/GENERATE.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#
+#
+set -e
+set -x
+
+DOWNLOAD_NAME=data_mass_spec_annotated
+SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
+SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
+SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
+export PYTHONUNBUFFERED=1
+
+mkdir -p "$SCRATCH_DIR"
+rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
+mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
+
+# Send stdout and stderr to a logfile included with the archive.
+exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+
+# Log some environment info
+date
+pip freeze
+git status
+
+cd $SCRATCH_DIR/$DOWNLOAD_NAME
+
+cp $SCRIPT_DIR/annotate.py .
+
+INPUT=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2
+
+python annotate.py "$INPUT" --out annotated_ms.csv
+
+bzip2 annotated_ms.csv
+
+cp $SCRIPT_ABSOLUTE_PATH .
+bzip2 LOG.txt
+RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
+tar -cjf "$RESULT" *
+echo "Created archive: $RESULT"
diff --git a/downloads-generation/data_mass_spec_annotated/annotate.py b/downloads-generation/data_mass_spec_annotated/annotate.py
new file mode 100755
index 00000000..98cdabb0
--- /dev/null
+++ b/downloads-generation/data_mass_spec_annotated/annotate.py
@@ -0,0 +1,100 @@
+"""
+"""
+import sys
+import argparse
+import os
+import gzip
+import time
+import collections
+from six.moves import StringIO
+
+import pandas
+import shellinford
+from Bio import SeqIO
+
+import mhcnames
+
+
+def normalize_allele_name(s):
+    try:
+        return mhcnames.normalize_allele_name(s)
+    except Exception:
+        return "UNKNOWN"
+
+
+parser = argparse.ArgumentParser(usage=__doc__)
+
+parser.add_argument(
+    "input_path",
+    help="Item to curate: PMID and list of files")
+parser.add_argument(
+    "--out",
+    metavar="OUT.csv",
+    help="Out file path")
+
+
+# Build index
+PREBUILT_INDEX = "datasets/uniprot-proteome_UP000005640.fasta.gz.fm"
+USE_PREBUILT_INDEX = os.path.exists(PREBUILT_INDEX)
+print("Using prebuilt index", USE_PREBUILT_INDEX)
+
+fm = shellinford.FMIndex()
+if USE_PREBUILT_INDEX:
+    fm.read(PREBUILT_INDEX)
+
+fm_keys = []
+protein_database = "datasets/uniprot-proteome_UP000005640.fasta.gz"
+start = time.time()
+proteome_df = []
+with gzip.open(protein_database, "rt") as fd:
+    records = SeqIO.parse(fd, format='fasta')
+    for (i, record) in enumerate(records):
+        if i % 10000 == 0:
+            print(i, time.time() - start)
+        fm_keys.append(record.name)
+        proteome_df.append((record.name, record.description, str(record.seq)))
+        if not USE_PREBUILT_INDEX:
+            fm.push_back("$" + str(record.seq) + "$")  # include sentinels
+
+if not USE_PREBUILT_INDEX:
+    print("Building")
+    start = time.time()
+    fm.build()
+    print("Done building", time.time() - start)
+    fm.write(PREBUILT_INDEX)
+
+proteome_df = pandas.DataFrame(proteome_df, columns=["name", "description", "seq"]).set_index("name")
+proteome_df
+
+SEARCH_CACHE = {}
+def search(peptide, fm=fm):
+    if peptide in SEARCH_CACHE:
+        return SEARCH_CACHE[peptide]
+    hits = fm.search(peptide)
+    result = proteome_df.iloc[
+        [hit.doc_id for hit in hits]
+    ]
+    assert result.seq.str.contains(peptide).all(), (peptide, result)
+    names = result.index.tolist()
+    SEARCH_CACHE[peptide] = names
+    return names
+print(search("SIINFEKL"))
+print(search("AAAAAKVPA"))
+print(search("AAAAALQAK"))
+print(search("DEGPLDVSM"))
+
+
+
+
+def run():
+    args = parser.parse_args(sys.argv[1:])
+
+    df = pandas.read_csv(args.input_path)
+    print("Read input", df.shape)
+
+    df.to_csv(args.out, index=False)
+    print("Wrote: %s" % os.path.abspath(args.out))
+
+
+if __name__ == '__main__':
+    run()
diff --git a/downloads-generation/data_references/GENERATE.sh b/downloads-generation/data_references/GENERATE.sh
new file mode 100755
index 00000000..00575b08
--- /dev/null
+++ b/downloads-generation/data_references/GENERATE.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+#
+#
+set -e
+set -x
+
+DOWNLOAD_NAME=data_references
+SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
+SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
+SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
+
+mkdir -p "$SCRATCH_DIR"
+rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
+mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
+
+# Send stdout and stderr to a logfile included with the archive.
+exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+
+date
+
+cd $SCRATCH_DIR/$DOWNLOAD_NAME
+cp $SCRIPT_DIR/process.py .
+
+
+############################################
+# UNIPROT
+############################################
+#
+wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606.fasta.gz
+wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606.gene2acc.gz
+wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606.idmapping.gz
+wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.fasta.gz
+wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.miss.gz
+wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_additional.fasta.gz
+
+
+python process.py \
+    UP000005640_9606.fasta.gz UP000005640_9606_additional.fasta.gz \
+    --out-csv uniprot_proteins.csv --out-index uniprot_proteins.fm
+
+ls -lh uniprot_proteins.csv uniprot_proteins.fm
+
+bzip2 uniprot_proteins.csv
+
+cp $SCRIPT_ABSOLUTE_PATH .
+bzip2 LOG.txt
+RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
+tar -cjf "$RESULT" *
+echo "Created archive: $RESULT"
diff --git a/downloads-generation/data_references/README.md b/downloads-generation/data_references/README.md
new file mode 100644
index 00000000..c429f35e
--- /dev/null
+++ b/downloads-generation/data_references/README.md
@@ -0,0 +1,7 @@
+# data_references
+
+On OS X, if you encounter problems installing shellinford, try this:
+
+```
+CXXFLAGS="-stdlib=libc++" CPPFLAGS="-stdlib=libc++" pip install shellinford
+```
diff --git a/downloads-generation/data_references/process.py b/downloads-generation/data_references/process.py
new file mode 100755
index 00000000..8d03464e
--- /dev/null
+++ b/downloads-generation/data_references/process.py
@@ -0,0 +1,58 @@
+"""
+"""
+import sys
+import argparse
+import os
+import gzip
+
+import pandas
+import shellinford
+from Bio import SeqIO
+
+parser = argparse.ArgumentParser(usage=__doc__)
+
+parser.add_argument(
+    "input_paths",
+    nargs="+",
+    help="Fasta files to process")
+parser.add_argument(
+    "--out-csv",
+    required=True,
+    metavar="FILE.csv",
+    help="CSV output")
+parser.add_argument(
+    "--out-index",
+    required=True,
+    metavar="FILE.fm",
+    help="Index output")
+
+
+def run():
+    args = parser.parse_args(sys.argv[1:])
+
+    fm = shellinford.FMIndex()
+    df = []
+    for f in args.input_paths:
+        print("Processing", f)
+        with gzip.open(f, "rt") as fd:
+            records = SeqIO.parse(fd, format='fasta')
+            for (i, record) in enumerate(records):
+                seq = str(record.seq).upper()
+                df.append((record.name, record.description, seq))
+                fm.push_back("$" + seq + "$")  # include sentinels
+    df = pandas.DataFrame(df, columns=["name", "description", "seq"])
+
+    print("Done reading fastas")
+    print(df)
+
+    print("Building index")
+    fm.build()
+    fm.write(args.out_index)
+    print("Wrote: ", os.path.abspath((args.out_index)))
+
+    df.to_csv(args.out_csv, index=True)
+    print("Wrote: ", os.path.abspath((args.out_csv)))
+
+
+if __name__ == '__main__':
+    run()
diff --git a/downloads-generation/data_references/requirements.txt b/downloads-generation/data_references/requirements.txt
new file mode 100644
index 00000000..fd4acf48
--- /dev/null
+++ b/downloads-generation/data_references/requirements.txt
@@ -0,0 +1,2 @@
+shellinford
+biopython
--
GitLab
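A minimal usage sketch for the artifacts that the data_references GENERATE.sh above produces (uniprot_proteins.csv.bz2 and uniprot_proteins.fm), reusing the shellinford calls already shown in annotate.py (FMIndex.read, FMIndex.search, hit.doc_id). The relative paths and the proteins_containing helper name are illustrative assumptions, not part of the patch:

```
# Sketch: query the FM-index built by process.py. Assumes the files sit in the
# current directory (e.g. an extracted data_references download).
import pandas
import shellinford

# Protein table written by process.py with index=True, so the first CSV column
# is the integer row index; rows are in the same order as fm.push_back calls.
proteins = pandas.read_csv("uniprot_proteins.csv.bz2", index_col=0)

# FM-index over the same sequences.
fm = shellinford.FMIndex()
fm.read("uniprot_proteins.fm")

def proteins_containing(peptide):
    """Return the protein rows whose sequence contains the given peptide."""
    hits = fm.search(peptide)
    result = proteins.iloc[[hit.doc_id for hit in hits]]
    # Same sanity check as the search() helper in annotate.py.
    assert result.seq.str.contains(peptide).all(), peptide
    return result

print(proteins_containing("SIINFEKL"))
```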