Commit 9192933a authored by Tim O'Donnell

First cut on data_references and data_mass_spec_annotated downloads

parent 15b85d7c
#!/bin/bash
#
# Generate the data_mass_spec_annotated download: curated mass spec hits
# annotated with the proteins containing each peptide.
#
set -e
set -x
DOWNLOAD_NAME=data_mass_spec_annotated
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_DIR/annotate.py .
INPUT=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2
python annotate.py "$INPUT" --out annotated_ms.csv
bzip2 annotated_ms.csv
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $RESULT"
"""
"""
import sys
import argparse
import os
import collections
from six.moves import StringIO
import pandas
import mhcnames
def normalize_allele_name(s):
    try:
        return mhcnames.normalize_allele_name(s)
    except Exception:
        return "UNKNOWN"
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
    "input_path",
    help="CSV of curated mass spec hits to annotate")
parser.add_argument(
    "--out",
    metavar="OUT.csv",
    help="Output CSV path")
# Build (or load, if prebuilt) an FM index over the human proteome.
PREBUILT_INDEX = "datasets/uniprot-proteome_UP000005640.fasta.gz.fm"
USE_PREBUILT_INDEX = os.path.exists(PREBUILT_INDEX)
print("Using prebuilt index", USE_PREBUILT_INDEX)

fm = shellinford.FMIndex()
if USE_PREBUILT_INDEX:
    fm.read(PREBUILT_INDEX)

fm_keys = []
protein_database = "datasets/uniprot-proteome_UP000005640.fasta.gz"
start = time.time()
proteome_df = []
with gzip.open(protein_database, "rt") as fd:
    records = SeqIO.parse(fd, format='fasta')
    for (i, record) in enumerate(records):
        if i % 10000 == 0:
            print(i, time.time() - start)
        fm_keys.append(record.name)
        proteome_df.append((record.name, record.description, str(record.seq)))
        if not USE_PREBUILT_INDEX:
            fm.push_back("$" + str(record.seq) + "$")  # include sentinels

if not USE_PREBUILT_INDEX:
    print("Building")
    start = time.time()
    fm.build()
    print("Done building", time.time() - start)
    fm.write(PREBUILT_INDEX)

proteome_df = pandas.DataFrame(
    proteome_df, columns=["name", "description", "seq"]).set_index("name")
SEARCH_CACHE = {}

def search(peptide, fm=fm):
    if peptide in SEARCH_CACHE:
        return SEARCH_CACHE[peptide]
    hits = fm.search(peptide)
    # doc_ids are assigned in push_back order, so they are positional row
    # numbers into proteome_df.
    result = proteome_df.iloc[[hit.doc_id for hit in hits]]
    assert result.seq.str.contains(peptide).all(), (peptide, result)
    names = result.index.tolist()
    SEARCH_CACHE[peptide] = names
    return names
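# Spot-check the index with a few peptides. (SIINFEKL is the classic chicken
# ovalbumin epitope; the others presumably occur in human proteins.)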
print(search("SIINFEKL"))
print(search("AAAAAKVPA"))
print(search("AAAAALQAK"))
print(search("DEGPLDVSM"))
def run():
    args = parser.parse_args(sys.argv[1:])
    df = pandas.read_csv(args.input_path)
    print("Read input", df.shape)
    # Annotate each hit with the names of the proteins containing its
    # peptide. (Assumes the curated input has a "peptide" column.)
    df["protein_names"] = df.peptide.map(search)
    df.to_csv(args.out, index=False)
    print("Wrote: %s" % os.path.abspath(args.out))

if __name__ == '__main__':
    run()
#!/bin/bash
#
# Generate the data_references download: the UniProt human reference
# proteome, as a CSV of proteins plus an FM index for fast peptide lookup.
#
set -e
set -x
DOWNLOAD_NAME=data_references
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
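# (tee -ia appends to the log while ignoring interrupts, so output is both
# echoed to the console and captured in the LOG.txt shipped with the archive.)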
date
cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_DIR/process.py .
############################################
# UNIPROT
############################################
# Human reference proteome (UP000005640, taxon 9606) from the current release.
wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606.fasta.gz
wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606.gene2acc.gz
wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606.idmapping.gz
wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.fasta.gz
wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.miss.gz
wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_additional.fasta.gz
python process.py \
UP000005640_9606.fasta.gz UP000005640_9606_additional.fasta.gz \
--out-csv uniprot_proteins.csv --out-index uniprot_proteins.fm
ls -lh uniprot_proteins.csv uniprot_proteins.fm
bzip2 uniprot_proteins.csv
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $RESULT"
# data_mass_spec_annotated
On OS X, if you encounter problems installing shellinford, try this:
```
CXXFLAGS="-stdlib=libc++" CPPFLAGS="-stdlib=libc++" pip install shellinford
```
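
To build this download, run GENERATE.sh. Once published, the result should be
retrievable with the standard mhcflurry-downloads tooling, e.g. (assuming the
download is registered under this name):

```
mhcflurry-downloads fetch data_mass_spec_annotated
mhcflurry-downloads path data_mass_spec_annotated
```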
"""
"""
import sys
import argparse
import os
import gzip
import pandas
import shellinford
from Bio import SeqIO
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
    "input_paths",
    nargs="+",
    help="FASTA files to process (gzip-compressed)")
parser.add_argument(
    "--out-csv",
    required=True,
    metavar="FILE.csv",
    help="CSV output")
parser.add_argument(
    "--out-index",
    required=True,
    metavar="FILE.fm",
    help="Index output")
def run():
    args = parser.parse_args(sys.argv[1:])
    fm = shellinford.FMIndex()
    df = []
    for f in args.input_paths:
        print("Processing", f)
        with gzip.open(f, "rt") as fd:
            records = SeqIO.parse(fd, format='fasta')
            for (i, record) in enumerate(records):
                seq = str(record.seq).upper()
                df.append((record.name, record.description, seq))
                fm.push_back("$" + seq + "$")  # include sentinels
    df = pandas.DataFrame(df, columns=["name", "description", "seq"])
    print("Done reading fastas")
    print(df)
    print("Building index")
    fm.build()
    fm.write(args.out_index)
    print("Wrote:", os.path.abspath(args.out_index))
    df.to_csv(args.out_csv, index=True)
    print("Wrote:", os.path.abspath(args.out_csv))

if __name__ == '__main__':
    run()
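
For reference, a minimal sketch (not part of the scripts above) of how the two
outputs of process.py can be queried together, mirroring the lookup logic in
annotate.py:

```
import pandas
import shellinford

# Outputs of process.py as named in GENERATE.sh; the CSV is bzip2-compressed
# in the final archive and pandas infers the compression from the extension.
proteins = pandas.read_csv("uniprot_proteins.csv.bz2", index_col=0)
fm = shellinford.FMIndex()
fm.read("uniprot_proteins.fm")

# doc_ids follow push_back order, so they index CSV rows positionally.
hits = fm.search("SIINFEKL")
print(proteins.iloc[[hit.doc_id for hit in hits]])
```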
shellinford
biopython