diff --git a/downloads-generation/data_mass_spec_annotated/GENERATE.sh b/downloads-generation/data_mass_spec_annotated/GENERATE.sh index 96d11498cacec8c152cd0c6418b775724cabf3e4..e5851c1ea15f6e106a121edd26e115807bc37199 100755 --- a/downloads-generation/data_mass_spec_annotated/GENERATE.sh +++ b/downloads-generation/data_mass_spec_annotated/GENERATE.sh @@ -15,8 +15,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" # Send stdout and stderr to a logfile included with the archive. -#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") -#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) +exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") +exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) # Log some environment info date @@ -27,11 +27,14 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME cp $SCRIPT_DIR/annotate.py . -INPUT=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2 +PEPTIDES=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2 +REFERENCES_DIR=$(mhcflurry-downloads path data_references) -python annotate.py "$INPUT" --out annotated_ms.csv - -exit 1 +python annotate.py \ + "$PEPTIDES" \ + "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \ + "${REFERENCES_DIR}/uniprot_proteins.fm" \ + --out annotated_ms.csv bzip2 annotated_ms.csv diff --git a/downloads-generation/data_mass_spec_annotated/annotate.py b/downloads-generation/data_mass_spec_annotated/annotate.py index 98cdabb076691f5ef6d564c834bdc17e79b9ea73..56b6b3a654252c270ec8c851753ed199cdfc00c6 100755 --- a/downloads-generation/data_mass_spec_annotated/annotate.py +++ b/downloads-generation/data_mass_spec_annotated/annotate.py @@ -3,94 +3,85 @@ import sys import argparse import os +import time import collections from six.moves import StringIO import pandas +import tqdm # progress bar +tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481 -import mhcnames - - -def normalize_allele_name(s): - try: - return mhcnames.normalize_allele_name(s) - except Exception: - return "UNKNOWN" +import shellinford parser = argparse.ArgumentParser(usage=__doc__) parser.add_argument( - "input_path", - help="Item to curate: PMID and list of files") + "peptides", + metavar="FILE.csv", + help="CSV of mass spec hits") +parser.add_argument( + "reference_csv", + metavar="FILE.csv", + help="CSV of protein sequences") +parser.add_argument( + "reference_index", + metavar="FILE.fm", + help="shellinford index over protein sequences") parser.add_argument( "--out", metavar="OUT.csv", help="Out file path") -# Build index -PREBUILT_INDEX = "datasets/uniprot-proteome_UP000005640.fasta.gz.fm" -USE_PREBUILT_INDEX = os.path.exists(PREBUILT_INDEX) -print("Using prebuilt index", USE_PREBUILT_INDEX) - -fm = shellinford.FMIndex() -if USE_PREBUILT_INDEX: - fm.read(PREBUILT_INDEX) - -fm_keys = [] -protein_database = "datasets/uniprot-proteome_UP000005640.fasta.gz" -start = time.time() -proteome_df = [] -with gzip.open(protein_database, "rt") as fd: - records = SeqIO.parse(fd, format='fasta') - for (i, record) in enumerate(records): - if i % 10000 == 0: - print(i, time.time() - start) - fm_keys.append(record.name) - proteome_df.append((record.name, record.description, str(record.seq))) - if not USE_PREBUILT_INDEX: - fm.push_back("$" + str(record.seq) + "$") # include sentinels - -if not USE_PREBUILT_INDEX: - print("Building") - start = time.time() - fm.build() - print("Done building", time.time() - start) - fm.write(PREBUILT_INDEX) - -proteome_df = pandas.DataFrame(proteome_df, columns=["name", "description", "seq"]).set_index("name") -proteome_df - -SEARCH_CACHE = {} -def search(peptide, fm=fm): - if peptide in SEARCH_CACHE: - return SEARCH_CACHE[peptide] - hits = fm.search(peptide) - result = proteome_df.iloc[ - [hit.doc_id for hit in hits] - ] - assert result.seq.str.contains(peptide).all(), (peptide, result) - names = result.index.tolist() - SEARCH_CACHE[peptide] = names - return names -print(search("SIINFEKL")) -print(search("AAAAAKVPA")) -print(search("AAAAALQAK")) -print(search("DEGPLDVSM")) +def run(): + args = parser.parse_args(sys.argv[1:]) + df = pandas.read_csv(args.peptides) + df["hit_id"] = "hit." + df.index.map(str) + df = df.set_index("hit_id") + print("Read peptides", df.shape, *df.columns.tolist()) + reference_df = pandas.read_csv(args.reference_csv, index_col=0) + reference_df = reference_df.set_index("accession") + print("Read proteins", reference_df.shape, *reference_df.columns.tolist()) + fm = shellinford.FMIndex() + fm.read(args.reference_index) + print("Read proteins index") -def run(): - args = parser.parse_args(sys.argv[1:]) + join_df = [] + for (hit_id, row) in tqdm.tqdm(df.iterrows(), total=len(df)): + matches = fm.search(row.peptide) + for match in matches: + join_df.append((hit_id, match.doc_id, len(matches))) - df = pandas.read_csv(args.input_path) - print("Read input", df.shape) + join_df = pandas.DataFrame( + join_df, + columns=["hit_id", "match_index", "num_proteins"], + ) - import ipdb ; ipdb.set_trace() + join_df["protein_accession"] = join_df.match_index.map( + reference_df.index.to_series().reset_index(drop=True)) - df.to_csv(args.out, index=False) + del join_df["match_index"] + + protein_cols = [ + c for c in reference_df.columns + if c not in ["name", "description", "seq"] + ] + for col in protein_cols: + join_df["protein_%s" % col] = join_df.protein_accession.map( + reference_df[col]) + + merged_df = pandas.merge( + join_df, + df, + how="left", + left_on="hit_id", + right_index=True) + + merged_df.to_csv(args.out, index=False) print("Wrote: %s" % os.path.abspath(args.out)) diff --git a/downloads-generation/data_mass_spec_annotated/requirements.txt b/downloads-generation/data_mass_spec_annotated/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..917704c491264bd5002fd01ca2647b7df40585b5 --- /dev/null +++ b/downloads-generation/data_mass_spec_annotated/requirements.txt @@ -0,0 +1,2 @@ +shellinford + diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml index 3e05c91e5b5ddf92045db2feff828e7dde51f132..e8499087b796430ae08f0b73d846e2a6886ec0ac 100644 --- a/mhcflurry/downloads.yml +++ b/mhcflurry/downloads.yml @@ -29,6 +29,10 @@ releases: - https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/models_class1_pan_unselected.20190924.tar.bz2.part.aa default: false + - name: data_references + url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2 + default: false + - name: data_iedb url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_iedb.20190916.tar.bz2 default: false