Skip to content
Snippets Groups Projects
Commit 48d6eed2 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

fixes

parent ab2ca5d0
No related merge requests found
...@@ -29,23 +29,33 @@ git status ...@@ -29,23 +29,33 @@ git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_DIR/curate.py . cp $SCRIPT_DIR/curate.py .
cp $SCRIPT_DIR/curate_by_pmid.py . cp $SCRIPT_DIR/curate_ms_by_pmid.py .
RAW_DIR="$(mhcflurry-downloads path data_published)/raw" MS_DIR="$(mhcflurry-downloads path data_published)/ms"
cp -r "$RAW_DIR" . cp -r "$MS_DIR" .
EXPRESSION_DIR="$(mhcflurry-downloads path data_published)/expression"
cp -r "$EXPRESSION_DIR" .
CURATE_BY_PMID_ARGS="" CURATE_BY_PMID_ARGS=""
for pmid in $(ls raw) for pmid in $(ls ms)
do
CURATE_BY_PMID_ARGS+=$(echo --ms-item $pmid ms/$pmid/* ' ')
done
for item in $(ls expression)
do do
CURATE_BY_PMID_ARGS+=$(echo --item $pmid raw/$pmid/* ' ') CURATE_BY_PMID_ARGS+=$(echo --expression-item $item expression/$item/* ' ')
done done
time python curate_by_pmid.py $CURATE_BY_PMID_ARGS \ time python curate_ms_by_pmid.py $CURATE_BY_PMID_ARGS \
--out nontraining_curated.by_pmid.csv --ms-out ms.nontraining_curated.by_pmid.csv \
--expression-out rna_expression.csv
bzip2 ms.nontraining_curated.by_pmid.csv
bzip2 rna_expression.csv
bzip2 nontraining_curated.by_pmid.csv
rm -rf raw rm -rf ms
# No mass-spec data # No mass-spec data
time python curate.py \ time python curate.py \
......
...@@ -33,7 +33,7 @@ wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.200 ...@@ -33,7 +33,7 @@ wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.200
wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.20130222.mhci.public.1.txt wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.20130222.mhci.public.1.txt
wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2013.mhci.public.blind.1.txt wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2013.mhci.public.blind.1.txt
mkdir raw mkdir ms
############################################ ############################################
# MS: Multiallelic class I # MS: Multiallelic class I
...@@ -43,10 +43,10 @@ mkdir raw ...@@ -43,10 +43,10 @@ mkdir raw
# Pearson, ..., Perreault JCI 2016 [PMID 27841757] # Pearson, ..., Perreault JCI 2016 [PMID 27841757]
# but was reanalyzed in this work, and we download the reanalyzed version here. # but was reanalyzed in this work, and we download the reanalyzed version here.
PMID=28832583 PMID=28832583
mkdir -p raw/$PMID mkdir -p ms/$PMID
wget -q https://doi.org/10.1371/journal.pcbi.1005725.s002 -P raw/$PMID # data generated in this work wget -q https://doi.org/10.1371/journal.pcbi.1005725.s002 -P ms/$PMID # data generated in this work
wget -q https://doi.org/10.1371/journal.pcbi.1005725.s003 -P raw/$PMID # data reanalyzed in this work wget -q https://doi.org/10.1371/journal.pcbi.1005725.s003 -P ms/$PMID # data reanalyzed in this work
cd raw/$PMID cd ms/$PMID
unzip *.s002 unzip *.s002
unzip *.s003 unzip *.s003
mkdir saved mkdir saved
...@@ -58,33 +58,33 @@ cd ../.. ...@@ -58,33 +58,33 @@ cd ../..
# Bassani-Sternberg, ..., Mann Mol Cell Proteomics 2015 [PMID 25576301] # Bassani-Sternberg, ..., Mann Mol Cell Proteomics 2015 [PMID 25576301]
PMID=25576301 PMID=25576301
mkdir -p raw/$PMID mkdir -p ms/$PMID
wget -q https://www.mcponline.org/highwire/filestream/35026/field_highwire_adjunct_files/7/mcp.M114.042812-4.xlsx -P raw/$PMID wget -q https://www.mcponline.org/highwire/filestream/35026/field_highwire_adjunct_files/7/mcp.M114.042812-4.xlsx -P ms/$PMID
# Mommen, ..., Heck PNAS 2014 [PMID 24616531] # Mommen, ..., Heck PNAS 2014 [PMID 24616531]
PMID=24616531 PMID=24616531
mkdir -p raw/$PMID mkdir -p ms/$PMID
wget -q https://www.pnas.org/highwire/filestream/615485/field_highwire_adjunct_files/1/sd01.xlsx -P raw/$PMID wget -q https://www.pnas.org/highwire/filestream/615485/field_highwire_adjunct_files/1/sd01.xlsx -P ms/$PMID
# Gloger, ..., Neri Cancer Immunol Immunother 2016 [PMID 27600516] # Gloger, ..., Neri Cancer Immunol Immunother 2016 [PMID 27600516]
# Data extracted from supplemental PDF table. # Data extracted from supplemental PDF table.
PMID=27600516 PMID=27600516
mkdir -p raw/$PMID mkdir -p ms/$PMID
wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/27600516.peptides.csv -P raw/$PMID wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/27600516.peptides.csv -P ms/$PMID
# Ritz, ..., Fugmann Proteomics 2016 [PMID 26992070] # Ritz, ..., Fugmann Proteomics 2016 [PMID 26992070]
# Supplemental zip downloaded from publication # Supplemental zip downloaded from publication
PMID=26992070 PMID=26992070
mkdir -p raw/$PMID mkdir -p ms/$PMID
wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/pmic12297-sup-0001-supinfo.zip -P raw/$PMID wget -q https://github.com/openvax/mhcflurry/releases/download/pan-dev1/pmic12297-sup-0001-supinfo.zip -P ms/$PMID
cd raw/$PMID cd ms/$PMID
unzip pmic12297-sup-0001-supinfo.zip unzip pmic12297-sup-0001-supinfo.zip
cd ../.. cd ../..
# Shraibman, ..., Admon Mol Cell Proteomics 2016 [PMID 27412690] # Shraibman, ..., Admon Mol Cell Proteomics 2016 [PMID 27412690]
PMID=27412690 PMID=27412690
mkdir -p raw/$PMID mkdir -p ms/$PMID
wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1/mcp.M116.060350-2.xlsx -P raw/$PMID wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1/mcp.M116.060350-2.xlsx -P ms/$PMID
# Pearson, ..., Perreault J Clin Invest 2016 [PMID 27841757] # Pearson, ..., Perreault J Clin Invest 2016 [PMID 27841757]
# Note: we do not use the original data from this publication, we use 28832583's reanalysis of it. # Note: we do not use the original data from this publication, we use 28832583's reanalysis of it.
...@@ -92,17 +92,59 @@ wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1 ...@@ -92,17 +92,59 @@ wget -q https://www.mcponline.org/lookup/suppl/doi:10.1074/mcp.M116.060350/-/DC1
# Hassan, ..., van Veelen Mol Cell Proteomics 2013 [PMID 23481700] # Hassan, ..., van Veelen Mol Cell Proteomics 2013 [PMID 23481700]
PMID=23481700 PMID=23481700
mkdir -p raw/$PMID mkdir -p ms/$PMID
wget -q https://www.mcponline.org/highwire/filestream/34681/field_highwire_adjunct_files/1/mcp.M112.024810-2.xls -P raw/$PMID wget -q https://www.mcponline.org/highwire/filestream/34681/field_highwire_adjunct_files/1/mcp.M112.024810-2.xls -P ms/$PMID
# Shraibman, ..., Admon Mol Cell Proteomics 2019 [PMID 31154438]
PMID=31154438
mkdir -p ms/$PMID
wget -q https://www.mcponline.org/highwire/filestream/51948/field_highwire_adjunct_files/3/zjw006195963st2.txt -P ms/$PMID
wget -q https://www.mcponline.org/highwire/filestream/51948/field_highwire_adjunct_files/1/zjw006195963st1.xlsx -P ms/$PMID
# Bassani-Sternberg, ..., Krackhardt Nature Comm. 2016 [PMID 27869121]
PMID=27869121
mkdir -p ms/$PMID
wget -q "https://static-content.springer.com/esm/art%3A10.1038%2Fncomms13404/MediaObjects/41467_2016_BFncomms13404_MOESM1318_ESM.xlsx" -P ms/$PMID
############################################ ############################################
# MS: Monoallelic class II # MS: Monoallelic class II
############################################ ############################################
# Abelin, ..., Rooney Immunity 2019 [PMID 31495665] # Abelin, ..., Rooney Immunity 2019 [PMID 31495665]
PMID=31495665 PMID=31495665
mkdir -p raw/$PMID mkdir -p ms/$PMID
wget -q https://ars.els-cdn.com/content/image/1-s2.0-S1074761319303632-mmc2.xlsx -P raw/$PMID wget -q https://ars.els-cdn.com/content/image/1-s2.0-S1074761319303632-mmc2.xlsx -P ms/$PMID
############################################
# RNA-seq expression data (TPMs)
############################################
# CCLE as processed by expression atlas
DATASET=expression-atlas-22460905
mkdir -p expression/$DATASET
wget -q https://www.ebi.ac.uk/gxa/experiments-content/E-MTAB-2770/resources/ExperimentDownloadSupplier.RnaSeqBaseline/tpms.tsv -P expression/$DATASET
# Human protein atlas
DATASET=human-protein-atlas
mkdir -p expression/$DATASET
cd expression/$DATASET
wget -q https://www.proteinatlas.org/download/rna_celline.tsv.zip
wget -q https://www.proteinatlas.org/download/rna_blood_cell_sample_tpm_m.tsv.zip
wget -q https://www.proteinatlas.org/download/rna_tissue_gtex.tsv.zip
for i in $(ls *.zip)
do
unzip $i
rm $i
done
cd ../..
# Melanoma. Original publication
# Barry, ..., Krummel Nature Medicine 2018 [PMID 29942093].
DATASET=GSE113126
mkdir -p expression/$DATASET
cd expression/$DATASET
wget -q "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE113126&format=file" -O GSE113126_RAW.tar
tar -xvf GSE113126_RAW.tar
rm GSE113126_RAW.tar
cd ../..
cp $SCRIPT_ABSOLUTE_PATH . cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt bzip2 LOG.txt
......
...@@ -62,15 +62,15 @@ if [ "$2" != "continue-incomplete" ] ...@@ -62,15 +62,15 @@ if [ "$2" != "continue-incomplete" ]
then then
cp $SCRIPT_DIR/generate_hyperparameters.production.py . cp $SCRIPT_DIR/generate_hyperparameters.production.py .
cp $SCRIPT_DIR/generate_hyperparameters.py . cp $SCRIPT_DIR/generate_hyperparameters.py .
python generate_hyperparameters.production.py > hyperparameters.production.json python generate_hyperparameters.production.py > hyperparameters.production.yaml
python generate_hyperparameters.py hyperparameters.production.json no_pretrain > hyperparameters.no_pretrain.yaml python generate_hyperparameters.py hyperparameters.production.yaml no_pretrain > hyperparameters.no_pretrain.yaml
python generate_hyperparameters.py hyperparameters.no_pretrain.yaml single_hidden > hyperparameters.single_hidden_no_pretrain.yaml python generate_hyperparameters.py hyperparameters.no_pretrain.yaml single_hidden > hyperparameters.single_hidden_no_pretrain.yaml
fi fi
for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence
do do
CONTINUE_INCOMPLETE_ARGS="" CONTINUE_INCOMPLETE_ARGS=""
if [ "$2" == "continue-incomplete" ] && [ -d "models.${kind}" ] if [ "$2" == "continue-incomplete" ] && [ -d "models.unselected.${kind}" ]
then then
echo "Will continue existing run: $kind" echo "Will continue existing run: $kind"
CONTINUE_INCOMPLETE_ARGS="--continue-incomplete" CONTINUE_INCOMPLETE_ARGS="--continue-incomplete"
......
...@@ -37,10 +37,6 @@ releases: ...@@ -37,10 +37,6 @@ releases:
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_mass_spec_annotated.20190930.tar.bz2 url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_mass_spec_annotated.20190930.tar.bz2
default: false default: false
- name: data_expression
url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_expression.20191009.tar.bz2
default: false
- name: data_references - name: data_references
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2 url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2
default: false default: false
...@@ -62,7 +58,7 @@ releases: ...@@ -62,7 +58,7 @@ releases:
default: false default: false
- name: data_published - name: data_published
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_published.20190924.tar.bz2 url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_published.20191011.tar.bz2
default: false default: false
- name: data_curated - name: data_curated
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment