From 92720e6b988fa3f3b0e2b9a6a8916a7b78a4a714 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Wed, 9 Oct 2019 12:36:55 -0400
Subject: [PATCH] Add data_expression download

---
 .../data_expression/GENERATE.sh | 57 +++++++++++++++++++
 mhcflurry/downloads.yml         |  4 ++
 2 files changed, 61 insertions(+)
 create mode 100755 downloads-generation/data_expression/GENERATE.sh

diff --git a/downloads-generation/data_expression/GENERATE.sh b/downloads-generation/data_expression/GENERATE.sh
new file mode 100755
index 00000000..c780b42f
--- /dev/null
+++ b/downloads-generation/data_expression/GENERATE.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+#
+# Download published gene expression data corresponding to some of our mass
+# spec datasets and package it, together with a copy of this script and a
+# log of the run, into a dated .tar.bz2 archive in the scratch directory.
+#
+set -e
+set -x
+
+DOWNLOAD_NAME=data_expression
+SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
+SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
+SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
+
+mkdir -p "$SCRATCH_DIR"
+rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
+mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
+
+# Send stdout and stderr to a logfile included with the archive.
+exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+
+date
+
+cd "$SCRATCH_DIR/$DOWNLOAD_NAME"
+
+# Sequencing data for many cell lines is available from SRA [access
+# required] (not downloaded by this script) at:
+# https://www.ebi.ac.uk/ega/studies/EGAS00001000610
+
+# CCLE cell lines
+DATASET=ccle
+mkdir "$DATASET"
+cd "$DATASET"
+wget -q https://data.broadinstitute.org/ccle/CCLE_RNAseq_rsem_genes_tpm_20180929.txt.gz
+wget -q https://data.broadinstitute.org/ccle/CCLE_miRNA_20181103.gct
+cd ..
+
+# B721.221. The URL must be quoted: a bare '&' would background wget.
+DATASET=b721221
+mkdir "$DATASET"
+cd "$DATASET"
+wget -q "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE93315&format=file"
+cd ..
+
+DATASET=pancan-xena
+mkdir "$DATASET"
+cd "$DATASET"
+wget -q https://pancanatlas.xenahubs.net/download/probeMap/hugo_gencode_good_hg19_V24lift37_probemap
+wget -q https://pancanatlas.xenahubs.net/download/EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena.gz
+cd ..
+
+cp "$SCRIPT_ABSOLUTE_PATH" .
+bzip2 LOG.txt
+RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
+tar -cjf "$RESULT" *
+echo "Created archive: $RESULT"
diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml
index 76a9360d..4089fb78 100644
--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -37,6 +37,10 @@ releases:
               url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_mass_spec_annotated.20190930.tar.bz2
               default: false
 
+            - name: data_expression
+              url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_expression.20191009.tar.bz2
+              default: false
+
             - name: data_references
               url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2
               default: false
-- 
GitLab