diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh index f0a3d54aece76755ab92f1a6764243f014f112b4..1f2b7067488963912dea57932ff60f36fa55ac42 100755 --- a/downloads-generation/data_curated/GENERATE.sh +++ b/downloads-generation/data_curated/GENERATE.sh @@ -34,10 +34,23 @@ time python curate.py \ --data-iedb \ "$(mhcflurry-downloads path data_iedb)/mhc_ligand_full.csv.bz2" \ --data-kim2014 \ - "$(mhcflurry-downloads path data_kim2014)/bdata.20130222.mhci.public.1.txt" \ - --out-csv curated_training_data.csv + "$(mhcflurry-downloads path data_published)/bdata.20130222.mhci.public.1.txt" \ + --out-csv curated_training_data.no_mass_spec.csv + +time python curate.py \ + --data-iedb \ + "$(mhcflurry-downloads path data_iedb)/mhc_ligand_full.csv.bz2" \ + --data-kim2014 \ + "$(mhcflurry-downloads path data_published)/bdata.20130222.mhci.public.1.txt" \ + --data-systemhc-atlas \ + "$(mhcflurry-downloads path data_systemhcatlas)/data.csv.bz2" \ + --data-abelin-mass-spec \ + "$(mhcflurry-downloads path data_published)/abelin2017.hits.csv.bz2" \ + --out-csv curated_training_data.with_mass_spec.csv + +bzip2 curated_training_data.no_mass_spec.csv +bzip2 curated_training_data.with_mass_spec.csv -bzip2 curated_training_data.csv cp $SCRIPT_ABSOLUTE_PATH . bzip2 LOG.txt tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * diff --git a/downloads-generation/data_curated/curate.py b/downloads-generation/data_curated/curate.py index 2650a5bb67c4d19d11d506318c2afe1115f7fc27..fbb503a471cf91b36b58e089435794deee1946a2 100755 --- a/downloads-generation/data_curated/curate.py +++ b/downloads-generation/data_curated/curate.py @@ -29,14 +29,30 @@ parser.add_argument( action="append", default=[], help="Path to IEDB-style affinity data (e.g. 
def load_data_systemhc_atlas(filename, min_probability=0.99):
    """
    Load SysteMHC Atlas mass-spec hits as qualitative positive measurements.

    Every row is recorded as a qualitative observation whose value is
    QUALITATIVE_TO_AFFINITY["Positive"] with a "<" inequality.

    Parameters
    ----------
    filename : string
        Path to a CSV file readable by pandas.read_csv. The columns
        `top_allele`, `search_hit` and `prob` are read.

    min_probability : float
        Rows whose `prob` is below this value are dropped.

    Returns
    -------
    pandas.DataFrame
        The surviving input rows, at most one per (allele, peptide), with
        these columns added or overwritten: measurement_source,
        measurement_value, measurement_inequality, measurement_type,
        original_allele, peptide and allele.
    """
    df = pandas.read_csv(filename)
    print("Loaded systemhc atlas data: %s" % str(df.shape))

    df["measurement_source"] = "systemhc-atlas"
    # NOTE(review): QUALITATIVE_TO_AFFINITY is defined outside the visible
    # hunks; presumably it is derived from
    # QUALITATIVE_TO_AFFINITY_AND_INEQUALITY - confirm.
    df["measurement_value"] = QUALITATIVE_TO_AFFINITY["Positive"]
    df["measurement_inequality"] = "<"
    df["measurement_type"] = "qualitative"
    df["original_allele"] = df.top_allele
    df["peptide"] = df.search_hit
    df["allele"] = df.top_allele.map(normalize_allele_name)

    # Rows whose normalized allele is the literal "UNKNOWN" are discarded.
    # The names are reported as they appeared in the input file.
    unparseable = df.allele == "UNKNOWN"
    print("Dropping un-parseable alleles: %s" % ", ".join(
        str(x) for x in df.loc[unparseable, "top_allele"].unique()))
    df = df.loc[~unparseable]
    print("Systemhc atlas data now: %s" % str(df.shape))

    print("Dropping data points with probability < %f" % min_probability)
    df = df.loc[df.prob >= min_probability]
    print("Systemhc atlas data now: %s" % str(df.shape))

    # Keep the first row seen for each (allele, peptide) pair.
    print("Removing duplicates")
    df = df.drop_duplicates(["allele", "peptide"])
    print("Systemhc atlas data now: %s" % str(df.shape))

    return df


def load_data_abelin_mass_spec(filename):
    """
    Load Abelin et al. mass-spec hits as qualitative positive measurements.

    Every row is recorded as a qualitative observation whose value is
    QUALITATIVE_TO_AFFINITY["Positive"] with a "<" inequality.

    Parameters
    ----------
    filename : string
        Path to a CSV file readable by pandas.read_csv. The columns `allele`
        and `peptide` are read.

    Returns
    -------
    pandas.DataFrame
        The surviving input rows, at most one per (allele, peptide), with
        these columns added or overwritten: measurement_source,
        measurement_value, measurement_inequality, measurement_type,
        original_allele and allele.
    """
    df = pandas.read_csv(filename)
    print("Loaded Abelin mass-spec data: %s" % str(df.shape))

    df["measurement_source"] = "abelin-mass-spec"
    df["measurement_value"] = QUALITATIVE_TO_AFFINITY["Positive"]
    df["measurement_inequality"] = "<"
    df["measurement_type"] = "qualitative"
    # Preserve the name from the file before `allele` is overwritten with
    # its normalized form.
    df["original_allele"] = df.allele
    df["allele"] = df.original_allele.map(normalize_allele_name)

    # Report the names as they appeared in the input file. The `allele`
    # column is "UNKNOWN" for every row being dropped, so printing it would
    # say nothing about which input names failed to parse.
    unparseable = df.allele == "UNKNOWN"
    print("Dropping un-parseable alleles: %s" % ", ".join(
        str(x) for x in df.loc[unparseable, "original_allele"].unique()))
    df = df.loc[~unparseable]
    print("Abelin mass-spec data now: %s" % str(df.shape))

    # Keep the first row seen for each (allele, peptide) pair.
    print("Removing duplicates")
    df = df.drop_duplicates(["allele", "peptide"])
    print("Abelin mass-spec data now: %s" % str(df.shape))

    return df
dfs.append(df) + for filename in args.data_abelin_mass_spec: + df = load_data_abelin_mass_spec(filename) + dfs.append(df) df = pandas.concat(dfs, ignore_index=True) + print("Combined df: %s" % (str(df.shape))) + + print("Removing combined duplicates") + df = df.drop_duplicates(["allele", "peptide", "measurement_value"]) + print("New combined df: %s" % (str(df.shape))) + df = df[[ "allele", "peptide", @@ -197,7 +277,7 @@ def run(): "original_allele", ]].sort_values(["allele", "peptide"]).dropna() - print("Combined df: %s" % (str(df.shape))) + print("Final combined df: %s" % (str(df.shape))) df.to_csv(args.out_csv, index=False) print("Wrote: %s" % args.out_csv) diff --git a/downloads-generation/data_kim2014/README.md b/downloads-generation/data_kim2014/README.md deleted file mode 100644 index bf42e01ccade69b63172d342c92a41d9e0497dcb..0000000000000000000000000000000000000000 --- a/downloads-generation/data_kim2014/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# Kim 2014 Data - -This download contains the BD2009, BD2013, and BLIND datasets from [Dataset size and composition impact the reliability of performance benchmarks for peptide-MHC binding predictions](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241). BD2013 (augmented with more recent data from IEDB) are used to train the production MHCflurry models. BD2009 and BLIND are useful for performing validation on held-out data. 
- -These files are available on dropbox here: - - * https://dl.dropboxusercontent.com/u/3967524/bdata.2009.mhci.public.1.txt - * https://dl.dropboxusercontent.com/u/3967524/bdata.20130222.mhci.public.1.txt - * https://dl.dropboxusercontent.com/u/3967524/bdata.2013.mhci.public.blind.1.txt - -To generate this download run: - -``` -./GENERATE.sh -``` \ No newline at end of file diff --git a/downloads-generation/data_published/GENERATE.sh b/downloads-generation/data_published/GENERATE.sh new file mode 100755 index 0000000000000000000000000000000000000000..916a901e160340f03f2039a640c44bf2460bff55 --- /dev/null +++ b/downloads-generation/data_published/GENERATE.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# +# Download some published MHC I ligand data +# +# +set -e +set -x + +DOWNLOAD_NAME=data_published +SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation +SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" + +mkdir -p "$SCRATCH_DIR" +rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" +mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" + +# Send stdout and stderr to a logfile included with the archive. +exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") +exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) + +# Log some environment info +date +pip freeze +# git rev-parse HEAD +git status + +cd $SCRATCH_DIR/$DOWNLOAD_NAME + +# Download kim2014 data +wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2009.mhci.public.1.txt +wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.20130222.mhci.public.1.txt +wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2013.mhci.public.blind.1.txt + +# Download abelin et al 2017 data +wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/abelin2017.hits.csv.bz2 + +cp $SCRIPT_ABSOLUTE_PATH . 
+bzip2 LOG.txt +tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * + +echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2" diff --git a/downloads-generation/data_published/README.md b/downloads-generation/data_published/README.md new file mode 100644 index 0000000000000000000000000000000000000000..807adbef7bc6a293c4bd71f5fe1af13f33c47e5d --- /dev/null +++ b/downloads-generation/data_published/README.md @@ -0,0 +1,24 @@ +# Published datasets + +These datasets are derived from publications and do not change. + +To generate this download run: + +``` +./GENERATE.sh +``` + +## Kim 2014 + +This download contains the BD2009, BD2013, and BLIND datasets from +[Dataset size and composition impact the reliability of performance benchmarks for peptide-MHC binding predictions](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241). + +BD2013 (augmented with more recent data from IEDB) is used to train the production +MHCflurry models. BD2009 and BLIND are useful for performing validation on held-out data. + + +## Abelin et al. Immunity 2017 + +This download contains the peptides identified in +[Mass Spectrometry Profiling of HLA-Associated Peptidomes in Mono-allelic Cells Enables More Accurate Epitope Prediction](https://www.ncbi.nlm.nih.gov/pubmed/28228285). + diff --git a/downloads-generation/data_kim2014/GENERATE.sh b/downloads-generation/data_systemhcatlas/GENERATE.sh similarity index 66% rename from downloads-generation/data_kim2014/GENERATE.sh rename to downloads-generation/data_systemhcatlas/GENERATE.sh index dbda0fe8c12df076e5e8f3b96728eefda51f23c6..1558409c2b981591681ae726a641e51283935ad9 100755 --- a/downloads-generation/data_kim2014/GENERATE.sh +++ b/downloads-generation/data_systemhcatlas/GENERATE.sh @@ -1,12 +1,12 @@ #!/bin/bash # -# Download some published MHC I ligand data from a location on Dropbox. 
+# Download some published MHC I ligands identified by mass-spec # # set -e set -x -DOWNLOAD_NAME=data_kim2014 +DOWNLOAD_NAME=data_systemhcatlas SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" @@ -26,9 +26,9 @@ git status cd $SCRATCH_DIR/$DOWNLOAD_NAME -wget --quiet https://dl.dropboxusercontent.com/u/3967524/bdata.2009.mhci.public.1.txt -wget --quiet https://dl.dropboxusercontent.com/u/3967524/bdata.20130222.mhci.public.1.txt -wget --quiet https://dl.dropboxusercontent.com/u/3967524/bdata.2013.mhci.public.blind.1.txt +wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/systemhc.20171121.combined.csv.bz2 + +mv systemhc.20171121.combined.csv.bz2 data.csv.bz2 cp $SCRIPT_ABSOLUTE_PATH . bzip2 LOG.txt diff --git a/downloads-generation/data_systemhcatlas/README.md b/downloads-generation/data_systemhcatlas/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5e66d9af8c400d8cceeaa4762bbb43bbea0493f8 --- /dev/null +++ b/downloads-generation/data_systemhcatlas/README.md @@ -0,0 +1,10 @@ +# SysteMHC database dump + +This is a data dump of the [SysteMHC Atlas](https://systemhcatlas.org/) provided +by personal communication. It is distributed under the ODC Open Database License. + +To generate this download run: + +``` +./GENERATE.sh +``` \ No newline at end of file diff --git a/downloads-generation/models_class1/GENERATE.sh b/downloads-generation/models_class1_no_mass_spec/GENERATE.sh similarity index 97% rename from downloads-generation/models_class1/GENERATE.sh rename to downloads-generation/models_class1_no_mass_spec/GENERATE.sh index b72334b536cca453300b52257eb8e9c65c7e0dd3..3b80f704deb4975a543d030a437e92bb3afd0265 100755 --- a/downloads-generation/models_class1/GENERATE.sh +++ b/downloads-generation/models_class1_no_mass_spec/GENERATE.sh @@ -32,7 +32,7 @@ mkdir models cp $SCRIPT_DIR/hyperparameters.yaml . 
time mhcflurry-class1-train-allele-specific-models \ - --data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \ + --data "$(mhcflurry-downloads path data_curated)/curated_training_data.no_mass_spec.csv.bz2" \ --hyperparameters hyperparameters.yaml \ --out-models-dir models \ --percent-rank-calibration-num-peptides-per-length 1000000 \ diff --git a/downloads-generation/models_class1/README.md b/downloads-generation/models_class1_no_mass_spec/README.md similarity index 100% rename from downloads-generation/models_class1/README.md rename to downloads-generation/models_class1_no_mass_spec/README.md diff --git a/downloads-generation/models_class1/hyperparameters.test.json b/downloads-generation/models_class1_no_mass_spec/hyperparameters.test.json similarity index 100% rename from downloads-generation/models_class1/hyperparameters.test.json rename to downloads-generation/models_class1_no_mass_spec/hyperparameters.test.json diff --git a/downloads-generation/models_class1/hyperparameters.yaml b/downloads-generation/models_class1_no_mass_spec/hyperparameters.yaml similarity index 100% rename from downloads-generation/models_class1/hyperparameters.yaml rename to downloads-generation/models_class1_no_mass_spec/hyperparameters.yaml diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml index 70fc552e4178aa6aaf45830be13ebea69e6b2386..5b0be59fcb2c3dce4e224df58155ffc52f5bda97 100644 --- a/mhcflurry/downloads.yml +++ b/mhcflurry/downloads.yml @@ -20,7 +20,7 @@ releases: 1.1.0: compatibility-version: 2 downloads: - - name: models_class1 + - name: models_class1_no_mass_spec url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.1/models_class1.20180116.tar.bz2 default: true @@ -36,12 +36,16 @@ releases: url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_iedb.tar.bz2 default: false - - name: data_kim2014 - url: http://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_kim2014.tar.bz2 + - name: 
data_published + url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.1/data_published.tar.bz2 + default: false + + - name: data_systemhcatlas + url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.1/data_systemhcatlas.tar.bz2 default: false - name: data_curated - url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_curated.tar.bz2 + url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.1/data_curated.tar.bz2 default: true 1.0.0: diff --git a/setup.py b/setup.py index 0a2f0e49f105d8b18278eb7031ec1e8d163c32ba..a66f0d8255cb82c5f978d1d019004797dd1f11b9 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ try: import pypandoc readme = pypandoc.convert(readme, to='rst', format='md') except: - logging.warn("Conversion of long_description from MD to RST failed") + logging.warning("Conversion of long_description from MD to RST failed") pass with open('mhcflurry/__init__.py', 'r') as f: