First cut of mass-spec support in released models

1a905bc8 · Tim O'Donnell · 1e8de93d · 1a905bc8 · 1a905bc8 · 1e8de93d
Commit 1a905bc8 authored 7 years ago by Tim O'Donnell
--- a/downloads-generation/data_curated/GENERATE.sh
+++ b/downloads-generation/data_curated/GENERATE.sh
@@ -34,10 +34,23 @@ time python curate.py \
    --data-iedb \
        "$(mhcflurry-downloads path data_iedb)/mhc_ligand_full.csv.bz2" \
    --data-kim2014 \
-        "$(mhcflurry-downloads path data_kim2014)/bdata.20130222.mhci.public.1.txt" \
-    --out-csv curated_training_data.csv
+        "$(mhcflurry-downloads path data_published)/bdata.20130222.mhci.public.1.txt" \
+    --out-csv curated_training_data.no_mass_spec.csv
+
+time python curate.py \
+    --data-iedb \
+        "$(mhcflurry-downloads path data_iedb)/mhc_ligand_full.csv.bz2" \
+    --data-kim2014 \
+        "$(mhcflurry-downloads path data_published)/bdata.20130222.mhci.public.1.txt" \
+    --data-systemhc-atlas \
+        "$(mhcflurry-downloads path data_systemhcatlas)/data.csv.bz2" \
+    --data-abelin-mass-spec \
+        "$(mhcflurry-downloads path data_published)/abelin2017.hits.csv.bz2" \
+    --out-csv curated_training_data.with_mass_spec.csv
+
+bzip2 curated_training_data.no_mass_spec.csv
+bzip2 curated_training_data.with_mass_spec.csv

-bzip2 curated_training_data.csv
 cp $SCRIPT_ABSOLUTE_PATH .
 bzip2 LOG.txt
 tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *

--- a/downloads-generation/data_curated/curate.py
+++ b/downloads-generation/data_curated/curate.py
@@ -29,14 +29,30 @@ parser.add_argument(
    action="append",
    default=[],
    help="Path to IEDB-style affinity data (e.g. mhc_ligand_full.csv)")
+parser.add_argument(
+    "--data-systemhc-atlas",
+    action="append",
+    default=[],
+    help="Path to systemhc-atlas-style mass-spec data")
+parser.add_argument(
+    "--data-abelin-mass-spec",
+    action="append",
+    default=[],
+    help="Path to Abelin Immunity 2017 mass-spec hits")
+parser.add_argument(
+    "--include-mass-spec",
+    action="store_true",
+    default=False,
+    help="Include mass-spec observations in IEDB")
+
 parser.add_argument(
    "--out-csv",
    required=True,
    help="Result file")

 QUALITATIVE_TO_AFFINITY_AND_INEQUALITY = {
-    "Negative": (20000.0, ">"),
-    "Positive": (500.0, "<"),
+    "Negative": (5000.0, ">"),
+    "Positive": (500.0, "<"),  # used for mass-spec hits
    "Positive-High": (100.0, "<"),
    "Positive-Intermediate": (1000.0, "<"),
    "Positive-Low": (5000.0, "<"),
@@ -76,7 +92,58 @@ def load_data_kim2014(filename):
    return df


-def load_data_iedb(iedb_csv, include_qualitative=True):
+def load_data_systemhc_atlas(filename, min_probability=0.99):
+    df = pandas.read_csv(filename)
+    print("Loaded systemhc atlas data: %s" % str(df.shape))
+
+    df["measurement_source"] = "systemhc-atlas"
+    df["measurement_value"] = QUALITATIVE_TO_AFFINITY["Positive"]
+    df["measurement_inequality"] = "<"
+    df["measurement_type"] = "qualitative"
+    df["original_allele"] = df.top_allele
+    df["peptide"] = df.search_hit
+    df["allele"] = df.top_allele.map(normalize_allele_name)
+
+    print("Dropping un-parseable alleles: %s" % ", ".join(
+        str(x) for x in df.ix[df.allele == "UNKNOWN"]["top_allele"].unique()))
+    df = df.loc[df.allele != "UNKNOWN"]
+    print("Systemhc atlas data now: %s" % str(df.shape))
+
+    print("Dropping data points with probability < %f" % min_probability)
+    df = df.loc[df.prob >= min_probability]
+    print("Systemhc atlas data now: %s" % str(df.shape))
+
+    print("Removing duplicates")
+    df = df.drop_duplicates(["allele", "peptide"])
+    print("Systemhc atlas data now: %s" % str(df.shape))
+
+    return df
+
+
+def load_data_abelin_mass_spec(filename):
+    df = pandas.read_csv(filename)
+    print("Loaded Abelin mass-spec data: %s" % str(df.shape))
+
+    df["measurement_source"] = "abelin-mass-spec"
+    df["measurement_value"] = QUALITATIVE_TO_AFFINITY["Positive"]
+    df["measurement_inequality"] = "<"
+    df["measurement_type"] = "qualitative"
+    df["original_allele"] = df.allele
+    df["allele"] = df.original_allele.map(normalize_allele_name)
+
+    print("Dropping un-parseable alleles: %s" % ", ".join(
+        str(x) for x in df.ix[df.allele == "UNKNOWN"]["allele"].unique()))
+    df = df.loc[df.allele != "UNKNOWN"]
+    print("Abelin mass-spec data now: %s" % str(df.shape))
+
+    print("Removing duplicates")
+    df = df.drop_duplicates(["allele", "peptide"])
+    print("Abelin mass-spec data now: %s" % str(df.shape))
+
+    return df
+
+
+def load_data_iedb(iedb_csv, include_qualitative=True, include_mass_spec=False):
    iedb_df = pandas.read_csv(iedb_csv, skiprows=1, low_memory=False)
    print("Loaded iedb data: %s" % str(iedb_df.shape))

@@ -110,9 +177,10 @@ def load_data_iedb(iedb_csv, include_qualitative=True):
    qualitative = iedb_df.ix[iedb_df["Units"] != "nM"].copy()
    qualitative["measurement_type"] = "qualitative"
    print("Qualitative measurements: %d" % len(qualitative))
-    #qualitative = qualitative.ix[
-    #    (~qualitative["Method/Technique"].str.contains("mass spec"))
-    #].copy()
+    if not include_mass_spec:
+        qualitative = qualitative.ix[
+            (~qualitative["Method/Technique"].str.contains("mass spec"))
+        ].copy()

    qualitative["Quantitative measurement"] = (
        qualitative["Qualitative Measure"].map(QUALITATIVE_TO_AFFINITY))
@@ -169,7 +237,7 @@ def run():

    dfs = []
    for filename in args.data_iedb:
-        df = load_data_iedb(filename)
+        df = load_data_iedb(filename, include_mass_spec=args.include_mass_spec)
        dfs.append(df)
    for filename in args.data_kim2014:
        df = load_data_kim2014(filename)
@@ -185,8 +253,20 @@ def run():
            ]
            print("Kim2014 data now: %s" % str(df.shape))
        dfs.append(df)
+    for filename in args.data_systemhc_atlas:
+        df = load_data_systemhc_atlas(filename)
+        dfs.append(df)
+    for filename in args.data_abelin_mass_spec:
+        df = load_data_abelin_mass_spec(filename)
+        dfs.append(df)

    df = pandas.concat(dfs, ignore_index=True)
+    print("Combined df: %s" % (str(df.shape)))
+
+    print("Removing combined duplicates")
+    df = df.drop_duplicates(["allele", "peptide", "measurement_value"])
+    print("New combined df: %s" % (str(df.shape)))
+
    df = df[[
        "allele",
        "peptide",
@@ -197,7 +277,7 @@ def run():
        "original_allele",
    ]].sort_values(["allele", "peptide"]).dropna()

-    print("Combined df: %s" % (str(df.shape)))
+    print("Final combined df: %s" % (str(df.shape)))

    df.to_csv(args.out_csv, index=False)
    print("Wrote: %s" % args.out_csv)

--- a/downloads-generation/data_kim2014/README.md
+++ b/downloads-generation/data_kim2014/README.md
-# Kim 2014 Data
-
-This download contains the BD2009, BD2013, and BLIND datasets from [Dataset size and composition impact the reliability of performance benchmarks for peptide-MHC binding predictions](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241). BD2013 (augmented with more recent data from IEDB) are used to train the production MHCflurry models. BD2009 and BLIND are useful for performing validation on held-out data.
-
-These files are available on dropbox here:
-
- * https://dl.dropboxusercontent.com/u/3967524/bdata.2009.mhci.public.1.txt
- * https://dl.dropboxusercontent.com/u/3967524/bdata.20130222.mhci.public.1.txt
- * https://dl.dropboxusercontent.com/u/3967524/bdata.2013.mhci.public.blind.1.txt
-
-To generate this download run:
-
-```
-./GENERATE.sh
-```
\ No newline at end of file
--- a/downloads-generation/data_published/GENERATE.sh
+++ b/downloads-generation/data_published/GENERATE.sh
+#!/bin/bash
+#
+# Download some published MHC I ligand data
+#
+#
+set -e
+set -x
+
+DOWNLOAD_NAME=data_published
+SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
+SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
+
+mkdir -p "$SCRATCH_DIR"
+rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
+mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
+
+# Send stdout and stderr to a logfile included with the archive.
+exec >  >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+
+# Log some environment info
+date
+pip freeze
+# git rev-parse HEAD
+git status
+
+cd $SCRATCH_DIR/$DOWNLOAD_NAME
+
+# Download kim2014 data
+wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2009.mhci.public.1.txt
+wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.20130222.mhci.public.1.txt
+wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2013.mhci.public.blind.1.txt
+
+# Download abelin et al 2017 data
+wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/abelin2017.hits.csv.bz2
+
+cp $SCRIPT_ABSOLUTE_PATH .
+bzip2 LOG.txt
+tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
+
+echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
--- a/downloads-generation/data_published/README.md
+++ b/downloads-generation/data_published/README.md
+# Published datasets
+
+These datasets are derived from publications and do not change.
+
+To generate this download run:
+
+```
+./GENERATE.sh
+```
+
+## Kim 2014
+
+This download contains the BD2009, BD2013, and BLIND datasets from
+[Dataset size and composition impact the reliability of performance benchmarks for peptide-MHC binding predictions](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241).
+
+BD2013 (augmented with more recent data from IEDB) is used to train the production
+MHCflurry models. BD2009 and BLIND are useful for performing validation on held-out data.
+
+
+## Abelin et al. Immunity 2017
+
+This download contains the peptides identified in
+[Mass Spectrometry Profiling of HLA-Associated Peptidomes in Mono-allelic Cells Enables More Accurate Epitope Prediction](https://www.ncbi.nlm.nih.gov/pubmed/28228285).
+
--- a/downloads-generation/data_kim2014/GENERATE.sh
+++ b/downloads-generation/data_systemhcatlas/GENERATE.sh
 #!/bin/bash
 #
-# Download some published MHC I ligand data from a location on Dropbox.
+# Download some published MHC I ligands identified by mass-spec
 #
 #
 set -e
 set -x

-DOWNLOAD_NAME=data_kim2014
+DOWNLOAD_NAME=data_systemhcatlas
 SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
 SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"

@@ -26,9 +26,9 @@ git status

 cd $SCRATCH_DIR/$DOWNLOAD_NAME

-wget --quiet https://dl.dropboxusercontent.com/u/3967524/bdata.2009.mhci.public.1.txt
-wget --quiet https://dl.dropboxusercontent.com/u/3967524/bdata.20130222.mhci.public.1.txt
-wget --quiet https://dl.dropboxusercontent.com/u/3967524/bdata.2013.mhci.public.blind.1.txt
+wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/systemhc.20171121.combined.csv.bz2
+
+mv systemhc.20171121.combined.csv.bz2 data.csv.bz2

 cp $SCRIPT_ABSOLUTE_PATH .
 bzip2 LOG.txt

--- a/downloads-generation/data_systemhcatlas/README.md
+++ b/downloads-generation/data_systemhcatlas/README.md
+# SysteMHC database dump
+
+This is a data dump of the [SysteMHC Atlas](https://systemhcatlas.org/) obtained
+through personal communication. It is distributed under the ODC Open Database License.
+
+To generate this download run:
+
+```
+./GENERATE.sh
+```
\ No newline at end of file
--- a/downloads-generation/models_class1/GENERATE.sh
+++ b/downloads-generation/models_class1/GENERATE.sh
@@ -32,7 +32,7 @@ mkdir models
 cp $SCRIPT_DIR/hyperparameters.yaml .

 time mhcflurry-class1-train-allele-specific-models \
-    --data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \
+    --data "$(mhcflurry-downloads path data_curated)/curated_training_data.no_mass_spec.csv.bz2" \
    --hyperparameters hyperparameters.yaml \
    --out-models-dir models \
    --percent-rank-calibration-num-peptides-per-length 1000000 \

--- a/downloads-generation/models_class1/README.md
+++ b/downloads-generation/models_class1/README.md
--- a/downloads-generation/models_class1/hyperparameters.test.json
+++ b/downloads-generation/models_class1/hyperparameters.test.json
--- a/downloads-generation/models_class1/hyperparameters.yaml
+++ b/downloads-generation/models_class1/hyperparameters.yaml
--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -20,7 +20,7 @@ releases:
    1.1.0:
        compatibility-version: 2
        downloads:
-            - name: models_class1
+            - name: models_class1_no_mass_spec
              url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.1/models_class1.20180116.tar.bz2
              default: true

@@ -36,12 +36,16 @@ releases:
              url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_iedb.tar.bz2
              default: false

-            - name: data_kim2014
-              url: http://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_kim2014.tar.bz2
+            - name: data_published
+              url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.1/data_published.tar.bz2
+              default: false
+
+            - name: data_systemhcatlas
+              url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.1/data_systemhcatlas.tar.bz2
              default: false

            - name: data_curated
-              url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_curated.tar.bz2
+              url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.1/data_curated.tar.bz2
              default: true

    1.0.0:

--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@ try:
    import pypandoc
    readme = pypandoc.convert(readme, to='rst', format='md')
 except:
-    logging.warn("Conversion of long_description from MD to RST failed")
+    logging.warning("Conversion of long_description from MD to RST failed")
    pass

 with open('mhcflurry/__init__.py', 'r') as f: