Skip to content
Snippets Groups Projects
Commit 1a905bc8 authored by Tim O'Donnell
Browse files

First cut of mass-spec support in released models

parent 1e8de93d
No related merge requests found
Showing
with 194 additions and 37 deletions
......@@ -34,10 +34,23 @@ time python curate.py \
--data-iedb \
"$(mhcflurry-downloads path data_iedb)/mhc_ligand_full.csv.bz2" \
--data-kim2014 \
"$(mhcflurry-downloads path data_kim2014)/bdata.20130222.mhci.public.1.txt" \
--out-csv curated_training_data.csv
"$(mhcflurry-downloads path data_published)/bdata.20130222.mhci.public.1.txt" \
--out-csv curated_training_data.no_mass_spec.csv
time python curate.py \
--data-iedb \
"$(mhcflurry-downloads path data_iedb)/mhc_ligand_full.csv.bz2" \
--data-kim2014 \
"$(mhcflurry-downloads path data_published)/bdata.20130222.mhci.public.1.txt" \
--data-systemhc-atlas \
"$(mhcflurry-downloads path data_systemhcatlas)/data.csv.bz2" \
--data-abelin-mass-spec \
"$(mhcflurry-downloads path data_published)/abelin2017.hits.csv.bz2" \
--out-csv curated_training_data.with_mass_spec.csv
bzip2 curated_training_data.no_mass_spec.csv
bzip2 curated_training_data.with_mass_spec.csv
bzip2 curated_training_data.csv
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
......
......@@ -29,14 +29,30 @@ parser.add_argument(
action="append",
default=[],
help="Path to IEDB-style affinity data (e.g. mhc_ligand_full.csv)")
# Mass-spec inputs. Both flags use action="append" with an empty-list default,
# so each may be passed several times and the paths accumulate in a list.
parser.add_argument(
"--data-systemhc-atlas",
action="append",
default=[],
help="Path to systemhc-atlas-style mass-spec data")
parser.add_argument(
"--data-abelin-mass-spec",
action="append",
default=[],
help="Path to Abelin Immunity 2017 mass-spec hits")
# Boolean switch, off unless given on the command line. run() forwards it to
# load_data_iedb(include_mass_spec=...), which otherwise filters out qualitative
# IEDB rows whose "Method/Technique" mentions "mass spec".
parser.add_argument(
"--include-mass-spec",
action="store_true",
default=False,
help="Include mass-spec observations in IEDB")
# Destination of the combined, curated table (mandatory).
parser.add_argument(
"--out-csv",
required=True,
help="Result file")
QUALITATIVE_TO_AFFINITY_AND_INEQUALITY = {
"Negative": (20000.0, ">"),
"Positive": (500.0, "<"),
"Negative": (5000.0, ">"),
"Positive": (500.0, "<"), # used for mass-spec hits
"Positive-High": (100.0, "<"),
"Positive-Intermediate": (1000.0, "<"),
"Positive-Low": (5000.0, "<"),
......@@ -76,7 +92,58 @@ def load_data_kim2014(filename):
return df
def load_data_iedb(iedb_csv, include_qualitative=True):
def load_data_systemhc_atlas(filename, min_probability=0.99):
    """
    Load mass-spec hits from a SysteMHC Atlas-style CSV.

    Every row is treated as a qualitative "Positive" observation: it is
    assigned the affinity that QUALITATIVE_TO_AFFINITY maps "Positive" to,
    with a "<" inequality.

    Parameters
    ----------
    filename : str
        Path to the CSV (anything pandas.read_csv accepts). The columns
        ``top_allele``, ``search_hit`` and ``prob`` are read.
    min_probability : float
        Rows whose ``prob`` is below this threshold are discarded.

    Returns
    -------
    pandas.DataFrame
        The input rows plus the columns measurement_source,
        measurement_value, measurement_inequality, measurement_type,
        original_allele, peptide and allele, restricted to parseable
        alleles, rows with prob >= min_probability, and one row per
        (allele, peptide) pair.
    """
    df = pandas.read_csv(filename)
    print("Loaded systemhc atlas data: %s" % str(df.shape))

    df["measurement_source"] = "systemhc-atlas"
    df["measurement_value"] = QUALITATIVE_TO_AFFINITY["Positive"]
    df["measurement_inequality"] = "<"
    df["measurement_type"] = "qualitative"
    df["original_allele"] = df.top_allele
    df["peptide"] = df.search_hit
    # Alleles that cannot be parsed are compared against "UNKNOWN" below;
    # presumably normalize_allele_name returns that sentinel -- confirm.
    df["allele"] = df.top_allele.map(normalize_allele_name)

    # .loc replaces DataFrame.ix, which was removed in pandas 1.0.
    unparseable = df.loc[df.allele == "UNKNOWN", "top_allele"].unique()
    print("Dropping un-parseable alleles: %s" % ", ".join(
        str(x) for x in unparseable))
    df = df.loc[df.allele != "UNKNOWN"]
    print("Systemhc atlas data now: %s" % str(df.shape))

    print("Dropping data points with probability < %f" % min_probability)
    df = df.loc[df.prob >= min_probability]
    print("Systemhc atlas data now: %s" % str(df.shape))

    print("Removing duplicates")
    df = df.drop_duplicates(["allele", "peptide"])
    print("Systemhc atlas data now: %s" % str(df.shape))

    return df
def load_data_abelin_mass_spec(filename):
    """
    Load mass-spec hits from an Abelin et al. (Immunity 2017) style CSV.

    Every row is treated as a qualitative "Positive" observation: it is
    assigned the affinity that QUALITATIVE_TO_AFFINITY maps "Positive" to,
    with a "<" inequality.

    Parameters
    ----------
    filename : str
        Path to the CSV (anything pandas.read_csv accepts). The columns
        ``allele`` and ``peptide`` are read.

    Returns
    -------
    pandas.DataFrame
        The input rows plus the columns measurement_source,
        measurement_value, measurement_inequality, measurement_type and
        original_allele, with ``allele`` replaced by its normalized form,
        restricted to parseable alleles and one row per (allele, peptide)
        pair.
    """
    df = pandas.read_csv(filename)
    print("Loaded Abelin mass-spec data: %s" % str(df.shape))

    df["measurement_source"] = "abelin-mass-spec"
    df["measurement_value"] = QUALITATIVE_TO_AFFINITY["Positive"]
    df["measurement_inequality"] = "<"
    df["measurement_type"] = "qualitative"
    # Keep the raw name before "allele" is overwritten with the normalized one.
    df["original_allele"] = df.allele
    # Alleles that cannot be parsed are compared against "UNKNOWN" below;
    # presumably normalize_allele_name returns that sentinel -- confirm.
    df["allele"] = df.original_allele.map(normalize_allele_name)

    # Report the *original* names of the rows being dropped: the "allele"
    # column has already been normalized, so it would only ever show
    # "UNKNOWN" here. .loc replaces DataFrame.ix (removed in pandas 1.0).
    unparseable = df.loc[df.allele == "UNKNOWN", "original_allele"].unique()
    print("Dropping un-parseable alleles: %s" % ", ".join(
        str(x) for x in unparseable))
    df = df.loc[df.allele != "UNKNOWN"]
    print("Abelin mass-spec data now: %s" % str(df.shape))

    print("Removing duplicates")
    df = df.drop_duplicates(["allele", "peptide"])
    print("Abelin mass-spec data now: %s" % str(df.shape))

    return df
def load_data_iedb(iedb_csv, include_qualitative=True, include_mass_spec=False):
iedb_df = pandas.read_csv(iedb_csv, skiprows=1, low_memory=False)
print("Loaded iedb data: %s" % str(iedb_df.shape))
......@@ -110,9 +177,10 @@ def load_data_iedb(iedb_csv, include_qualitative=True):
qualitative = iedb_df.ix[iedb_df["Units"] != "nM"].copy()
qualitative["measurement_type"] = "qualitative"
print("Qualitative measurements: %d" % len(qualitative))
#qualitative = qualitative.ix[
# (~qualitative["Method/Technique"].str.contains("mass spec"))
#].copy()
if not include_mass_spec:
qualitative = qualitative.ix[
(~qualitative["Method/Technique"].str.contains("mass spec"))
].copy()
qualitative["Quantitative measurement"] = (
qualitative["Qualitative Measure"].map(QUALITATIVE_TO_AFFINITY))
......@@ -169,7 +237,7 @@ def run():
dfs = []
for filename in args.data_iedb:
df = load_data_iedb(filename)
df = load_data_iedb(filename, include_mass_spec=args.include_mass_spec)
dfs.append(df)
for filename in args.data_kim2014:
df = load_data_kim2014(filename)
......@@ -185,8 +253,20 @@ def run():
]
print("Kim2014 data now: %s" % str(df.shape))
dfs.append(df)
for filename in args.data_systemhc_atlas:
df = load_data_systemhc_atlas(filename)
dfs.append(df)
for filename in args.data_abelin_mass_spec:
df = load_data_abelin_mass_spec(filename)
dfs.append(df)
df = pandas.concat(dfs, ignore_index=True)
print("Combined df: %s" % (str(df.shape)))
print("Removing combined duplicates")
df = df.drop_duplicates(["allele", "peptide", "measurement_value"])
print("New combined df: %s" % (str(df.shape)))
df = df[[
"allele",
"peptide",
......@@ -197,7 +277,7 @@ def run():
"original_allele",
]].sort_values(["allele", "peptide"]).dropna()
print("Combined df: %s" % (str(df.shape)))
print("Final combined df: %s" % (str(df.shape)))
df.to_csv(args.out_csv, index=False)
print("Wrote: %s" % args.out_csv)
......
# Kim 2014 Data
This download contains the BD2009, BD2013, and BLIND datasets from [Dataset size and composition impact the reliability of performance benchmarks for peptide-MHC binding predictions](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241). BD2013 (augmented with more recent data from IEDB) is used to train the production MHCflurry models. BD2009 and BLIND are useful for performing validation on held-out data.
These files are available on dropbox here:
* https://dl.dropboxusercontent.com/u/3967524/bdata.2009.mhci.public.1.txt
* https://dl.dropboxusercontent.com/u/3967524/bdata.20130222.mhci.public.1.txt
* https://dl.dropboxusercontent.com/u/3967524/bdata.2013.mhci.public.blind.1.txt
To generate this download run:
```
./GENERATE.sh
```
\ No newline at end of file
#!/bin/bash
#
# Download some published MHC I ligand data
#
# Fetches the Kim 2014 benchmark files and the Abelin et al. 2017 mass-spec
# hits into a scratch directory, then packs them (together with a copy of
# this script and a log of the run) into ${DOWNLOAD_NAME}.tar.bz2.
#
set -e
set -x

DOWNLOAD_NAME=data_published
SCRATCH_DIR="${TMPDIR-/tmp}/mhcflurry-downloads-generation"
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"

# Start from an empty working directory for this download.
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"

# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)

# Log some environment info
date
pip freeze
# git rev-parse HEAD
git status

# Quoted so that a TMPDIR containing whitespace does not word-split.
cd "$SCRATCH_DIR/$DOWNLOAD_NAME"

# Download kim2014 data
wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2009.mhci.public.1.txt
wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.20130222.mhci.public.1.txt
wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2013.mhci.public.blind.1.txt

# Download abelin et al 2017 data
wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/abelin2017.hits.csv.bz2

# Ship the generating script alongside the data; quoted for paths with spaces.
cp "$SCRIPT_ABSOLUTE_PATH" .
bzip2 LOG.txt
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *

echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
# Published datasets
These datasets are derived from publications and do not change.
To generate this download run:
```
./GENERATE.sh
```
## Kim 2014
This download contains the BD2009, BD2013, and BLIND datasets from
[Dataset size and composition impact the reliability of performance benchmarks for peptide-MHC binding predictions](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241).
BD2013 (augmented with more recent data from IEDB) is used to train the production
MHCflurry models. BD2009 and BLIND are useful for performing validation on held-out data.
## Abelin et al. Immunity 2017
This download contains the peptides identified in
[Mass Spectrometry Profiling of HLA-Associated Peptidomes in Mono-allelic Cells Enables More Accurate Epitope Prediction](https://www.ncbi.nlm.nih.gov/pubmed/28228285).
#!/bin/bash
#
# Download some published MHC I ligand data from a location on Dropbox.
# Download some published MHC I ligands identified by mass-spec
#
#
set -e
set -x
DOWNLOAD_NAME=data_kim2014
DOWNLOAD_NAME=data_systemhcatlas
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
......@@ -26,9 +26,9 @@ git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
wget --quiet https://dl.dropboxusercontent.com/u/3967524/bdata.2009.mhci.public.1.txt
wget --quiet https://dl.dropboxusercontent.com/u/3967524/bdata.20130222.mhci.public.1.txt
wget --quiet https://dl.dropboxusercontent.com/u/3967524/bdata.2013.mhci.public.blind.1.txt
wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/systemhc.20171121.combined.csv.bz2
mv systemhc.20171121.combined.csv.bz2 data.csv.bz2
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt
......
# SysteMHC database dump
This is a data dump of the [SysteMHC Atlas](https://systemhcatlas.org/) provided
by personal communication. It is distributed under the ODC Open Database License.
To generate this download run:
```
./GENERATE.sh
```
\ No newline at end of file
......@@ -32,7 +32,7 @@ mkdir models
cp $SCRIPT_DIR/hyperparameters.yaml .
time mhcflurry-class1-train-allele-specific-models \
--data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \
--data "$(mhcflurry-downloads path data_curated)/curated_training_data.no_mass_spec.csv.bz2" \
--hyperparameters hyperparameters.yaml \
--out-models-dir models \
--percent-rank-calibration-num-peptides-per-length 1000000 \
......
......@@ -20,7 +20,7 @@ releases:
1.1.0:
compatibility-version: 2
downloads:
- name: models_class1
- name: models_class1_no_mass_spec
url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.1/models_class1.20180116.tar.bz2
default: true
......@@ -36,12 +36,16 @@ releases:
url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_iedb.tar.bz2
default: false
- name: data_kim2014
url: http://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_kim2014.tar.bz2
- name: data_published
url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.1/data_published.tar.bz2
default: false
- name: data_systemhcatlas
url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.1/data_systemhcatlas.tar.bz2
default: false
- name: data_curated
url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_curated.tar.bz2
url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.1/data_curated.tar.bz2
default: true
1.0.0:
......
......@@ -37,7 +37,7 @@ try:
import pypandoc
readme = pypandoc.convert(readme, to='rst', format='md')
except:
logging.warn("Conversion of long_description from MD to RST failed")
logging.warning("Conversion of long_description from MD to RST failed")
pass
with open('mhcflurry/__init__.py', 'r') as f:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment