Commit fa7de70d authored by Tim O'Donnell, committed by GitHub

Merge pull request #133 from openvax/kim-evaluation

Kim evaluation
parents 44bd56e5 387346ac
#!/bin/bash
#
set -x
DOWNLOAD_NAME=models_class1_kim_benchmark
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git status
cd "$SCRATCH_DIR/$DOWNLOAD_NAME"
cp "$SCRIPT_DIR/curate.py" .
cp "$SCRIPT_DIR/write_validation_data.py" .
time python curate.py \
--data-kim2014 \
"$(mhcflurry-downloads path data_published)/bdata.2009.mhci.public.1.txt" \
--out-csv train.csv
bzip2 train.csv
mkdir models
cp "$SCRIPT_DIR/class1_pseudosequences.csv" .
python "$SCRIPT_DIR/generate_hyperparameters.py" > hyperparameters.yaml
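# Size the worker pools below from the detected GPUs and CPU cores.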
GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
echo "Detected GPUS: $GPUS"
PROCESSORS=$(getconf _NPROCESSORS_ONLN)
echo "Detected processors: $PROCESSORS"
time mhcflurry-class1-train-allele-specific-models \
--data "train.csv.bz2" \
--allele-sequences class1_pseudosequences.csv \
--hyperparameters hyperparameters.yaml \
--out-models-dir models \
--held-out-fraction-reciprocal 10 \
--min-measurements-per-allele 20 \
--num-jobs $(expr $PROCESSORS \* 2) --gpus $GPUS --max-workers-per-gpu 2 --max-tasks-per-worker 50
time python ./write_validation_data.py \
--include "train.csv.bz2" \
--exclude "models/train_data.csv.bz2" \
--only-alleles-present-in-exclude \
--out-data test.csv \
--out-summary test.summary.csv
wc -l test.csv
mkdir selected-models
time mhcflurry-class1-select-allele-specific-models \
--data test.csv \
--models-dir models \
--out-models-dir selected-models \
--scoring combined:mse,consensus \
--consensus-num-peptides-per-length 10000 \
--combined-min-models 8 \
--combined-max-models 16 \
--num-jobs $(expr $PROCESSORS \* 2) --gpus $GPUS --max-workers-per-gpu 2 --max-tasks-per-worker 5
time mhcflurry-calibrate-percentile-ranks \
--models-dir selected-models \
--num-peptides-per-length 100000 \
--num-jobs $(expr $PROCESSORS \* 2) --gpus $GPUS --max-workers-per-gpu 2 --max-tasks-per-worker 50
cp "$SCRIPT_ABSOLUTE_PATH" .
bzip2 LOG.txt
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
# Kim benchmark
This download trains an MHCflurry predictor on the BD2009 dataset from
[Dataset size and composition impact the reliability of performance benchmarks for peptide-MHC binding predictions](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241). The trained predictor can then be evaluated on the BLIND test set from the same publication.
We include this download as a resource for others wishing to benchmark new
binding predictors. It's also a working example of a complete MHCflurry
training and model selection run on a simple dataset. See
[GENERATE.sh](./GENERATE.sh).
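Once the archive is fetched (e.g. `mhcflurry-downloads fetch models_class1_kim_benchmark`), the trained predictor can be loaded through the regular MHCflurry Python API. A minimal sketch, assuming the download is registered under that name and keeps the `selected-models` directory layout produced by GENERATE.sh:

```python
from mhcflurry import Class1AffinityPredictor
from mhcflurry.downloads import get_path

# Load the model-selected predictor from the fetched download.
predictor = Class1AffinityPredictor.load(
    get_path("models_class1_kim_benchmark", "selected-models"))

# Predicted affinities are in nM; lower means stronger binding.
print(predictor.predict(["SIINFEKL"], allele="HLA-A*02:01"))
```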
"""
Filter and combine various peptide/MHC datasets to derive a composite training set,
optionally including eluted peptides identified by mass-spec.
"""
import sys
import argparse
import pandas
import mhcnames
def normalize_allele_name(s):
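    """Normalize an MHC allele name, returning "UNKNOWN" if it cannot be
    parsed. Falls back to repairing names like "HLA-A-0201" to "HLA-A*0201"
    before giving up."""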
try:
return mhcnames.normalize_allele_name(s)
except Exception:
try:
(a,b,c) = s.split("-")
return mhcnames.normalize_allele_name(
"%s-%s*%s" % (a,b,c))
except Exception:
return "UNKNOWN"
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"--data-kim2014",
action="append",
default=[],
help="Path to Kim 2014-style affinity data")
parser.add_argument(
"--data-iedb",
action="append",
default=[],
help="Path to IEDB-style affinity data (e.g. mhc_ligand_full.csv)")
parser.add_argument(
"--data-systemhc-atlas",
action="append",
default=[],
help="Path to systemhc-atlas-style mass-spec data")
parser.add_argument(
"--data-abelin-mass-spec",
action="append",
default=[],
help="Path to Abelin Immunity 2017 mass-spec hits")
parser.add_argument(
"--include-iedb-mass-spec",
action="store_true",
default=False,
help="Include mass-spec observations in IEDB")
parser.add_argument(
"--out-csv",
required=True,
help="Result file")
QUALITATIVE_TO_AFFINITY_AND_INEQUALITY = {
"Negative": (5000.0, ">"),
"Positive": (500.0, "<"), # used for mass-spec hits
"Positive-High": (100.0, "<"),
"Positive-Intermediate": (1000.0, "<"),
"Positive-Low": (5000.0, "<"),
}
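# Split the (affinity, inequality) pairs into two single-purpose lookup tables.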
QUALITATIVE_TO_AFFINITY = dict(
(key, value[0]) for (key, value)
in QUALITATIVE_TO_AFFINITY_AND_INEQUALITY.items())
QUALITATIVE_TO_INEQUALITY = dict(
(key, value[1]) for (key, value)
in QUALITATIVE_TO_AFFINITY_AND_INEQUALITY.items())
EXCLUDE_IEDB_ALLELES = [
"HLA class I",
"HLA class II",
]
def load_data_kim2014(filename):
df = pandas.read_table(filename)
print("Loaded kim2014 data: %s" % str(df.shape))
df["measurement_source"] = "kim2014"
df["measurement_value"] = df.meas
df["measurement_type"] = (df.inequality == "=").map({
True: "quantitative",
False: "qualitative",
})
df["measurement_inequality"] = df.inequality
df["original_allele"] = df.mhc
df["peptide"] = df.sequence
df["allele"] = df.mhc.map(normalize_allele_name)
print("Dropping un-parseable alleles: %s" % ", ".join(
df.ix[df.allele == "UNKNOWN"]["mhc"].unique()))
df = df.ix[df.allele != "UNKNOWN"]
print("Loaded kim2014 data: %s" % str(df.shape))
return df
def load_data_systemhc_atlas(filename, min_probability=0.99):
df = pandas.read_csv(filename)
print("Loaded systemhc atlas data: %s" % str(df.shape))
df["measurement_source"] = "systemhc-atlas"
df["measurement_value"] = QUALITATIVE_TO_AFFINITY["Positive"]
df["measurement_inequality"] = "<"
df["measurement_type"] = "qualitative"
df["original_allele"] = df.top_allele
df["peptide"] = df.search_hit
df["allele"] = df.top_allele.map(normalize_allele_name)
print("Dropping un-parseable alleles: %s" % ", ".join(
str(x) for x in df.ix[df.allele == "UNKNOWN"]["top_allele"].unique()))
df = df.loc[df.allele != "UNKNOWN"]
print("Systemhc atlas data now: %s" % str(df.shape))
print("Dropping data points with probability < %f" % min_probability)
df = df.loc[df.prob >= min_probability]
print("Systemhc atlas data now: %s" % str(df.shape))
print("Removing duplicates")
df = df.drop_duplicates(["allele", "peptide"])
print("Systemhc atlas data now: %s" % str(df.shape))
return df
def load_data_abelin_mass_spec(filename):
df = pandas.read_csv(filename)
print("Loaded Abelin mass-spec data: %s" % str(df.shape))
df["measurement_source"] = "abelin-mass-spec"
df["measurement_value"] = QUALITATIVE_TO_AFFINITY["Positive"]
df["measurement_inequality"] = "<"
df["measurement_type"] = "qualitative"
df["original_allele"] = df.allele
df["allele"] = df.original_allele.map(normalize_allele_name)
print("Dropping un-parseable alleles: %s" % ", ".join(
str(x) for x in df.ix[df.allele == "UNKNOWN"]["allele"].unique()))
df = df.loc[df.allele != "UNKNOWN"]
print("Abelin mass-spec data now: %s" % str(df.shape))
print("Removing duplicates")
df = df.drop_duplicates(["allele", "peptide"])
print("Abelin mass-spec data now: %s" % str(df.shape))
return df
def load_data_iedb(iedb_csv, include_qualitative=True, include_mass_spec=False):
iedb_df = pandas.read_csv(iedb_csv, skiprows=1, low_memory=False)
print("Loaded iedb data: %s" % str(iedb_df.shape))
print("Selecting only class I")
    iedb_df = iedb_df.loc[
        iedb_df["MHC allele class"].str.strip().str.upper() == "I"
    ]
print("New shape: %s" % str(iedb_df.shape))
print("Dropping known unusuable alleles")
iedb_df = iedb_df.ix[
~iedb_df["Allele Name"].isin(EXCLUDE_IEDB_ALLELES)
]
    iedb_df = iedb_df.loc[
        (~iedb_df["Allele Name"].str.contains("mutant")) &
        (~iedb_df["Allele Name"].str.contains("CD1"))
    ]
iedb_df["allele"] = iedb_df["Allele Name"].map(normalize_allele_name)
print("Dropping un-parseable alleles: %s" % ", ".join(
iedb_df.ix[iedb_df.allele == "UNKNOWN"]["Allele Name"].unique()))
iedb_df = iedb_df.ix[iedb_df.allele != "UNKNOWN"]
print("IEDB measurements per allele:\n%s" % iedb_df.allele.value_counts())
    quantitative = iedb_df.loc[iedb_df["Units"] == "nM"].copy()
quantitative["measurement_type"] = "quantitative"
quantitative["measurement_inequality"] = "="
print("Quantitative measurements: %d" % len(quantitative))
    qualitative = iedb_df.loc[iedb_df["Units"] != "nM"].copy()
qualitative["measurement_type"] = "qualitative"
print("Qualitative measurements: %d" % len(qualitative))
if not include_mass_spec:
        qualitative = qualitative.loc[
            (~qualitative["Method/Technique"].str.contains("mass spec"))
        ].copy()
qualitative["Quantitative measurement"] = (
qualitative["Qualitative Measure"].map(QUALITATIVE_TO_AFFINITY))
qualitative["measurement_inequality"] = (
qualitative["Qualitative Measure"].map(QUALITATIVE_TO_INEQUALITY))
print("Qualitative measurements (possibly after dropping MS): %d" % (
len(qualitative)))
iedb_df = pandas.concat(
(
([quantitative]) +
([qualitative] if include_qualitative else [])),
ignore_index=True)
print("IEDB measurements per allele:\n%s" % iedb_df.allele.value_counts())
print("Subselecting to valid peptides. Starting with: %d" % len(iedb_df))
iedb_df["Description"] = iedb_df.Description.str.strip()
    iedb_df = iedb_df.loc[
        iedb_df.Description.str.match("^[ACDEFGHIKLMNPQRSTVWY]+$")
    ]
print("Now: %d" % len(iedb_df))
print("Annotating last author and category")
iedb_df["last_author"] = iedb_df.Authors.map(
lambda x: (
x.split(";")[-1]
.split(",")[-1]
.split(" ")[-1]
.strip()
.replace("*", ""))).values
iedb_df["category"] = (
iedb_df["last_author"] + " - " + iedb_df["Method/Technique"]).values
train_data = pandas.DataFrame()
train_data["peptide"] = iedb_df.Description.values
train_data["measurement_value"] = iedb_df[
"Quantitative measurement"
].values
train_data["measurement_source"] = iedb_df.category.values
train_data["measurement_inequality"] = iedb_df.measurement_inequality.values
train_data["allele"] = iedb_df["allele"].values
train_data["original_allele"] = iedb_df["Allele Name"].values
train_data["measurement_type"] = iedb_df["measurement_type"].values
train_data = train_data.drop_duplicates().reset_index(drop=True)
return train_data
def run():
args = parser.parse_args(sys.argv[1:])
dfs = []
for filename in args.data_iedb:
df = load_data_iedb(filename, include_mass_spec=args.include_iedb_mass_spec)
dfs.append(df)
for filename in args.data_kim2014:
df = load_data_kim2014(filename)
df["allele_peptide"] = df.allele + "_" + df.peptide
# Give precedence to IEDB data.
if dfs:
iedb_df = dfs[0]
iedb_df["allele_peptide"] = iedb_df.allele + "_" + iedb_df.peptide
print("Dropping kim2014 data present in IEDB.")
            df = df.loc[
                ~df.allele_peptide.isin(iedb_df.allele_peptide)
            ]
print("Kim2014 data now: %s" % str(df.shape))
dfs.append(df)
for filename in args.data_systemhc_atlas:
df = load_data_systemhc_atlas(filename)
dfs.append(df)
for filename in args.data_abelin_mass_spec:
df = load_data_abelin_mass_spec(filename)
dfs.append(df)
df = pandas.concat(dfs, ignore_index=True)
print("Combined df: %s" % (str(df.shape)))
print("Removing combined duplicates")
df = df.drop_duplicates(["allele", "peptide", "measurement_value"])
print("New combined df: %s" % (str(df.shape)))
df = df[[
"allele",
"peptide",
"measurement_value",
"measurement_inequality",
"measurement_type",
"measurement_source",
"original_allele",
]].sort_values(["allele", "peptide"]).dropna()
print("Final combined df: %s" % (str(df.shape)))
df.to_csv(args.out_csv, index=False)
print("Wrote: %s" % args.out_csv)
if __name__ == '__main__':
run()
"""
Generate grid of hyperparameters
"""
from sys import stdout
from copy import deepcopy
from yaml import dump
base_hyperparameters = {
##########################################
# ENSEMBLE SIZE
##########################################
"n_models": 4,
##########################################
# OPTIMIZATION
##########################################
"max_epochs": 500,
"patience": 20,
"early_stopping": True,
"validation_split": 0.1,
"minibatch_size": None,
"loss": "custom:mse_with_inequalities",
##########################################
# RANDOM NEGATIVE PEPTIDES
##########################################
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
##########################################
# PEPTIDE REPRESENTATION
##########################################
# One of "one-hot", "embedding", or "BLOSUM62".
"peptide_amino_acid_encoding": "BLOSUM62",
"use_embedding": False, # maintained for backward compatability
"embedding_output_dim": 8, # only used if using embedding
"kmer_size": 15,
##########################################
# NEURAL NETWORK ARCHITECTURE
##########################################
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "tanh",
"output_activation": "sigmoid",
"layer_sizes": [16],
"dense_layer_l1_regularization": None,
"batch_normalization": False,
"dropout_probability": 0.0,
##########################################
# TRAINING Data
##########################################
"train_data": {"subset": "all", "pretrain_min_points": 1000},
}
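# Expand the base configuration into a grid, varying the training data subset,
# dense layer size, L1 penalty, and locally connected layer architecture.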
grid = []
for train_subset in ["all", "quantitative"]:
for minibatch_size in [128]:
for dense_layer_size in [8, 16, 32, 64]:
for l1 in [0.0, 0.001]:
for num_lc in [0, 1, 2]:
for lc_kernel_size in [3, 5]:
new = deepcopy(base_hyperparameters)
new["minibatch_size"] = minibatch_size
new["train_data"]["subset"] = train_subset
new["layer_sizes"] = [dense_layer_size]
new["dense_layer_l1_regularization"] = l1
(lc_layer,) = new["locally_connected_layers"]
lc_layer['kernel_size'] = lc_kernel_size
if num_lc == 0:
new["locally_connected_layers"] = []
elif num_lc == 1:
new["locally_connected_layers"] = [lc_layer]
elif num_lc == 2:
new["locally_connected_layers"] = [lc_layer, deepcopy(lc_layer)]
if not grid or new not in grid:
grid.append(new)
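# 2 subsets x 4 dense sizes x 2 L1 values x 5 distinct locally-connected
# variants = 80 unique configurations (kernel size is irrelevant when there
# are no locally connected layers, so those duplicates are skipped).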
dump(grid, stdout)
"""
Write and summarize model validation data, which is obtained by taking a full
dataset and removing the data used for training.
"""
import argparse
import sys
import pandas
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"--include",
metavar="INPUT.csv",
nargs="+",
help="Input CSV to include")
parser.add_argument(
"--exclude",
metavar="INPUT.csv",
nargs="+",
help="Input CSV to exclude")
parser.add_argument(
"--out-data",
metavar="RESULT.csv",
help="Output dadta CSV")
parser.add_argument(
"--out-summary",
metavar="RESULT.csv",
help="Output summary CSV")
parser.add_argument(
"--mass-spec-regex",
metavar="REGEX",
default="mass[- ]spec",
help="Regular expression for mass-spec data. Runs on measurement_source col."
"Default: %(default)s.")
parser.add_argument(
"--only-alleles-present-in-exclude",
action="store_true",
default=False,
help="Filter to only alleles that are present in files given by --exclude. "
"Useful for filtering to only alleles supported by a predictor, where the "
"training data for the predictor is given by --exclude.")
def run(argv):
args = parser.parse_args(argv)
dfs = []
    for filename in args.include:
        df = pandas.read_csv(filename)
        dfs.append(df)
df = pandas.concat(dfs, ignore_index=True)
print("Loaded data with shape: %s" % str(df.shape))
del dfs
    df = df.loc[
        (df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15)
    ]
print("Subselected to 8-15mers: %s" % (str(df.shape)))
if args.exclude:
exclude_dfs = []
for exclude in args.exclude:
exclude_df = pandas.read_csv(exclude)
exclude_dfs.append(exclude_df)
exclude_df = pandas.concat(exclude_dfs, ignore_index=True)
del exclude_dfs
df["_key"] = df.allele + "__" + df.peptide
exclude_df["_key"] = exclude_df.allele + "__" + exclude_df.peptide
df["_excluded"] = df._key.isin(exclude_df._key.unique())
print("Excluding measurements per allele (counts): ")
print(df.groupby("allele")._excluded.sum())
print("Excluding measurements per allele (fractions): ")
print(df.groupby("allele")._excluded.mean())
df = df.loc[~df._excluded]
del df["_excluded"]
del df["_key"]
if args.only_alleles_present_in_exclude:
df = df.loc[df.allele.isin(exclude_df.allele.unique())]
df["mass_spec"] = df.measurement_source.str.contains(args.mass_spec_regex)
    df.loc[df.mass_spec, "measurement_inequality"] = "mass_spec"
if args.out_summary:
summary_df = df.groupby(
["allele", "measurement_inequality"]
)["measurement_value"].count().unstack().fillna(0).astype(int)
summary_df["total"] = summary_df.sum(1)
summary_df.to_csv(args.out_summary)
print("Wrote: %s" % args.out_summary)
if args.out_data:
df.to_csv(args.out_data, index=False)
print("Wrote: %s" % args.out_data)
if __name__ == '__main__':
run(sys.argv[1:])
@@ -144,7 +144,7 @@ def run(argv=sys.argv[1:]):
     configure_logging(verbose=args.verbosity > 1)
     hyperparameters_lst = yaml.load(open(args.hyperparameters))
-    assert isinstance(hyperparameters_lst, list)
+    assert isinstance(hyperparameters_lst, list), hyperparameters_lst
     print("Loaded hyperparameters list: %s" % str(hyperparameters_lst))
     df = pandas.read_csv(args.data)
@@ -171,8 +171,8 @@ def run(argv=sys.argv[1:]):
     ].index)
     # Allele names in data are assumed to be already normalized.
+    print("Selected %d/%d alleles: %s" % (len(alleles), df.allele.nunique(), ' '.join(alleles)))
     df = df.loc[df.allele.isin(alleles)].dropna()
-    print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
     if args.held_out_fraction_reciprocal:
         df = subselect_df_held_out(
@@ -227,6 +227,7 @@ def run(argv=sys.argv[1:]):
         allele_sequences = allele_sequences.loc[
             allele_sequences.index.isin(df.allele.unique())
         ]
+        print("Allele sequences matching train data: %d" % len(allele_sequences))
         blosum_encoding = (
             AlleleEncoding(
                 allele_sequences.index.values,
@@ -377,7 +378,6 @@ def train_model(
     alleles = []
     while not alleles or data_size_by_allele.loc[alleles].sum() < pretrain_min_points:
         alleles.append(similar_alleles.pop(0))
-    print(alleles)
     data = data.loc[data.allele.isin(alleles)]
     assert len(data) >= pretrain_min_points, (len(data), pretrain_min_points)
     train_rounds = (data.allele == allele).astype(int).values