Commit 3dd8150e authored by Tim O'Donnell

First cut on Kim 2014 benchmark script

parent 44bd56e5
#!/bin/bash
#
# Generate the Kim 2014 benchmark: curate training data, train and select
# MHCflurry Class1 allele-specific models, calibrate percentile ranks, and
# package everything into a downloadable archive.
#
set -e
set -x
DOWNLOAD_NAME=models_class1_kim_benchmark
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
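# Resolve this script's absolute path so it can be copied into the archive at the end.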
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git status
cd "$SCRATCH_DIR/$DOWNLOAD_NAME"

cp "$SCRIPT_DIR/curate.py" .
cp "$SCRIPT_DIR/write_validation_data.py" .
time python curate.py \
    --data-kim2014 \
    "$(mhcflurry-downloads path data_published)/bdata.2009.mhci.public.1.txt" \
    --out-csv train.csv
bzip2 train.csv
mkdir models

cp "$SCRIPT_DIR/class1_pseudosequences.csv" .
python "$SCRIPT_DIR/generate_hyperparameters.py" > hyperparameters.yaml
GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
echo "Detected GPUS: $GPUS"
PROCESSORS=$(getconf _NPROCESSORS_ONLN)
echo "Detected processors: $PROCESSORS"
time mhcflurry-class1-train-allele-specific-models \
    --data "train.csv.bz2" \
    --allele-sequences class1_pseudosequences.csv \
    --hyperparameters hyperparameters.yaml \
    --out-models-dir models \
    --held-out-fraction-reciprocal 10 \
    --min-measurements-per-allele 20 \
    --num-jobs $(expr $PROCESSORS \* 2) --gpus $GPUS --max-workers-per-gpu 2 --max-tasks-per-worker 50
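# --held-out-fraction-reciprocal 10 holds out 1/10 of the data from training;
# write_validation_data.py recovers that held-out tenth below by subtracting
# the actual training data (models/train_data.csv.bz2) from the full dataset.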
time python ./write_validation_data.py \
    --include "train.csv.bz2" \
    --exclude "models/train_data.csv.bz2" \
    --only-alleles-present-in-exclude \
    --out-data test.csv \
    --out-summary test.summary.csv
wc -l test.csv
mkdir selected-models
time mhcflurry-class1-select-allele-specific-models \
    --data test.csv \
    --models-dir models \
    --out-models-dir selected-models \
    --scoring combined:mse,consensus \
    --consensus-num-peptides-per-length 10000 \
    --combined-min-models 8 \
    --combined-max-models 16 \
    --num-jobs $(expr $PROCESSORS \* 2) --gpus $GPUS --max-workers-per-gpu 2 --max-tasks-per-worker 5
time mhcflurry-calibrate-percentile-ranks \
    --models-dir selected-models \
    --num-peptides-per-length 100000 \
    --num-jobs $(expr $PROCESSORS \* 2) --gpus $GPUS --max-workers-per-gpu 2 --max-tasks-per-worker 50
cp "$SCRIPT_ABSOLUTE_PATH" .
bzip2 LOG.txt
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
"""
Filter and combine various peptide/MHC datasets to derive a composite training set,
optionally including eluted peptides identified by mass-spec.
"""
import sys
import argparse

import pandas
import mhcnames
def normalize_allele_name(s):
    try:
        return mhcnames.normalize_allele_name(s)
    except Exception:
        try:
            (a, b, c) = s.split("-")
            return mhcnames.normalize_allele_name(
                "%s-%s*%s" % (a, b, c))
        except Exception:
            return "UNKNOWN"
parser = argparse.ArgumentParser(usage=__doc__)

parser.add_argument(
    "--data-kim2014",
    action="append",
    default=[],
    help="Path to Kim 2014-style affinity data")
parser.add_argument(
    "--data-iedb",
    action="append",
    default=[],
    help="Path to IEDB-style affinity data (e.g. mhc_ligand_full.csv)")
parser.add_argument(
    "--data-systemhc-atlas",
    action="append",
    default=[],
    help="Path to SysteMHC Atlas-style mass-spec data")
parser.add_argument(
    "--data-abelin-mass-spec",
    action="append",
    default=[],
    help="Path to Abelin Immunity 2017 mass-spec hits")
parser.add_argument(
    "--include-iedb-mass-spec",
    action="store_true",
    default=False,
    help="Include mass-spec observations in IEDB")
parser.add_argument(
    "--out-csv",
    required=True,
    help="Result file")
QUALITATIVE_TO_AFFINITY_AND_INEQUALITY = {
    "Negative": (5000.0, ">"),
    "Positive": (500.0, "<"),  # used for mass-spec hits
    "Positive-High": (100.0, "<"),
    "Positive-Intermediate": (1000.0, "<"),
    "Positive-Low": (5000.0, "<"),
}
QUALITATIVE_TO_AFFINITY = dict(
    (key, value[0]) for (key, value)
    in QUALITATIVE_TO_AFFINITY_AND_INEQUALITY.items())
QUALITATIVE_TO_INEQUALITY = dict(
    (key, value[1]) for (key, value)
    in QUALITATIVE_TO_AFFINITY_AND_INEQUALITY.items())
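# The mapping above encodes each qualitative label as a censored nM affinity:
# e.g. "Positive-High" becomes "< 100 nM" and "Negative" becomes "> 5000 nM".
# Mass-spec hits reuse the generic "Positive" threshold of < 500 nM.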
EXCLUDE_IEDB_ALLELES = [
    "HLA class I",
    "HLA class II",
]
def load_data_kim2014(filename):
    df = pandas.read_csv(filename, sep="\t")
    print("Loaded kim2014 data: %s" % str(df.shape))
    df["measurement_source"] = "kim2014"
    df["measurement_value"] = df.meas
    # Exact measurements ("=") are quantitative; ">" and "<" are qualitative.
    df["measurement_type"] = (df.inequality == "=").map({
        True: "quantitative",
        False: "qualitative",
    })
    df["measurement_inequality"] = df.inequality
    df["original_allele"] = df.mhc
    df["peptide"] = df.sequence
    df["allele"] = df.mhc.map(normalize_allele_name)
    print("Dropping un-parseable alleles: %s" % ", ".join(
        df.loc[df.allele == "UNKNOWN"]["mhc"].unique()))
    df = df.loc[df.allele != "UNKNOWN"]
    print("Kim2014 data now: %s" % str(df.shape))
    return df
def load_data_systemhc_atlas(filename, min_probability=0.99):
    df = pandas.read_csv(filename)
    print("Loaded systemhc atlas data: %s" % str(df.shape))
    df["measurement_source"] = "systemhc-atlas"
    df["measurement_value"] = QUALITATIVE_TO_AFFINITY["Positive"]
    df["measurement_inequality"] = "<"
    df["measurement_type"] = "qualitative"
    df["original_allele"] = df.top_allele
    df["peptide"] = df.search_hit
    df["allele"] = df.top_allele.map(normalize_allele_name)
    print("Dropping un-parseable alleles: %s" % ", ".join(
        str(x) for x in df.loc[df.allele == "UNKNOWN"]["top_allele"].unique()))
    df = df.loc[df.allele != "UNKNOWN"]
    print("Systemhc atlas data now: %s" % str(df.shape))

    print("Dropping data points with probability < %f" % min_probability)
    df = df.loc[df.prob >= min_probability]
    print("Systemhc atlas data now: %s" % str(df.shape))

    print("Removing duplicates")
    df = df.drop_duplicates(["allele", "peptide"])
    print("Systemhc atlas data now: %s" % str(df.shape))
    return df
def load_data_abelin_mass_spec(filename):
    df = pandas.read_csv(filename)
    print("Loaded Abelin mass-spec data: %s" % str(df.shape))
    df["measurement_source"] = "abelin-mass-spec"
    df["measurement_value"] = QUALITATIVE_TO_AFFINITY["Positive"]
    df["measurement_inequality"] = "<"
    df["measurement_type"] = "qualitative"
    df["original_allele"] = df.allele
    df["allele"] = df.original_allele.map(normalize_allele_name)
    print("Dropping un-parseable alleles: %s" % ", ".join(
        str(x) for x in df.loc[df.allele == "UNKNOWN"]["allele"].unique()))
    df = df.loc[df.allele != "UNKNOWN"]
    print("Abelin mass-spec data now: %s" % str(df.shape))

    print("Removing duplicates")
    df = df.drop_duplicates(["allele", "peptide"])
    print("Abelin mass-spec data now: %s" % str(df.shape))
    return df
def load_data_iedb(iedb_csv, include_qualitative=True, include_mass_spec=False):
    iedb_df = pandas.read_csv(iedb_csv, skiprows=1, low_memory=False)
    print("Loaded iedb data: %s" % str(iedb_df.shape))

    print("Selecting only class I")
    iedb_df = iedb_df.loc[
        iedb_df["MHC allele class"].str.strip().str.upper() == "I"
    ]
    print("New shape: %s" % str(iedb_df.shape))

    print("Dropping known unusable alleles")
    iedb_df = iedb_df.loc[
        ~iedb_df["Allele Name"].isin(EXCLUDE_IEDB_ALLELES)
    ]
    iedb_df = iedb_df.loc[
        (~iedb_df["Allele Name"].str.contains("mutant")) &
        (~iedb_df["Allele Name"].str.contains("CD1"))
    ]
    iedb_df["allele"] = iedb_df["Allele Name"].map(normalize_allele_name)
    print("Dropping un-parseable alleles: %s" % ", ".join(
        iedb_df.loc[iedb_df.allele == "UNKNOWN"]["Allele Name"].unique()))
    iedb_df = iedb_df.loc[iedb_df.allele != "UNKNOWN"]
    print("IEDB measurements per allele:\n%s" % iedb_df.allele.value_counts())

    quantitative = iedb_df.loc[iedb_df["Units"] == "nM"].copy()
    quantitative["measurement_type"] = "quantitative"
    quantitative["measurement_inequality"] = "="
    print("Quantitative measurements: %d" % len(quantitative))

    qualitative = iedb_df.loc[iedb_df["Units"] != "nM"].copy()
    qualitative["measurement_type"] = "qualitative"
    print("Qualitative measurements: %d" % len(qualitative))
    if not include_mass_spec:
        qualitative = qualitative.loc[
            (~qualitative["Method/Technique"].str.contains("mass spec"))
        ].copy()
    # Assign censored nM affinities and inequalities to qualitative labels.
    qualitative["Quantitative measurement"] = (
        qualitative["Qualitative Measure"].map(QUALITATIVE_TO_AFFINITY))
    qualitative["measurement_inequality"] = (
        qualitative["Qualitative Measure"].map(QUALITATIVE_TO_INEQUALITY))
    print("Qualitative measurements (possibly after dropping MS): %d" % (
        len(qualitative)))

    iedb_df = pandas.concat(
        (
            ([quantitative]) +
            ([qualitative] if include_qualitative else [])),
        ignore_index=True)
    print("IEDB measurements per allele:\n%s" % iedb_df.allele.value_counts())

    print("Subselecting to valid peptides. Starting with: %d" % len(iedb_df))
    iedb_df["Description"] = iedb_df.Description.str.strip()
    # Keep only peptides made of the 20 standard amino acids.
    iedb_df = iedb_df.loc[
        iedb_df.Description.str.match("^[ACDEFGHIKLMNPQRSTVWY]+$")
    ]
    print("Now: %d" % len(iedb_df))

    print("Annotating last author and category")
    iedb_df["last_author"] = iedb_df.Authors.map(
        lambda x: (
            x.split(";")[-1]
            .split(",")[-1]
            .split(" ")[-1]
            .strip()
            .replace("*", ""))).values
    iedb_df["category"] = (
        iedb_df["last_author"] + " - " + iedb_df["Method/Technique"]).values

    train_data = pandas.DataFrame()
    train_data["peptide"] = iedb_df.Description.values
    train_data["measurement_value"] = iedb_df[
        "Quantitative measurement"
    ].values
    train_data["measurement_source"] = iedb_df.category.values
    train_data["measurement_inequality"] = iedb_df.measurement_inequality.values
    train_data["allele"] = iedb_df["allele"].values
    train_data["original_allele"] = iedb_df["Allele Name"].values
    train_data["measurement_type"] = iedb_df["measurement_type"].values
    train_data = train_data.drop_duplicates().reset_index(drop=True)
    return train_data
def run():
    args = parser.parse_args(sys.argv[1:])

    dfs = []
    for filename in args.data_iedb:
        df = load_data_iedb(filename, include_mass_spec=args.include_iedb_mass_spec)
        dfs.append(df)
    for filename in args.data_kim2014:
        df = load_data_kim2014(filename)
        df["allele_peptide"] = df.allele + "_" + df.peptide

        # Give precedence to IEDB data.
        if dfs:
            iedb_df = dfs[0]
            iedb_df["allele_peptide"] = iedb_df.allele + "_" + iedb_df.peptide
            print("Dropping kim2014 data present in IEDB.")
            df = df.loc[
                ~df.allele_peptide.isin(iedb_df.allele_peptide)
            ]
            print("Kim2014 data now: %s" % str(df.shape))
        dfs.append(df)
    for filename in args.data_systemhc_atlas:
        df = load_data_systemhc_atlas(filename)
        dfs.append(df)
    for filename in args.data_abelin_mass_spec:
        df = load_data_abelin_mass_spec(filename)
        dfs.append(df)

    df = pandas.concat(dfs, ignore_index=True)
    print("Combined df: %s" % (str(df.shape)))

    print("Removing combined duplicates")
    df = df.drop_duplicates(["allele", "peptide", "measurement_value"])
    print("New combined df: %s" % (str(df.shape)))

    df = df[[
        "allele",
        "peptide",
        "measurement_value",
        "measurement_inequality",
        "measurement_type",
        "measurement_source",
        "original_allele",
    ]].sort_values(["allele", "peptide"]).dropna()
    print("Final combined df: %s" % (str(df.shape)))

    df.to_csv(args.out_csv, index=False)
    print("Wrote: %s" % args.out_csv)


if __name__ == '__main__':
    run()
"""
Generate grid of hyperparameters
"""
from sys import stdout
from copy import deepcopy
from yaml import dump
base_hyperparameters = {
    ##########################################
    # ENSEMBLE SIZE
    ##########################################
    "n_models": 4,

    ##########################################
    # OPTIMIZATION
    ##########################################
    "max_epochs": 500,
    "patience": 20,
    "early_stopping": True,
    "validation_split": 0.1,
    "minibatch_size": None,
    "loss": "custom:mse_with_inequalities",

    ##########################################
    # RANDOM NEGATIVE PEPTIDES
    ##########################################
    "random_negative_rate": 0.0,
    "random_negative_constant": 25,
    "random_negative_affinity_min": 20000.0,
    "random_negative_affinity_max": 50000.0,

    ##########################################
    # PEPTIDE REPRESENTATION
    ##########################################
    # One of "one-hot", "embedding", or "BLOSUM62".
    "peptide_amino_acid_encoding": "BLOSUM62",
    "use_embedding": False,  # maintained for backward compatibility
    "embedding_output_dim": 8,  # only used if using embedding
    "kmer_size": 15,

    ##########################################
    # NEURAL NETWORK ARCHITECTURE
    ##########################################
    "locally_connected_layers": [
        {
            "filters": 8,
            "activation": "tanh",
            "kernel_size": 3
        }
    ],
    "activation": "tanh",
    "output_activation": "sigmoid",
    "layer_sizes": [16],
    "dense_layer_l1_regularization": None,
    "batch_normalization": False,
    "dropout_probability": 0.0,

    ##########################################
    # TRAINING DATA
    ##########################################
    "train_data": {"subset": "all", "pretrain_min_points": 1000},
}
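# The nested loops below enumerate 2 (train subsets) x 1 (minibatch size) x
# 4 (dense layer sizes) x 2 (l1 penalties) x 3 (locally connected depths) x
# 2 (kernel sizes) = 96 raw combinations. When num_lc == 0 the kernel size
# has no effect, so the duplicate check leaves 80 unique hyperparameter
# settings, each specifying an ensemble of 4 models.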
grid = []
for train_subset in ["all", "quantitative"]:
    for minibatch_size in [128]:
        for dense_layer_size in [8, 16, 32, 64]:
            for l1 in [0.0, 0.001]:
                for num_lc in [0, 1, 2]:
                    for lc_kernel_size in [3, 5]:
                        new = deepcopy(base_hyperparameters)
                        new["minibatch_size"] = minibatch_size
                        new["train_data"]["subset"] = train_subset
                        new["layer_sizes"] = [dense_layer_size]
                        new["dense_layer_l1_regularization"] = l1
                        (lc_layer,) = new["locally_connected_layers"]
                        lc_layer['kernel_size'] = lc_kernel_size
                        if num_lc == 0:
                            new["locally_connected_layers"] = []
                        elif num_lc == 1:
                            new["locally_connected_layers"] = [lc_layer]
                        elif num_lc == 2:
                            new["locally_connected_layers"] = [
                                lc_layer, deepcopy(lc_layer)
                            ]
                        if not grid or new not in grid:
                            grid.append(new)

dump(grid, stdout)
"""
Write and summarize model validation data, which is obtained by taking a full
dataset and removing the data used for training.
"""
import argparse
import sys

import pandas
parser = argparse.ArgumentParser(usage=__doc__)

parser.add_argument(
    "--include",
    metavar="INPUT.csv",
    nargs="+",
    help="Input CSV to include")
parser.add_argument(
    "--exclude",
    metavar="INPUT.csv",
    nargs="+",
    help="Input CSV to exclude")
parser.add_argument(
    "--out-data",
    metavar="RESULT.csv",
    help="Output data CSV")
parser.add_argument(
    "--out-summary",
    metavar="RESULT.csv",
    help="Output summary CSV")
parser.add_argument(
    "--mass-spec-regex",
    metavar="REGEX",
    default="mass[- ]spec",
    help="Regular expression for mass-spec data. Runs on measurement_source col. "
    "Default: %(default)s.")
parser.add_argument(
    "--only-alleles-present-in-exclude",
    action="store_true",
    default=False,
    help="Filter to only alleles that are present in files given by --exclude. "
    "Useful for filtering to only alleles supported by a predictor, where the "
    "training data for the predictor is given by --exclude.")
def run(argv):
    args = parser.parse_args(argv)

    dfs = []
    for input in args.include:
        df = pandas.read_csv(input)
        dfs.append(df)
    df = pandas.concat(dfs, ignore_index=True)
    print("Loaded data with shape: %s" % str(df.shape))
    del dfs

    df = df.loc[
        (df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15)
    ]
    print("Subselected to 8-15mers: %s" % (str(df.shape)))

    if args.exclude:
        exclude_dfs = []
        for exclude in args.exclude:
            exclude_df = pandas.read_csv(exclude)
            exclude_dfs.append(exclude_df)
        exclude_df = pandas.concat(exclude_dfs, ignore_index=True)
        del exclude_dfs

        df["_key"] = df.allele + "__" + df.peptide
        exclude_df["_key"] = exclude_df.allele + "__" + exclude_df.peptide
        df["_excluded"] = df._key.isin(exclude_df._key.unique())
        print("Excluding measurements per allele (counts): ")
        print(df.groupby("allele")._excluded.sum())
        print("Excluding measurements per allele (fractions): ")
        print(df.groupby("allele")._excluded.mean())
        df = df.loc[~df._excluded]
        del df["_excluded"]
        del df["_key"]

        if args.only_alleles_present_in_exclude:
            df = df.loc[df.allele.isin(exclude_df.allele.unique())]
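    # Tag mass-spec measurements (matched by --mass-spec-regex against
    # measurement_source) with the sentinel inequality "mass_spec" so the
    # summary below counts them as their own column.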
df["mass_spec"] = df.measurement_source.str.contains(args.mass_spec_regex)
df.loc[df.mass_spec , "measurement_inequality"] = "mass_spec"
if args.out_summary:
summary_df = df.groupby(
["allele", "measurement_inequality"]
)["measurement_value"].count().unstack().fillna(0).astype(int)
summary_df["total"] = summary_df.sum(1)
summary_df.to_csv(args.out_summary)
print("Wrote: %s" % args.out_summary)
if args.out_data:
df.to_csv(args.out_data, index=False)
print("Wrote: %s" % args.out_data)
if __name__ == '__main__':
run(sys.argv[1:])
@@ -144,7 +144,7 @@ def run(argv=sys.argv[1:]):
     configure_logging(verbose=args.verbosity > 1)

     hyperparameters_lst = yaml.load(open(args.hyperparameters))
-    assert isinstance(hyperparameters_lst, list)
+    assert isinstance(hyperparameters_lst, list), hyperparameters_lst
     print("Loaded hyperparameters list: %s" % str(hyperparameters_lst))

     df = pandas.read_csv(args.data)
@@ -171,8 +171,8 @@ def run(argv=sys.argv[1:]):
         ].index)

     # Allele names in data are assumed to be already normalized.
+    print("Selected %d/%d alleles: %s" % (len(alleles), df.allele.nunique(), ' '.join(alleles)))
     df = df.loc[df.allele.isin(alleles)].dropna()
-    print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))

     if args.held_out_fraction_reciprocal:
         df = subselect_df_held_out(
@@ -227,6 +227,7 @@ def run(argv=sys.argv[1:]):
         allele_sequences = allele_sequences.loc[
             allele_sequences.index.isin(df.allele.unique())
         ]
+        print("Allele sequences matching train data: %d" % len(allele_sequences))
         blosum_encoding = (
             AlleleEncoding(
                 allele_sequences.index.values,
@@ -377,7 +378,6 @@ def train_model(
         alleles = []
         while not alleles or data_size_by_allele.loc[alleles].sum() < pretrain_min_points:
             alleles.append(similar_alleles.pop(0))
-        print(alleles)
         data = data.loc[data.allele.isin(alleles)]
         assert len(data) >= pretrain_min_points, (len(data), pretrain_min_points)
         train_rounds = (data.allele == allele).astype(int).values