Commit 3ec5dbe1 authored by Tim O'Donnell

updates

parent 057e272a
# Class I allele-specific models (ensemble)
This download contains trained MHC Class I allele-specific MHCflurry models. For each allele, an ensemble of predictors is trained on random halves of the training data. Model architectures are selected based on performance on the other half of the dataset, so in general each ensemble contains predictors of different architectures. At prediction time the geometric mean IC50 is taken over the trained models. The training data used is in the [data_combined_iedb_kim2014](../data_combined_iedb_kim2014) MHCflurry download.
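For intuition, here is a minimal Python sketch (not the library's own code) of that geometric-mean combination:

```python
import numpy as np

def ensemble_ic50(member_ic50s):
    # Geometric mean of the per-model IC50 predictions (nM):
    # exponentiate the mean of the logs.
    member_ic50s = np.asarray(member_ic50s, dtype=float)
    return float(np.exp(np.mean(np.log(member_ic50s))))

ensemble_ic50([120.0, 80.0, 95.0])  # -> ~97.0 nM
```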
The training script supports multi-node parallel execution using the [kubeface](https://github.com/hammerlab/kubeface) library. To use kubeface, create a Google Storage bucket and pass it via the `--kubeface-storage` argument shown below.

To generate this download, we run:
```
./GENERATE.sh \
--parallel-backend kubeface \
--target-tasks 200 \
--kubeface-backend kubernetes \
--kubeface-storage gs://kubeface-tim \
--kubeface-worker-image hammerlab/mhcflurry-misc:latest \
--kubeface-kubernetes-task-resources-memory-mb 10000 \
--kubeface-worker-path-prefix venv-py3/bin \
--kubeface-max-simultaneous-tasks 200 \
--kubeface-speculation-max-reruns 3
```
To debug locally:
```
./GENERATE.sh \
--parallel-backend local-threads \
--target-tasks 1
```
......@@ -47,7 +47,8 @@ class Class1AffinityPredictor(object):
manifest_df : pandas.DataFrame, optional
Must have columns: model_name, allele, config_json, model.
Only required if you want to update an existing serialization of a
Class1AffinityPredictor. Otherwise this dataframe will be generated
automatically based on the supplied models.
"""
if allele_to_allele_specific_models is None:
......@@ -91,6 +92,12 @@ class Class1AffinityPredictor(object):
"""
Serialize the predictor to a directory on disk.
The serialization format consists of a file called "manifest.csv" with
the configurations of each Class1NeuralNetwork, along with per-network
files giving the model weights. If there are pan-allele predictors in
the ensemble, the allele pseudosequences are also stored in the
directory.
Parameters
----------
models_dir : string
......
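As a usage sketch (paths hypothetical; `load` matches its use later in this commit, and the method documented here is assumed to be named `save`):

```python
# Assumed absolute form of the relative import used elsewhere in this commit.
from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor

# Load a serialized predictor: reads manifest.csv plus per-network weights.
predictor = Class1AffinityPredictor.load("/path/to/models")

# Re-serialize it elsewhere; writes manifest.csv and the weight files
# (and allele pseudosequences, if the ensemble has pan-allele predictors).
predictor.save("/tmp/models-copy")
```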
"""
Train Class1 single allele models.
"""
import sys
......@@ -8,7 +8,6 @@ import json
import pandas
from .class1_affinity_predictor import Class1AffinityPredictor
from ..common import configure_logging
......@@ -17,14 +16,19 @@ parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"--data",
metavar="FILE.csv",
required=True,
help=(
"Training data CSV. Expected columns: "
"allele, peptide, measurement_value"))
parser.add_argument(
"--out-models-dir",
metavar="DIR",
required=True,
help="Directory to write models and manifest")
parser.add_argument(
"--hyperparameters",
metavar="FILE.json",
required=True,
help="JSON of hyperparameters")
parser.add_argument(
......@@ -39,15 +43,10 @@ parser.add_argument(
metavar="N",
default=50,
help="Train models for alleles with >=N measurements.")
parser.add_argument(
"--only-quantitative",
action="store_true",
help="Exclude qualitative measurements",
default=False)
parser.add_argument(
"--verbosity",
type=int,
help="Keras verbosity. Default: %(default)s",
default=1)
......
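For illustration, a hypothetical snippet writing a minimal training CSV with the columns named in the `--data` help text (measurement units assumed to be nM affinities):

```python
import pandas

# Three example measurements; "measurement_value" is assumed to be an
# IC50-like affinity in nM, matching the rest of the codebase.
pandas.DataFrame({
    "allele": ["HLA-A0201", "HLA-A0201", "H-2-KB"],
    "peptide": ["LLFGYPVYV", "SIINFEQL", "SIINFEKL"],
    "measurement_value": [50.0, 4500.0, 30.0],
}).to_csv("train_data.csv", index=False)
```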
......@@ -13,7 +13,6 @@
# limitations under the License.
from __future__ import print_function, division, absolute_import
from math import exp, log
import itertools
import collections
import logging
......@@ -28,73 +27,6 @@ import pandas
from . import amino_acid
class UnsupportedAllele(Exception):
pass
def parse_int_list(s):
return [int(part.strip()) for part in s.split(",")]
def split_uppercase_sequences(s):
return [part.strip().upper() for part in s.split(",")]
MHC_PREFIXES = [
"HLA",
"H-2",
"Mamu",
"Patr",
"Gogo",
"ELA",
]
def normalize_allele_name(allele_name, default_prefix="HLA"):
"""
Only works for a small number of species.
TODO: use the same logic as mhctools for MHC name parsing.
Possibly even worth its own small repo called something like "mhcnames"
"""
allele_name = allele_name.upper()
# old school HLA-C serotypes look like "Cw"
allele_name = allele_name.replace("CW", "C")
prefix = default_prefix
for candidate in MHC_PREFIXES:
# match either the canonical prefix (e.g. "H-2") or its dash-less form ("H2")
variants = (candidate.upper(), candidate.replace("-", "").upper())
matched = next((v for v in variants if allele_name.startswith(v)), None)
if matched:
prefix = candidate
# strip the text actually matched ("H-2" and "H2" differ in length)
allele_name = allele_name[len(matched):]
break
for pattern in MHC_PREFIXES + ["-", "*", ":"]:
allele_name = allele_name.replace(pattern, "")
return "%s%s" % (prefix + "-" if prefix else "", allele_name)
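Tracing the function above, a few example inputs and their normalized outputs (shown as a sketch):

```python
normalize_allele_name("hla-a*02:01")  # -> "HLA-A0201"
normalize_allele_name("A0201")        # -> "HLA-A0201" (default prefix applied)
normalize_allele_name("h-2kb")        # -> "H-2-KB"
```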
def split_allele_names(s):
return [
normalize_allele_name(part.strip())
for part
in s.split(",")
]
def geometric_mean(xs, weights=None):
"""
Geometric mean of a collection of affinity measurements, with optional
sample weights.
"""
if len(xs) == 1:
return xs[0]
elif weights is None:
return exp(sum(log(xi) for xi in xs) / len(xs))
sum_weighted_log = sum(log(xi) * wi for (xi, wi) in zip(xs, weights))
denom = sum(weights)
return exp(sum_weighted_log / denom)
def all_combinations(**dict_of_lists):
"""
Iterator that generates all combinations of parameters given in the
......@@ -107,36 +39,6 @@ def all_combinations(**dict_of_lists):
yield dict(zip(arg_names, combination_of_values))
def groupby_indices(iterable, key_fn=lambda x: x):
"""
Returns dictionary mapping unique values to list of indices that
had those values.
"""
index_groups = collections.defaultdict(list)
for i, key in enumerate(key_fn(x) for x in iterable):
index_groups[key].append(i)
return index_groups
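A small illustration of the returned mapping (a defaultdict of value -> indices):

```python
groupby_indices(["a", "b", "a", "c"])
# -> {"a": [0, 2], "b": [1], "c": [3]}
```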
def shuffle_split_list(indices, fraction=0.5):
"""
Split a list of indices into two sub-lists, with an optional parameter
controlling what fraction of the indices go to the left list.
"""
indices = np.asarray(indices)
np.random.shuffle(indices)
n = len(indices)
left_count = int(np.ceil(fraction * n))
if n > 1 and left_count == 0:
left_count = 1
elif n > 1 and left_count == n:
left_count = n - 1
return indices[:left_count], indices[left_count:]
def dataframe_cryptographic_hash(df):
"""
Return a cryptographic (i.e. collisions extremely unlikely) hash
......@@ -244,18 +146,21 @@ def drop_nulls_and_warn(df, related_df_with_same_index_to_describe=None):
return new_df
def from_ic50(ic50):
# Convert an IC50 in nM to a regression target in [0, 1] (1 = strongest
# binder), using 50000 nM as the maximum measurable IC50, then clip.
x = 1.0 - (numpy.log(ic50) / numpy.log(50000))
return numpy.minimum(
1.0,
numpy.maximum(0.0, x))

def to_ic50(x):
# Inverse of from_ic50: map a regression target back to an IC50 in nM.
return 50000.0 ** (1.0 - x)
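A quick numeric check of the two transforms (values computed from the formulas above):

```python
from_ic50(50.0)     # -> ~0.638, since 1 - log(50)/log(50000) is about 0.638
from_ic50(50000.0)  # -> 0.0 (the weakest measurable binder maps to the floor)
to_ic50(0.638)      # -> ~50 nM (inverse of the above)
```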
def amino_acid_distribution(peptides, smoothing=0.0):
"""
Compute the fraction of each amino acid across a collection of peptides.
Parameters
----------
peptides : list of string
smoothing : float, optional
Small number (e.g. 0.01) to add to all amino acid fractions. The higher
the number the more uniform the distribution.
Returns
-------
pandas.Series indexed by amino acids
"""
peptides = pandas.Series(peptides)
aa_counts = pandas.Series(peptides.map(collections.Counter).sum())
normalized = aa_counts / aa_counts.sum()
......
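For example (without smoothing the result is just each residue's empirical frequency):

```python
amino_acid_distribution(["SIINFEKL", "SIINFEQL"])
# 16 residues total: I occurs 4x -> 0.25; S, N, F, E, L occur 2x -> 0.125 each;
# K and Q occur 1x -> 0.0625 each.
```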
......@@ -252,9 +252,14 @@ class EncodableSequences(object):
string of length max_length
"""
assert len(klass.unknown_character) == 1
if len(sequence) < left_edge + right_edge:
raise ValueError(
"Sequence '%s' (length %d) unsupported: length must be at "
"least %d" % (sequence, len(sequence), left_edge + right_edge))
if len(sequence) > max_length:
raise ValueError(
"Sequence '%s' (length %d) unsupported: length must be at "
"most %d" % (sequence, len(sequence), max_length))
middle_length = max_length - left_edge - right_edge
......
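The code that actually builds the fixed-length string is elided here; as a rough sketch of the scheme the validation above implies (fixed left and right edges, middle padded with the unknown character; the exact placement of the padding is an assumption):

```python
def to_fixed_length(sequence, left_edge=4, right_edge=4, max_length=15, unknown="X"):
    # Keep `left_edge` residues from the start and `right_edge` from the end;
    # pad the (possibly empty) middle with `unknown` up to `max_length`.
    middle_length = max_length - left_edge - right_edge
    middle = sequence[left_edge:len(sequence) - right_edge]
    pad_total = middle_length - len(middle)
    left_pad = pad_total // 2
    return (sequence[:left_edge]
            + unknown * left_pad
            + middle
            + unknown * (pad_total - left_pad)
            + sequence[-right_edge:])

to_fixed_length("SIINFEKL")  # -> 'SIINXXXXXXXFEKL' (length 15)
```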
......@@ -41,10 +41,10 @@ from __future__ import (
)
import sys
import argparse
import logging
import itertools
import pandas
from .downloads import get_path
from .class1_affinity_prediction import Class1AffinityPredictor
......@@ -99,7 +99,7 @@ parser.add_argument(
metavar="DIR",
default=None,
help="Directory containing models. "
"Default: %s" % get_path("models_class1", "models", test_exists=False))
def run(argv=sys.argv[1:]):
......@@ -148,24 +148,11 @@ def run(argv=sys.argv[1:]):
# The reason we set the default here instead of in the argument parser is that
# we want to test_exists at this point, so the user gets a message instructing
# them to download the models if needed.
models_dir = get_path("models_class1", "models")
predictor = Class1AffinityPredictor.load(models_dir)
df[args.prediction_column] = predictor.predict(
peptides=df[args.peptide_column].values,
alleles=df[args.allele_column].values)
if args.out:
df.to_csv(args.out, index=False)
......
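The same vectorized call can be used directly, sketched below (the models directory is whatever `load` was given above):

```python
predictor = Class1AffinityPredictor.load(models_dir)
ic50s = predictor.predict(
    peptides=["SIINFEKL", "SIINFEQL"],
    alleles=["HLA-A0201", "HLA-A0201"])  # one allele per peptide
```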
......@@ -8,7 +8,7 @@ import sklearn
import numpy
import scipy
from .regression_target import from_ic50
def make_scores(
......@@ -39,7 +39,7 @@ def make_scores(
dict with entries "auc", "f1", "tau"
"""
y_pred = from_ic50(ic50_y_pred, max_ic50)
try:
auc = sklearn.metrics.roc_auc_score(
ic50_y <= threshold_nm,
......
......@@ -37,8 +37,8 @@ def test_csv():
def test_no_csv():
args = [
"--alleles", "HLA-A0201", "H-2-Kb",
"--peptides", "SIINFEKL", "DENDREKLLL", "PICKLEEE",
"--prediction-column", "prediction",
]
......@@ -58,5 +58,5 @@ def test_no_csv():
assert_equal(result.shape, (6, 3))
sub_result1 = result.ix[result.peptide == "SIINFEKL"].set_index("allele")
assert (
sub_result1.ix["H-2-Kb"].prediction <
sub_result1.ix["HLA-A0201"].prediction)