Commit 56f84119 authored by Tim O'Donnell

Begin rewrite

parent f8d13fc0
Showing 7 additions and 21159 deletions
# The combined training set
This download contains the data used to train the production class1 MHCflurry models. This data is derived from a recent [IEDB](http://www.iedb.org/home_v3.php) export as well as the data from [Kim 2014](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241).
The latest IEDB data is downloaded as part of generating this dataset. The Kim 2014 data is in its own MHCflurry download [here](../data_kim2014).
Since affinity is measured using a variety of assays, some of which are incompatible with one another, the `create-combined-class1-dataset.py` script filters the Class I binding assays available in IEDB, retaining only those whose measurements closely agree with overlapping entries in BD2013.
To generate this download run:
```
./GENERATE.sh
```
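
The assay-filtering criterion used by `create-combined-class1-dataset.py` is worth stating explicitly: an IEDB assay is kept only if a sufficiently large fraction of its measurements agree, to within a relative tolerance, with the overlapping BD2013 values. A minimal sketch of that check (standalone pandas code; the function name and the `overlap` dataframe are illustrative, while the `meas`/`value` column names follow the script):

```
import pandas as pd

def assay_agrees_with_bd2013(overlap, tolerance=0.01, min_fraction_same=0.9):
    # overlap: rows measured in both BD2013 ("meas") and the IEDB assay ("value")
    abs_diff = (overlap["value"] - overlap["meas"]).abs()
    fraction_similar = (abs_diff <= overlap["meas"] * tolerance).mean()
    return fraction_similar > min_fraction_same

example = pd.DataFrame({"meas": [100.0, 2000.0], "value": [100.5, 5000.0]})
print(assay_agrees_with_bd2013(example))  # False: only half the rows agree within 1%
```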
#!/usr/bin/env python
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Combine 2013 Kim/Peters NetMHCpan dataset[*] with more recent IEDB entries
* = "AffinityMeasurementDataset size and composition impact the reliability..."
"""
from __future__ import (
    print_function,
    division,
    absolute_import,
    unicode_literals
)
import pickle
from collections import Counter
import argparse

import pandas as pd

parser = argparse.ArgumentParser(usage=__doc__)

parser.add_argument(
    "--ic50-fraction-tolerance",
    default=0.01,
    type=float,
    help=(
        "How much can the IEDB and NetMHCpan IC50 differ and still be"
        " considered compatible (as a fraction of the NetMHCpan value). "
        "Default: %(default)s"))

parser.add_argument(
    "--min-assay-overlap-size",
    type=int,
    default=1,
    help="Minimum number of entries overlapping between IEDB assay and "
    "NetMHCpan data. Default: %(default)s")

parser.add_argument(
    "--min-assay-fraction-same",
    type=float,
    help="Minimum fraction of peptides whose IC50 values agree with the "
    "NetMHCpan data. Default: %(default)s",
    default=0.9)

parser.add_argument(
    "--iedb-pickle-path",
    required=True,
    help="Path to .pickle file containing dictionary of IEDB assay datasets.")

parser.add_argument(
    "--netmhcpan-csv-path",
    required=True,
    help="Path to CSV with NetMHCpan dataset from 2013 Peters paper.")

parser.add_argument(
    "--output-csv-filename",
    required=True,
    help="Name of combined CSV file.")

parser.add_argument(
    "--extra-dataset-csv-path",
    default=[],
    action="append",
    help="Additional CSV data source with columns (species, mhc, peptide, meas)")


if __name__ == "__main__":
    args = parser.parse_args()
    print("Reading %s..." % args.iedb_pickle_path)
    with open(args.iedb_pickle_path, "rb") as f:
        iedb_datasets = pickle.load(f)
    print("Reading %s..." % args.netmhcpan_csv_path)
    nielsen_data = pd.read_csv(args.netmhcpan_csv_path, sep="\t")
    print("Size of 2013 NetMHCpan dataset: %d" % len(nielsen_data))
    new_allele_counts = Counter()
    combined_columns = {
        "species": list(nielsen_data["species"]),
        "mhc": list(nielsen_data["mhc"]),
        "peptide": list(nielsen_data["sequence"]),
        "peptide_length": list(nielsen_data["peptide_length"]),
        "meas": list(nielsen_data["meas"]),
    }
    all_datasets = {
        path: pd.read_csv(path) for path in args.extra_dataset_csv_path
    }
    all_datasets.update(iedb_datasets)
    for assay, assay_dataset in sorted(all_datasets.items(), key=lambda x: len(x[1])):
        joined = nielsen_data.merge(
            assay_dataset,
            left_on=["mhc", "sequence"],
            right_on=["mhc", "peptide"],
            how="outer")
        if len(joined) == 0:
            continue
        # drop NaN binding values and entries without values in both datasets
        left_missing = joined["meas"].isnull()
        right_missing = joined["value"].isnull()
        overlap_filter_mask = ~(left_missing | right_missing)
        filtered = joined[overlap_filter_mask]
        n_overlap = len(filtered)
        if n_overlap < args.min_assay_overlap_size:
            continue
        # count what fraction of this IEDB assay is within the tolerance of the
        # values in the Nielsen dataset
        tolerance = filtered["meas"] * args.ic50_fraction_tolerance
        abs_diff = (filtered["value"] - filtered["meas"]).abs()
        similar_values = abs_diff <= tolerance
        fraction_similar = similar_values.mean()
        print("Assay=%s, count=%d" % (assay, len(assay_dataset)))
        print(" # entries w/ values in both data sets: %d" % n_overlap)
        print(" fraction similar binding values=%0.4f" % fraction_similar)
        new_peptides = joined[left_missing & ~right_missing]
        if fraction_similar > args.min_assay_fraction_same:
            print("---")
            print("\t using assay: %s" % (assay,))
            print("---")
            combined_columns["mhc"].extend(new_peptides["mhc"])
            combined_columns["peptide"].extend(new_peptides["peptide"])
            combined_columns["peptide_length"].extend(new_peptides["peptide"].str.len())
            combined_columns["meas"].extend(new_peptides["value"])
            # TODO: make this work for non-human data
            combined_columns["species"].extend(["human"] * len(new_peptides))
            for allele in new_peptides["mhc"]:
                new_allele_counts[allele] += 1

    combined_df = pd.DataFrame(
        combined_columns,
        columns=["species", "mhc", "peptide", "peptide_length", "meas"])
    # filter out post-translational modifications and peptides with unknown
    # residues
    modified_peptide_mask = combined_df.peptide.str.contains(r"\+")
    n_modified = modified_peptide_mask.sum()
    if n_modified > 0:
        print("Dropping %d modified peptides" % n_modified)
        combined_df = combined_df[~modified_peptide_mask]
    print("New entry allele distribution")
    for (allele, count) in new_allele_counts.most_common():
        print("%s: %d" % (allele, count))
    print("Combined DataFrame size: %d (+%d)" % (
        len(combined_df),
        len(combined_df) - len(nielsen_data)))
    print("Writing %s..." % args.output_csv_filename)
    combined_df.to_csv(args.output_csv_filename, index=False)
#!/usr/bin/env python
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Turn a raw CSV snapshot of the IEDB contents into a usable
class I binding prediction dataset by grouping all unique pMHCs
"""
from collections import defaultdict
import pickle
import argparse
import numpy as np
import pandas as pd
parser = argparse.ArgumentParser(usage=__doc__)

parser.add_argument(
    "--input-csv",
    required=True,
    help="CSV file with IEDB's MHC binding data.")

parser.add_argument(
    "--output-pickle-filename",
    required=True,
    help="Path to .pickle file containing dictionary of IEDB assay datasets.")

parser.add_argument(
    "--alleles",
    metavar="ALLELE",
    nargs="+",
    default=[],
    help="Restrict dataset to specified alleles")


def filter_class1_alleles(df):
    mhc_class = df["MHC"]["MHC allele class"]
    print("MHC class counts: \n%s" % (mhc_class.value_counts(),))
    class1_mask = mhc_class == "I"
    return df[class1_mask]


def filter_allele_names(df):
    alleles = df["MHC"]["Allele Name"]
    invalid_allele_mask = alleles.str.contains(" ") | alleles.str.contains("/")
    invalid_alleles = alleles[invalid_allele_mask]
    print("-- Invalid allele names: %s" % (list(sorted(set(invalid_alleles)))))
    print("Dropping %d with complex alleles (e.g. descriptions of mutations)" %
          len(invalid_alleles))
    return df[~invalid_allele_mask]


def filter_affinity_values(df):
    affinities = df["Assay"]["Quantitative measurement"]
    finite_affinity_mask = ~affinities.isnull() & np.isfinite(affinities)
    invalid_affinity_mask = ~finite_affinity_mask
    print("Dropping %d rows without finite affinity measurements" % (
        invalid_affinity_mask.sum(),))
    return df[finite_affinity_mask]


def filter_mhc_dataframe(df):
    filter_functions = [
        filter_class1_alleles,
        filter_allele_names,
        filter_affinity_values,
    ]
    for fn in filter_functions:
        df = fn(df)
    return df


def groupby_assay(df):
    assay_group = df["Assay"]["Assay Group"]
    assay_method = df["Assay"]["Method/Technique"]
    groups = df.groupby([assay_group, assay_method])
    # speed up repeated calls to np.log by caching log affinities as a column
    # in the dataframe
    df["_log_affinity"] = np.log(df["Assay"]["Quantitative measurement"])
    # speed up computing percent positive with the helper column
    qualitative = df["Assay"]["Qualitative Measure"]
    df["_qualitative_positive"] = qualitative.str.startswith("Positive")
    print("---")
    print("Assays")
    assay_dataframes = {}
    # create a dataframe for every distinct kind of assay which is used
    # by IEDB submitters to measure peptide-MHC affinity or stability
    for (assay_group, assay_method), group_data in sorted(
            groups,
            key=lambda x: len(x[1]),
            reverse=True):
        print("- %s (%s): %d" % (assay_group, assay_method, len(group_data)))
        group_alleles = group_data["MHC"]["Allele Name"]
        group_peptides = group_data["Epitope"]["Description"]
        distinct_pmhc = group_data.groupby([group_alleles, group_peptides])
        columns = defaultdict(list)
        for (allele, peptide), pmhc_group in distinct_pmhc:
            columns["mhc"].append(allele)
            columns["peptide"].append(peptide)
            positive = pmhc_group["_qualitative_positive"]
            count = len(pmhc_group)
            if count == 1:
                ic50 = pmhc_group["Assay"]["Quantitative measurement"].mean()
            else:
                # averaging the log affinities preserves orders of magnitude better
                ic50 = np.exp(np.mean(pmhc_group["_log_affinity"]))
            columns["value"].append(ic50)
            columns["percent_positive"].append(positive.mean())
            columns["count"].append(count)
        assay_dataframes[(assay_group, assay_method)] = pd.DataFrame(
            columns,
            columns=[
                "mhc",
                "peptide",
                "value",
                "percent_positive",
                "count"])
        print("# distinct pMHC entries: %d" % len(columns["mhc"]))
    return assay_dataframes


if __name__ == "__main__":
    args = parser.parse_args()
    df = pd.read_csv(
        args.input_csv,
        error_bad_lines=False,
        encoding="latin-1",
        header=[0, 1])
    df = filter_mhc_dataframe(df)
    alleles = df["MHC"]["Allele Name"]
    n = len(alleles)
    print("# Class I rows: %d" % n)
    print("# Class I alleles: %d" % len(set(alleles)))
    print("Unique alleles: %s" % list(sorted(set(alleles))))
    if args.alleles:
        print("User-supplied allele whitelist: %s" % (args.alleles,))
        mask = np.zeros(n, dtype=bool)
        for pattern in args.alleles:
            pattern_mask = alleles.str.startswith(pattern)
            print("# %s: %d" % (pattern, pattern_mask.sum()))
            mask |= pattern_mask
        df = df[mask]
        print("# entries matching alleles %s: %d" % (
            args.alleles,
            len(df)))
    assay_dataframes = groupby_assay(df)
    with open(args.output_pickle_filename, "wb") as f:
        pickle.dump(assay_dataframes, f, pickle.HIGHEST_PROTOCOL)
@@ -3,10 +3,9 @@
set -e
set -x
DOWNLOAD_NAME=data_combined_iedb_kim2014
DOWNLOAD_NAME=data_iedb
SCRATCH_DIR=/tmp/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
@@ -18,28 +17,13 @@ exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git rev-parse HEAD
git status
cd "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir .tmp # By starting with a dot, we won't include it in the tar archive
cd .tmp
cd $SCRATCH_DIR/$DOWNLOAD_NAME
wget --quiet http://www.iedb.org/doc/mhc_ligand_full.zip
unzip mhc_ligand_full.zip
rm mhc_ligand_full.zip
$SCRIPT_DIR/create-iedb-class1-dataset.py \
--input-csv mhc_ligand_full.csv \
--output-pickle-filename iedb_human_class1_assay_datasets.pickle
$SCRIPT_DIR/create-combined-class1-dataset.py \
--iedb-pickle-path iedb_human_class1_assay_datasets.pickle \
--netmhcpan-csv-path "$(mhcflurry-downloads path data_kim2014)/bdata.20130222.mhci.public.1.txt" \
--output-csv-filename ../combined_human_class1_dataset.csv
cd ..
cp $SCRIPT_ABSOLUTE_PATH .
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
#!/bin/bash
if [[ $# -eq 0 ]] ; then
echo 'WARNING: This script is intended to be called with additional arguments to pass to mhcflurry-class1-allele-specific-ensemble-train'
echo 'See README.md'
fi
set -e
set -x
DOWNLOAD_NAME=models_class1_allele_specific_ensemble
SCRATCH_DIR=/tmp/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git rev-parse HEAD
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
mkdir models
cp $SCRIPT_DIR/models.py .
python models.py > models.json
time mhcflurry-class1-allele-specific-ensemble-train \
--ensemble-size 16 \
--model-architectures models.json \
--train-data "$(mhcflurry-downloads path data_combined_iedb_kim2014)/combined_human_class1_dataset.csv" \
--min-samples-per-allele 20 \
--out-manifest selected_models.csv \
--out-model-selection-manifest all_models.csv \
--out-models models \
--verbose \
"$@"
bzip2 all_models.csv
cp $SCRIPT_ABSOLUTE_PATH .
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
# Class I allele-specific models (ensemble)
This download contains trained MHC Class I allele-specific MHCflurry models. For each allele, an ensemble of predictors is trained on random halves of the training data. Model architectures are selected based on performance on the other half of the dataset, so in general each ensemble contains predictors of different architectures. At prediction time the geometric mean IC50 is taken over the trained models. The training data used is in the [data_combined_iedb_kim2014](../data_combined_iedb_kim2014) MHCflurry download.
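
As a sketch of the prediction-time combination described above (the geometric mean of the ensemble members' IC50 predictions; the function below is illustrative and not part of the MHCflurry API):

```
import numpy as np

def combine_ensemble_ic50s(per_model_ic50s):
    # Geometric mean: average the log IC50s, then exponentiate.
    return float(np.exp(np.mean(np.log(per_model_ic50s))))

print(combine_ensemble_ic50s([120.0, 95.0, 210.0]))  # ~133.8 nM
```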
The training script supports multi-node parallel execution using the [kubeface](https://github.com/hammerlab/kubeface) library.
To use kubeface, create a Google Storage bucket and pass it with the `--kubeface-storage` argument as shown below.
To generate this download we run:
```
./GENERATE.sh \
    --parallel-backend kubeface \
    --target-tasks 200 \
    --kubeface-backend kubernetes \
    --kubeface-storage gs://kubeface-tim \
    --kubeface-worker-image hammerlab/mhcflurry-misc:latest \
    --kubeface-kubernetes-task-resources-memory-mb 10000 \
    --kubeface-worker-path-prefix venv-py3/bin \
    --kubeface-max-simultaneous-tasks 200 \
    --kubeface-speculation-max-reruns 3
```
To debug locally:
```
./GENERATE.sh \
    --parallel-backend local-threads \
    --target-tasks 1
```
# Class1 allele-specific ensemble models
To generate the report, run:
```
time jupyter-nbconvert report.ipynb \
--execute \
--ExecutePreprocessor.kernel_name=python \
--ExecutePreprocessor.timeout=60 \
--to html \
--stdout > report.html
```
import sys
from mhcflurry.class1_allele_specific_ensemble import HYPERPARAMETER_DEFAULTS
import json
models = HYPERPARAMETER_DEFAULTS.models_grid(
    impute=[False, True],
    activation=["tanh"],
    layer_sizes=[[12], [64], [128]],
    embedding_output_dim=[8, 32, 64],
    dropout_probability=[0, .1, .25],
    fraction_negative=[0, .1, .2],
    n_training_epochs=[250],
    # Imputation arguments
    impute_method=["mice"],
    imputer_args=[
        # Arguments specific to imputation method (mice)
        {"n_burn_in": 5, "n_imputations": 50, "n_nearest_columns": 25}
    ],
    impute_min_observations_per_peptide=[3],
    impute_min_observations_per_allele=[3])
sys.stderr.write("Models: %d\n" % len(models))
print(json.dumps(models, indent=4))
#!/bin/bash
if [[ $# -eq 0 ]] ; then
echo 'WARNING: This script is intended to be called with additional arguments to pass to mhcflurry-class1-allele-specific-cv-and-train'
echo 'At minimum you probably want to pass --dask-scheduler <IP:PORT> as training many models on one node is extremely slow.'
fi
set -e
set -x
DOWNLOAD_NAME=models_class1_allele_specific_single
SCRATCH_DIR=/tmp/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
# pip freeze
# git rev-parse HEAD
# git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
mkdir models
cp $SCRIPT_DIR/models.py $SCRIPT_DIR/imputer.json .
python models.py > models.json
time mhcflurry-class1-allele-specific-cv-and-train \
--model-architectures models.json \
--imputer-description imputer.json \
--train-data "$(mhcflurry-downloads path data_combined_iedb_kim2014)/combined_human_class1_dataset.csv" \
--min-samples-per-allele 200 \
--out-cv-results cv.csv \
--out-production-results production.csv \
--out-models models \
--verbose \
"$@"
cp $SCRIPT_ABSOLUTE_PATH .
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
# Class I allele-specific models (single)
This download contains trained MHC Class I allele-specific MHCflurry models. The training data used is in the [data_combined_iedb_kim2014](../data_combined_iedb_kim2014) MHCflurry download. We first select network hyperparameters for each allele individually using cross validation over the models enumerated in [models.py](models.py). The best hyperparameter settings are selected by averaging AUC (at the 500 nM cutoff), F1, and Kendall's Tau over the training folds. We then train the production models over the full training set using the selected hyperparameters.
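
A minimal sketch of the fold-level selection score described above, averaging AUC (computed at the 500 nM binder cutoff), F1, and Kendall's Tau. This is illustrative scikit-learn/SciPy code rather than the training pipeline itself, and the exact metric conventions (e.g. using negated IC50 as the ranking score) are assumptions:

```
import numpy as np
from scipy.stats import kendalltau
from sklearn.metrics import f1_score, roc_auc_score

def fold_selection_score(true_ic50, pred_ic50, cutoff=500.0):
    true_ic50 = np.asarray(true_ic50, dtype=float)
    pred_ic50 = np.asarray(pred_ic50, dtype=float)
    true_binder = true_ic50 <= cutoff
    pred_binder = pred_ic50 <= cutoff
    auc = roc_auc_score(true_binder, -pred_ic50)  # lower IC50 = stronger binder
    f1 = f1_score(true_binder, pred_binder)
    tau = kendalltau(true_ic50, pred_ic50).correlation
    return float(np.mean([auc, f1, tau]))
```

Hyperparameter settings would then be ranked by this score averaged over the cross-validation folds.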
The training script supports multi-node parallel execution using the [kubeface](https://github.com/hammerlab/kubeface) library.
To use kubeface, create a Google Storage bucket and pass it with the `--storage-prefix` argument as shown below.
To generate this download we run:
```
./GENERATE.sh \
    --cv-folds-per-task 10 \
    --backend kubernetes \
    --storage-prefix gs://kubeface \
    --worker-image hammerlab/mhcflurry:latest \
    --kubernetes-task-resources-memory-mb 10000 \
    --worker-path-prefix venv-py3/bin \
    --max-simultaneous-tasks 200
```
{
    "imputation_method_name": "mice",
    "n_burn_in": 5,
    "n_imputations": 50,
    "n_nearest_columns": 25,
    "min_observations_per_peptide": 5,
    "min_observations_per_allele": 100
}
import sys
from mhcflurry.class1_allele_specific.train import HYPERPARAMETER_DEFAULTS
import json
models = HYPERPARAMETER_DEFAULTS.models_grid(
    impute=[False, True],
    activation=["tanh"],
    layer_sizes=[[12], [64], [128]],
    embedding_output_dim=[8, 32, 64],
    dropout_probability=[0, .1, .25],
    fraction_negative=[0, .1, .2],
    n_training_epochs=[250])
sys.stderr.write("Models: %d\n" % len(models))
print(json.dumps(models, indent=4))
#!/bin/bash
if [[ $# -eq 0 ]] ; then
echo 'WARNING: This script is intended to be called with additional arguments to pass to mhcflurry-class1-allele-specific-cv-and-train'
echo 'At minimum you probably want to pass --dask-scheduler <IP:PORT> as training many models on one node is extremely slow.'
fi
set -e
set -x
DOWNLOAD_NAME=models_class1_allele_specific_single_kim2014_only
SCRATCH_DIR=/tmp/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git rev-parse HEAD
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
mkdir models
cp $SCRIPT_DIR/models.py $SCRIPT_DIR/imputer.json .
python models.py > models.json
time mhcflurry-class1-allele-specific-cv-and-train \
--model-architectures models.json \
--imputer-description imputer.json \
--train-data "$(mhcflurry-downloads path data_kim2014)/bdata.2009.mhci.public.1.txt" \
--test-data "$(mhcflurry-downloads path data_kim2014)/bdata.2013.mhci.public.blind.1.txt" \
--min-samples-per-allele 50 \
--out-cv-results cv.csv \
--out-production-results production.csv \
--out-models models \
--verbose \
"$@"
cp $SCRIPT_ABSOLUTE_PATH .
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
# Class I allele-specific models (single) trained and tested on the Kim 2014 dataset
This is a reimplementation of the analysis in [Predicting Peptide-MHC Binding Affinities With Imputed Training Data](http://biorxiv.org/content/early/2016/05/22/054775).
{
    "imputation_method_name": "mice",
    "n_burn_in": 5,
    "n_imputations": 50,
    "n_nearest_columns": 25,
    "min_observations_per_peptide": 2,
    "min_observations_per_allele": 2
}
import sys
from mhcflurry.class1_allele_specific.train import HYPERPARAMETER_DEFAULTS
import json
models = HYPERPARAMETER_DEFAULTS.models_grid(
    # impute=[False, True],
    impute=[False],
    activation=["tanh"],
    layer_sizes=[[12], [64], [128]],
    embedding_output_dim=[8, 32, 64],
    dropout_probability=[0, .1, .25],
    # fraction_negative=[0, .1, .2],
    n_training_epochs=[250])
sys.stderr.write("Models: %d\n" % len(models))
print(json.dumps(models, indent=4))
@@ -12,24 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .class1_allele_specific.class1_binding_predictor import (
from .class1_affinity_prediction.class1_binding_predictor import (
Class1BindingPredictor)
from .prediction import predict
from .affinity_measurement_dataset import AffinityMeasurementDataset
from .class1_allele_specific_ensemble import Class1EnsembleMultiAllelePredictor
from .class1_allele_specific import Class1SingleModelMultiAllelePredictor
from .measurement_collection import MeasurementCollection
from . import parallelism
from .class1_affinity_prediction.multi_allele_predictor_ensemble import (
MultiAllelePredictorEnsemble)
__version__ = "0.2.0"
__all__ = [
    "Class1BindingPredictor",
    "predict",
    "parallelism",
    "AffinityMeasurementDataset",
    "Class1EnsembleMultiAllelePredictor",
    "Class1SingleModelMultiAllelePredictor",
    "MeasurementCollection",
    "MultiAllelePredictorEnsemble",
    "__version__",
]