Commit 56f84119 authored by Tim O'Donnell

Begin rewrite

parent f8d13fc0
Showing 7 additions and 21159 deletions
# The combined training set
This download contains the data used to train the production class1 MHCflurry models. This data is derived from a recent [IEDB](http://www.iedb.org/home_v3.php) export as well as the data from [Kim 2014](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241).
The latest IEDB data is downloaded as part of generating this dataset. The Kim 2014 data is in its own MHCflurry download [here](../data_kim2014).
Since affinity is measured using a variety of assays, some of which are incompatible with one another, the `create-combined-class1-dataset.py` script filters the Class I binding assays available in IEDB, retaining only those whose measurements closely agree with overlapping entries in BD2013.
To generate this download run:
```
./GENERATE.sh
```
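
The assay-filtering criterion used by `create-combined-class1-dataset.py` is worth stating explicitly: an IEDB assay is kept only if a sufficiently large fraction of its measurements agree, to within a relative tolerance, with the overlapping BD2013 values. A minimal sketch of that check (standalone pandas code; the function name and the `overlap` dataframe are illustrative, while the `meas`/`value` column names follow the script):

```
import pandas as pd

def assay_agrees_with_bd2013(overlap, tolerance=0.01, min_fraction_same=0.9):
    # overlap: rows measured in both BD2013 ("meas") and the IEDB assay ("value")
    abs_diff = (overlap["value"] - overlap["meas"]).abs()
    fraction_similar = (abs_diff <= overlap["meas"] * tolerance).mean()
    return fraction_similar > min_fraction_same

example = pd.DataFrame({"meas": [100.0, 2000.0], "value": [100.5, 5000.0]})
print(assay_agrees_with_bd2013(example))  # False: only half the rows agree within 1%
```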
#!/usr/bin/env python
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Combine 2013 Kim/Peters NetMHCpan dataset[*] with more recent IEDB entries
* = "AffinityMeasurementDataset size and composition impact the reliability..."
"""
from __future__ import (
    print_function,
    division,
    absolute_import,
    unicode_literals
)
import pickle
from collections import Counter
import argparse

import pandas as pd

parser = argparse.ArgumentParser(usage=__doc__)

parser.add_argument(
    "--ic50-fraction-tolerance",
    default=0.01,
    type=float,
    help=(
        "How much can the IEDB and NetMHCpan IC50 differ and still be"
        " considered compatible (as a fraction of the NetMHCpan value). "
        "Default: %(default)s"))

parser.add_argument(
    "--min-assay-overlap-size",
    type=int,
    default=1,
    help="Minimum number of entries overlapping between IEDB assay and "
    "NetMHCpan data. Default: %(default)s")

parser.add_argument(
    "--min-assay-fraction-same",
    type=float,
    help="Minimum fraction of peptides whose IC50 values agree with the "
    "NetMHCpan data. Default: %(default)s",
    default=0.9)

parser.add_argument(
    "--iedb-pickle-path",
    required=True,
    help="Path to .pickle file containing dictionary of IEDB assay datasets.")

parser.add_argument(
    "--netmhcpan-csv-path",
    required=True,
    help="Path to CSV with NetMHCpan dataset from 2013 Peters paper.")

parser.add_argument(
    "--output-csv-filename",
    required=True,
    help="Name of combined CSV file.")

parser.add_argument(
    "--extra-dataset-csv-path",
    default=[],
    action="append",
    help="Additional CSV data source with columns (species, mhc, peptide, meas)")


if __name__ == "__main__":
    args = parser.parse_args()
    print("Reading %s..." % args.iedb_pickle_path)
    with open(args.iedb_pickle_path, "rb") as f:
        iedb_datasets = pickle.load(f)
    print("Reading %s..." % args.netmhcpan_csv_path)
    nielsen_data = pd.read_csv(args.netmhcpan_csv_path, sep="\t")
    print("Size of 2013 NetMHCpan dataset: %d" % len(nielsen_data))
    new_allele_counts = Counter()
    combined_columns = {
        "species": list(nielsen_data["species"]),
        "mhc": list(nielsen_data["mhc"]),
        "peptide": list(nielsen_data["sequence"]),
        "peptide_length": list(nielsen_data["peptide_length"]),
        "meas": list(nielsen_data["meas"]),
    }
    all_datasets = {
        path: pd.read_csv(path) for path in args.extra_dataset_csv_path
    }
    all_datasets.update(iedb_datasets)
    for assay, assay_dataset in sorted(all_datasets.items(), key=lambda x: len(x[1])):
        joined = nielsen_data.merge(
            assay_dataset,
            left_on=["mhc", "sequence"],
            right_on=["mhc", "peptide"],
            how="outer")
        if len(joined) == 0:
            continue
        # drop NaN binding values and entries without values in both datasets
        left_missing = joined["meas"].isnull()
        right_missing = joined["value"].isnull()
        overlap_filter_mask = ~(left_missing | right_missing)
        filtered = joined[overlap_filter_mask]
        n_overlap = len(filtered)
        if n_overlap < args.min_assay_overlap_size:
            continue
        # count what fraction of this IEDB assay is within the tolerance of the
        # values in the Nielsen dataset
        tolerance = filtered["meas"] * args.ic50_fraction_tolerance
        abs_diff = (filtered["value"] - filtered["meas"]).abs()
        similar_values = abs_diff <= tolerance
        fraction_similar = similar_values.mean()
        print("Assay=%s, count=%d" % (assay, len(assay_dataset)))
        print(" # entries w/ values in both data sets: %d" % n_overlap)
        print(" fraction similar binding values=%0.4f" % fraction_similar)
        new_peptides = joined[left_missing & ~right_missing]
        if fraction_similar > args.min_assay_fraction_same:
            print("---")
            print("\t using assay: %s" % (assay,))
            print("---")
            combined_columns["mhc"].extend(new_peptides["mhc"])
            combined_columns["peptide"].extend(new_peptides["peptide"])
            combined_columns["peptide_length"].extend(new_peptides["peptide"].str.len())
            combined_columns["meas"].extend(new_peptides["value"])
            # TODO: make this work for non-human data
            combined_columns["species"].extend(["human"] * len(new_peptides))
            for allele in new_peptides["mhc"]:
                new_allele_counts[allele] += 1

    combined_df = pd.DataFrame(
        combined_columns,
        columns=["species", "mhc", "peptide", "peptide_length", "meas"])
    # filter out post-translational modifications and peptides with unknown
    # residues
    modified_peptide_mask = combined_df.peptide.str.contains(r"\+")
    n_modified = modified_peptide_mask.sum()
    if n_modified > 0:
        print("Dropping %d modified peptides" % n_modified)
        combined_df = combined_df[~modified_peptide_mask]
    print("New entry allele distribution")
    for (allele, count) in new_allele_counts.most_common():
        print("%s: %d" % (allele, count))
    print("Combined DataFrame size: %d (+%d)" % (
        len(combined_df),
        len(combined_df) - len(nielsen_data)))
    print("Writing %s..." % args.output_csv_filename)
    combined_df.to_csv(args.output_csv_filename, index=False)
#!/usr/bin/env python
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Turn a raw CSV snapshot of the IEDB contents into a usable
class I binding prediction dataset by grouping all unique pMHCs
"""
from collections import defaultdict
import pickle
import argparse
import numpy as np
import pandas as pd
parser = argparse.ArgumentParser(usage=__doc__)

parser.add_argument(
    "--input-csv",
    required=True,
    help="CSV file with IEDB's MHC binding data.")

parser.add_argument(
    "--output-pickle-filename",
    required=True,
    help="Path to .pickle file containing dictionary of IEDB assay datasets.")

parser.add_argument(
    "--alleles",
    metavar="ALLELE",
    nargs="+",
    default=[],
    help="Restrict dataset to specified alleles")


def filter_class1_alleles(df):
    mhc_class = df["MHC"]["MHC allele class"]
    print("MHC class counts: \n%s" % (mhc_class.value_counts(),))
    class1_mask = mhc_class == "I"
    return df[class1_mask]


def filter_allele_names(df):
    alleles = df["MHC"]["Allele Name"]
    invalid_allele_mask = alleles.str.contains(" ") | alleles.str.contains("/")
    invalid_alleles = alleles[invalid_allele_mask]
    print("-- Invalid allele names: %s" % (list(sorted(set(invalid_alleles)))))
    print("Dropping %d with complex alleles (e.g. descriptions of mutations)" %
          len(invalid_alleles))
    return df[~invalid_allele_mask]


def filter_affinity_values(df):
    affinities = df["Assay"]["Quantitative measurement"]
    finite_affinity_mask = ~affinities.isnull() & np.isfinite(affinities)
    invalid_affinity_mask = ~finite_affinity_mask
    print("Dropping %d rows without finite affinity measurements" % (
        invalid_affinity_mask.sum(),))
    return df[finite_affinity_mask]


def filter_mhc_dataframe(df):
    filter_functions = [
        filter_class1_alleles,
        filter_allele_names,
        filter_affinity_values,
    ]
    for fn in filter_functions:
        df = fn(df)
    return df


def groupby_assay(df):
    assay_group = df["Assay"]["Assay Group"]
    assay_method = df["Assay"]["Method/Technique"]
    groups = df.groupby([assay_group, assay_method])
    # speed up repeated calls to np.log by caching log affinities as a column
    # in the dataframe
    df["_log_affinity"] = np.log(df["Assay"]["Quantitative measurement"])
    # speed up computing percent positive with the helper column
    qualitative = df["Assay"]["Qualitative Measure"]
    df["_qualitative_positive"] = qualitative.str.startswith("Positive")
    print("---")
    print("Assays")
    assay_dataframes = {}
    # create a dataframe for every distinct kind of assay which is used
    # by IEDB submitters to measure peptide-MHC affinity or stability
    for (assay_group, assay_method), group_data in sorted(
            groups,
            key=lambda x: len(x[1]),
            reverse=True):
        print("- %s (%s): %d" % (assay_group, assay_method, len(group_data)))
        group_alleles = group_data["MHC"]["Allele Name"]
        group_peptides = group_data["Epitope"]["Description"]
        distinct_pmhc = group_data.groupby([group_alleles, group_peptides])
        columns = defaultdict(list)
        for (allele, peptide), pmhc_group in distinct_pmhc:
            columns["mhc"].append(allele)
            columns["peptide"].append(peptide)
            positive = pmhc_group["_qualitative_positive"]
            count = len(pmhc_group)
            if count == 1:
                ic50 = pmhc_group["Assay"]["Quantitative measurement"].mean()
            else:
                # averaging the log affinities preserves orders of magnitude better
                ic50 = np.exp(np.mean(pmhc_group["_log_affinity"]))
            columns["value"].append(ic50)
            columns["percent_positive"].append(positive.mean())
            columns["count"].append(count)
        assay_dataframes[(assay_group, assay_method)] = pd.DataFrame(
            columns,
            columns=[
                "mhc",
                "peptide",
                "value",
                "percent_positive",
                "count"])
        print("# distinct pMHC entries: %d" % len(columns["mhc"]))
    return assay_dataframes


if __name__ == "__main__":
    args = parser.parse_args()
    df = pd.read_csv(
        args.input_csv,
        error_bad_lines=False,
        encoding="latin-1",
        header=[0, 1])
    df = filter_mhc_dataframe(df)
    alleles = df["MHC"]["Allele Name"]
    n = len(alleles)
    print("# Class I rows: %d" % n)
    print("# Class I alleles: %d" % len(set(alleles)))
    print("Unique alleles: %s" % list(sorted(set(alleles))))
    if args.alleles:
        print("User-supplied allele whitelist: %s" % (args.alleles,))
        mask = np.zeros(n, dtype=bool)
        for pattern in args.alleles:
            pattern_mask = alleles.str.startswith(pattern)
            print("# %s: %d" % (pattern, pattern_mask.sum()))
            mask |= pattern_mask
        df = df[mask]
        print("# entries matching alleles %s: %d" % (
            args.alleles,
            len(df)))
    assay_dataframes = groupby_assay(df)
    with open(args.output_pickle_filename, "wb") as f:
        pickle.dump(assay_dataframes, f, pickle.HIGHEST_PROTOCOL)
@@ -3,10 +3,9 @@
set -e
set -x
DOWNLOAD_NAME=data_combined_iedb_kim2014
DOWNLOAD_NAME=data_iedb
SCRATCH_DIR=/tmp/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
@@ -18,28 +17,13 @@ exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git rev-parse HEAD
git status
cd "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir .tmp # By starting with a dot, we won't include it in the tar archive
cd .tmp
cd $SCRATCH_DIR/$DOWNLOAD_NAME
wget --quiet http://www.iedb.org/doc/mhc_ligand_full.zip
unzip mhc_ligand_full.zip
rm mhc_ligand_full.zip
$SCRIPT_DIR/create-iedb-class1-dataset.py \
--input-csv mhc_ligand_full.csv \
--output-pickle-filename iedb_human_class1_assay_datasets.pickle
$SCRIPT_DIR/create-combined-class1-dataset.py \
--iedb-pickle-path iedb_human_class1_assay_datasets.pickle \
--netmhcpan-csv-path "$(mhcflurry-downloads path data_kim2014)/bdata.20130222.mhci.public.1.txt" \
--output-csv-filename ../combined_human_class1_dataset.csv
cd ..
cp $SCRIPT_ABSOLUTE_PATH .
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
#!/bin/bash
if [[ $# -eq 0 ]] ; then
echo 'WARNING: This script is intended to be called with additional arguments to pass to mhcflurry-class1-allele-specific-ensemble-train'
echo 'See README.md'
fi
set -e
set -x
DOWNLOAD_NAME=models_class1_allele_specific_ensemble
SCRATCH_DIR=/tmp/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git rev-parse HEAD
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
mkdir models
cp $SCRIPT_DIR/models.py .
python models.py > models.json
time mhcflurry-class1-allele-specific-ensemble-train \
--ensemble-size 16 \
--model-architectures models.json \
--train-data "$(mhcflurry-downloads path data_combined_iedb_kim2014)/combined_human_class1_dataset.csv" \
--min-samples-per-allele 20 \
--out-manifest selected_models.csv \
--out-model-selection-manifest all_models.csv \
--out-models models \
--verbose \
"$@"
bzip2 all_models.csv
cp $SCRIPT_ABSOLUTE_PATH .
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
# Class I allele-specific models (ensemble)
This download contains trained MHC Class I allele-specific MHCflurry models. For each allele, an ensemble of predictors is trained on random halves of the training data. Model architectures are selected based on performance on the other half of the dataset, so in general each ensemble contains predictors of different architectures. At prediction time the geometric mean IC50 is taken over the trained models. The training data used is in the [data_combined_iedb_kim2014](../data_combined_iedb_kim2014) MHCflurry download.
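
As a sketch of the prediction-time combination described above (the geometric mean of the ensemble members' IC50 predictions; the function below is illustrative and not part of the MHCflurry API):

```
import numpy as np

def combine_ensemble_ic50s(per_model_ic50s):
    # Geometric mean: average the log IC50s, then exponentiate.
    return float(np.exp(np.mean(np.log(per_model_ic50s))))

print(combine_ensemble_ic50s([120.0, 95.0, 210.0]))  # ~133.8 nM
```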
The training script supports multi-node parallel execution using the [kubeface](https://github.com/hammerlab/kubeface) library.
To use kubeface, create a Google Storage bucket and pass it with the `--kubeface-storage` argument as shown below.
To generate this download we run:
```
./GENERATE.sh \
    --parallel-backend kubeface \
    --target-tasks 200 \
    --kubeface-backend kubernetes \
    --kubeface-storage gs://kubeface-tim \
    --kubeface-worker-image hammerlab/mhcflurry-misc:latest \
    --kubeface-kubernetes-task-resources-memory-mb 10000 \
    --kubeface-worker-path-prefix venv-py3/bin \
    --kubeface-max-simultaneous-tasks 200 \
    --kubeface-speculation-max-reruns 3
```
To debug locally:
```
./GENERATE.sh \
    --parallel-backend local-threads \
    --target-tasks 1
```
# Class1 allele-specific ensemble models
To generate the report, run:
```
time jupyter-nbconvert report.ipynb \
--execute \
--ExecutePreprocessor.kernel_name=python \
--ExecutePreprocessor.timeout=60 \
--to html \
--stdout > report.html
```
import sys
from mhcflurry.class1_allele_specific_ensemble import HYPERPARAMETER_DEFAULTS
import json
models = HYPERPARAMETER_DEFAULTS.models_grid(
    impute=[False, True],
    activation=["tanh"],
    layer_sizes=[[12], [64], [128]],
    embedding_output_dim=[8, 32, 64],
    dropout_probability=[0, .1, .25],
    fraction_negative=[0, .1, .2],
    n_training_epochs=[250],
    # Imputation arguments
    impute_method=["mice"],
    imputer_args=[
        # Arguments specific to imputation method (mice)
        {"n_burn_in": 5, "n_imputations": 50, "n_nearest_columns": 25}
    ],
    impute_min_observations_per_peptide=[3],
    impute_min_observations_per_allele=[3])
sys.stderr.write("Models: %d\n" % len(models))
print(json.dumps(models, indent=4))
#!/bin/bash
if [[ $# -eq 0 ]] ; then
echo 'WARNING: This script is intended to be called with additional arguments to pass to mhcflurry-class1-allele-specific-cv-and-train'
echo 'At minimum you probably want to pass --dask-scheduler <IP:PORT> as training many models on one node is extremely slow.'
fi
set -e
set -x
DOWNLOAD_NAME=models_class1_allele_specific_single
SCRATCH_DIR=/tmp/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
# pip freeze
# git rev-parse HEAD
# git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
mkdir models
cp $SCRIPT_DIR/models.py $SCRIPT_DIR/imputer.json .
python models.py > models.json
time mhcflurry-class1-allele-specific-cv-and-train \
--model-architectures models.json \
--imputer-description imputer.json \
--train-data "$(mhcflurry-downloads path data_combined_iedb_kim2014)/combined_human_class1_dataset.csv" \
--min-samples-per-allele 200 \
--out-cv-results cv.csv \
--out-production-results production.csv \
--out-models models \
--verbose \
"$@"
cp $SCRIPT_ABSOLUTE_PATH .
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
# Class I allele-specific models (single)
This download contains trained MHC Class I allele-specific MHCflurry models. The training data used is in the [data_combined_iedb_kim2014](../data_combined_iedb_kim2014) MHCflurry download. We first select network hyperparameters for each allele individually using cross validation over the models enumerated in [models.py](models.py). The best hyperparameter settings are selected by averaging AUC (at the 500 nM cutoff), F1, and Kendall's Tau over the training folds. We then train the production models over the full training set using the selected hyperparameters.
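
A minimal sketch of the fold-level selection score described above, averaging AUC (computed at the 500 nM binder cutoff), F1, and Kendall's Tau. This is illustrative scikit-learn/SciPy code rather than the training pipeline itself, and the exact metric conventions (e.g. using negated IC50 as the ranking score) are assumptions:

```
import numpy as np
from scipy.stats import kendalltau
from sklearn.metrics import f1_score, roc_auc_score

def fold_selection_score(true_ic50, pred_ic50, cutoff=500.0):
    true_ic50 = np.asarray(true_ic50, dtype=float)
    pred_ic50 = np.asarray(pred_ic50, dtype=float)
    true_binder = true_ic50 <= cutoff
    pred_binder = pred_ic50 <= cutoff
    auc = roc_auc_score(true_binder, -pred_ic50)  # lower IC50 = stronger binder
    f1 = f1_score(true_binder, pred_binder)
    tau = kendalltau(true_ic50, pred_ic50).correlation
    return float(np.mean([auc, f1, tau]))
```

Hyperparameter settings would then be ranked by this score averaged over the cross-validation folds.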
The training script supports multi-node parallel execution using the [kubeface](https://github.com/hammerlab/kubeface) library.
To use kubeface, create a Google Storage bucket and pass it with the `--storage-prefix` argument as shown below.
To generate this download we run:
```
./GENERATE.sh \
    --cv-folds-per-task 10 \
    --backend kubernetes \
    --storage-prefix gs://kubeface \
    --worker-image hammerlab/mhcflurry:latest \
    --kubernetes-task-resources-memory-mb 10000 \
    --worker-path-prefix venv-py3/bin \
    --max-simultaneous-tasks 200
```
{
    "imputation_method_name": "mice",
    "n_burn_in": 5,
    "n_imputations": 50,
    "n_nearest_columns": 25,
    "min_observations_per_peptide": 5,
    "min_observations_per_allele": 100
}
import sys
from mhcflurry.class1_allele_specific.train import HYPERPARAMETER_DEFAULTS
import json
models = HYPERPARAMETER_DEFAULTS.models_grid(
    impute=[False, True],
    activation=["tanh"],
    layer_sizes=[[12], [64], [128]],
    embedding_output_dim=[8, 32, 64],
    dropout_probability=[0, .1, .25],
    fraction_negative=[0, .1, .2],
    n_training_epochs=[250])
sys.stderr.write("Models: %d\n" % len(models))
print(json.dumps(models, indent=4))
#!/bin/bash
if [[ $# -eq 0 ]] ; then
echo 'WARNING: This script is intended to be called with additional arguments to pass to mhcflurry-class1-allele-specific-cv-and-train'
echo 'At minimum you probably want to pass --dask-scheduler <IP:PORT> as training many models on one node is extremely slow.'
fi
set -e
set -x
DOWNLOAD_NAME=models_class1_allele_specific_single_kim2014_only
SCRATCH_DIR=/tmp/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git rev-parse HEAD
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
mkdir models
cp $SCRIPT_DIR/models.py $SCRIPT_DIR/imputer.json .
python models.py > models.json
time mhcflurry-class1-allele-specific-cv-and-train \
--model-architectures models.json \
--imputer-description imputer.json \
--train-data "$(mhcflurry-downloads path data_kim2014)/bdata.2009.mhci.public.1.txt" \
--test-data "$(mhcflurry-downloads path data_kim2014)/bdata.2013.mhci.public.blind.1.txt" \
--min-samples-per-allele 50 \
--out-cv-results cv.csv \
--out-production-results production.csv \
--out-models models \
--verbose \
"$@"
cp $SCRIPT_ABSOLUTE_PATH .
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
# Class I allele-specific models (single) trained and tested on the Kim 2014 dataset
This is a reimplementation of the analysis in [Predicting Peptide-MHC Binding Affinities With Imputed Training Data](http://biorxiv.org/content/early/2016/05/22/054775).
{
    "imputation_method_name": "mice",
    "n_burn_in": 5,
    "n_imputations": 50,
    "n_nearest_columns": 25,
    "min_observations_per_peptide": 2,
    "min_observations_per_allele": 2
}
import sys
from mhcflurry.class1_allele_specific.train import HYPERPARAMETER_DEFAULTS
import json
models = HYPERPARAMETER_DEFAULTS.models_grid(
    # impute=[False, True],
    impute=[False],
    activation=["tanh"],
    layer_sizes=[[12], [64], [128]],
    embedding_output_dim=[8, 32, 64],
    dropout_probability=[0, .1, .25],
    # fraction_negative=[0, .1, .2],
    n_training_epochs=[250])
sys.stderr.write("Models: %d\n" % len(models))
print(json.dumps(models, indent=4))
@@ -12,24 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .class1_allele_specific.class1_binding_predictor import (
from .class1_affinity_prediction.class1_binding_predictor import (
Class1BindingPredictor)
from .prediction import predict
from .affinity_measurement_dataset import AffinityMeasurementDataset
from .class1_allele_specific_ensemble import Class1EnsembleMultiAllelePredictor
from .class1_allele_specific import Class1SingleModelMultiAllelePredictor
from .measurement_collection import MeasurementCollection
from . import parallelism
from .class1_affinity_prediction.multi_allele_predictor_ensemble import (
MultiAllelePredictorEnsemble)
__version__ = "0.2.0"
__all__ = [
    "Class1BindingPredictor",
    "predict",
    "parallelism",
    "AffinityMeasurementDataset",
    "Class1EnsembleMultiAllelePredictor",
    "Class1SingleModelMultiAllelePredictor",
    "MeasurementCollection",
    "MultiAllelePredictorEnsemble",
    "__version__",
]