diff --git a/experiments/allele_similarities.py b/experiments/allele_similarities.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc3def736b3c995a5e228190337e353ac653ca98
--- /dev/null
+++ b/experiments/allele_similarities.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Helper functions for computing pairwise similarities between alleles
+"""
+
+import numpy as np
+import seaborn
+from fancyimpute import NuclearNormMinimization
+
+from common import matrix_to_dictionary, curry_dictionary
+
+
+def compute_similarities_for_single_allele_from_peptide_overlap(
+        allele_name,
+        allele_to_peptide_to_affinity,
+        allele_name_length=None,
+        min_weight=1.0):
+    """
+    Parameters
+    ----------
+    allele_name : str
+
+    allele_to_peptide_to_affinity : dict
+        Dictionary from allele names to a dictionary of (peptide -> affinity)
+
+    Returns three dictionaries, mapping alleles to
+        - similarity
+        - number of overlapping peptides
+        - total weight of overlapping affinities
+    """
+    sims = {}
+    overlaps = {}
+    weights = {}
+    dataset_a = allele_to_peptide_to_affinity[allele_name]
+    peptide_set_a = set(dataset_a.keys())
+    for other_allele, dataset_b in allele_to_peptide_to_affinity.items():
+        if allele_name_length and len(other_allele) != allele_name_length:
+            continue
+        peptide_set_b = set(dataset_b.keys())
+        intersection = peptide_set_a.intersection(peptide_set_b)
+        overlaps[other_allele] = len(intersection)
+        total = 0.0
+        weight = 0.0
+        for peptide in intersection:
+            ya = dataset_a[peptide]
+            yb = dataset_b[peptide]
+            minval = min(ya, yb)
+            maxval = max(ya, yb)
+            total += minval
+            weight += maxval
+        weights[other_allele] = weight
+        if weight > min_weight:
+            sims[other_allele] = total / weight
+    return sims, overlaps, weights
+
+
+def compute_partial_similarities_from_peptide_overlap(
+        allele_to_peptide_to_affinity,
+        min_weight=1.0,
+        allele_name_length=None):
+    """
+    Determine similarity between pairs of alleles by examining
+    affinity values for overlapping peptides.
Returns curried dictionaries + mapping allele -> allele -> float, where the final value is one of: + - similarity between alleles + - # overlapping peptides + - sum weight of overlapping affinities + """ + sims = {} + overlaps = {} + weights = {} + for allele_name in allele_to_peptide_to_affinity.keys(): + if allele_name_length and len(allele_name) != allele_name_length: + continue + curr_sims, curr_overlaps, curr_weights = \ + compute_similarities_for_single_allele_from_peptide_overlap( + allele_name=allele_name, + allele_to_peptide_to_affinity=allele_to_peptide_to_affinity, + allele_name_length=allele_name_length, + min_weight=min_weight) + sims[allele_name] = curr_sims + overlaps[allele_name] = curr_overlaps + weights[allele_name] = curr_weights + return sims, overlaps, weights + + +def build_incomplete_similarity_matrix( + allele_to_peptide_to_affinity, + curried_sims_dict): + allele_list = list(sorted(allele_to_peptide_to_affinity.keys())) + allele_order = { + allele_name: i + for (i, allele_name) in enumerate(allele_list) + } + n_alleles = len(allele_list) + sims_incomplete_matrix = np.zeros((n_alleles, n_alleles), dtype=float) + + for a, ai in allele_order.items(): + for b, bi in allele_order.items(): + # if allele pair is missing from similarities dict then + # fill its slot with NaN, indicating missing data + similarity = curried_sims_dict.get(a, {}).get(b, np.nan) + sims_incomplete_matrix[(ai, bi)] = similarity + return allele_list, allele_order, sims_incomplete_matrix + + +def save_heatmap(matrix, allele_names, filename): + seaborn.set_context("paper") + + with seaborn.plotting_context(font_scale=1): + figure = seaborn.plt.figure(figsize=(20, 18)) + ax = figure.add_axes() + heatmap = seaborn.heatmap( + data=matrix, + xticklabels=allele_names, + yticklabels=allele_names, + linewidths=0, + annot=False, + ax=ax, + fmt=".2g") + heatmap.set_xticklabels(labels=allele_names, rotation=45) + heatmap.set_yticklabels(labels=allele_names, rotation=45) + figure.savefig(filename) + + +def fill_in_similarities( + curried_raw_sims_dict, + allele_to_peptide_to_affinity, + raw_sims_heatmap_path=None, + complete_sims_heatmap_path=None, + curried_overlap_weights=None, + scalar_error_tolerance=0.0001): + """ + Given an incomplete dictionary of pairwise allele similarities and + a dictionary of binding data, generate the completed similarities + """ + + allele_list, allele_order, sims_matrix = build_incomplete_similarity_matrix( + allele_to_peptide_to_affinity, + curried_sims_dict=curried_raw_sims_dict) + + missing = np.isnan(sims_matrix) + + if curried_overlap_weights: + error_tolerance = np.ones_like(sims_matrix) + for allele_a, a_dict in curried_overlap_weights.items(): + for allele_b, weight in a_dict.items(): + i = allele_order[allele_a] + j = allele_order[allele_b] + error_tolerance[i, j] = 2.0 ** -weight + print( + "-- Error tolerance distribution: min %f, max %f, median %f" % ( + error_tolerance[~missing].min(), + error_tolerance[~missing].max(), + np.median(error_tolerance[~missing]))) + else: + error_tolerance = scalar_error_tolerance + + if raw_sims_heatmap_path: + save_heatmap( + sims_matrix, + allele_list, + raw_sims_heatmap_path) + + print("Completing %s similarities matrix with %d missing entries" % ( + sims_matrix.shape, + missing.sum())) + + solver = NuclearNormMinimization( + require_symmetric_solution=True, + min_value=0.0, + max_value=1.0, + error_tolerance=error_tolerance) + + sims_complete_matrix = solver.complete(sims_matrix) + + if complete_sims_heatmap_path: + save_heatmap( + 
sims_complete_matrix, + allele_list, + complete_sims_heatmap_path) + + complete_sims_dict = curry_dictionary( + matrix_to_dictionary( + sims=sims_complete_matrix, + allele_list=allele_list)) + return complete_sims_dict + + +def save_csv(filename, sims, overlap_counts, overlap_weights): + """ + Save pairwise allele similarities in CSV file. + + Assumes the dictionaries sims, overlap_counts, and overlap_weights are + 'curried', meaning that their keys are single allele names which map + to nested dictionaries from other allele names to numeric values. + As a type signature, this means the dictionaries are of the form + (allele -> allele -> number) + and not + (allele, allele) -> number + """ + with open(filename, "w") as f: + f.write("allele_A,allele_B,similarity,count,weight\n") + for (a, a_row) in sorted(sims.items()): + a_counts = overlap_counts.get(a, {}) + a_weights = overlap_weights.get(a, {}) + for (b, similarity) in sorted(a_row.items()): + count = a_counts.get(b, 0) + weight = a_weights.get(b, 0.0) + f.write("%s,%s,%0.4f,%d,%0.4f\n" % ( + a, + b, + similarity, + count, + weight)) + + +def compute_allele_similarities(allele_to_peptide_to_affinity, min_weight=0.1): + """ + Compute pairwise allele similarities from binding data, + complete similarity matrix for alleles which lack sufficient data, + returns: + - dictionary of allele similarities + - dictionary of # overlapping peptides between alleles + - dictionary of "weight" of overlapping peptides between alleles + """ + assert isinstance(allele_to_peptide_to_affinity, dict), \ + "Wrong type, expected dict but got %s" % ( + type(allele_to_peptide_to_affinity),) + raw_sims_dict, overlap_counts, overlap_weights = \ + compute_partial_similarities_from_peptide_overlap( + allele_to_peptide_to_affinity=allele_to_peptide_to_affinity, + min_weight=min_weight) + + complete_sims_dict = fill_in_similarities( + curried_raw_sims_dict=raw_sims_dict, + allele_to_peptide_to_affinity=allele_to_peptide_to_affinity, + curried_overlap_weights=overlap_weights) + + return complete_sims_dict, overlap_counts, overlap_weights diff --git a/experiments/best-synthetic-data-hyperparams.py b/experiments/best-synthetic-data-hyperparams.py new file mode 100755 index 0000000000000000000000000000000000000000..e1c2c7eac0cc150ef6146b1ccaf4cb4301812490 --- /dev/null +++ b/experiments/best-synthetic-data-hyperparams.py @@ -0,0 +1,317 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Find best smoothing coefficient for creating synthetic data. The smoothing +coefficient adds artifical weight toward low binding affinities, which dominate +in cases where no similar allele has values for some peptide. + +We're determining "best" as most predictive of already known binding affinities, +averaged across all given alleles. 
+"""
+
+import argparse
+from collections import defaultdict
+
+from mhcflurry.data import load_allele_dicts
+from scipy import stats
+import numpy as np
+import sklearn.metrics
+from sklearn.cross_validation import KFold
+
+from dataset_paths import PETERS2009_CSV_PATH
+from synthetic_data import (
+    synthesize_affinities_for_single_allele,
+    create_reverse_lookup_from_simple_dicts
+)
+from allele_similarities import compute_allele_similarities
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    "--binding-data-csv",
+    default=PETERS2009_CSV_PATH)
+
+parser.add_argument(
+    "--max-ic50",
+    default=50000.0,
+    type=float)
+
+parser.add_argument(
+    "--only-human",
+    default=False,
+    action="store_true")
+
+parser.add_argument(
+    "--smoothing-coefs",
+    default=[0.05, 0.025, 0.01, 0.005, 0.0025, 0.001, 0.0005, 0.0001, 0],
+    type=lambda s: [float(si.strip()) for si in s.split(",")],
+    help="Smoothing value used for peptides with low weight across alleles")
+
+parser.add_argument(
+    "--similarity-exponents",
+    default=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+    type=lambda s: [float(si.strip()) for si in s.split(",")],
+    help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent")
+
+
+def generate_cross_validation_datasets(
+        allele_to_peptide_to_affinity,
+        n_folds=4):
+    for allele, dataset in sorted(allele_to_peptide_to_affinity.items()):
+        peptides = list(dataset.keys())
+        affinities = list(dataset.values())
+        n_samples = len(peptides)
+        print("Generating similarities for folds of %s data (n=%d)" % (
+            allele,
+            n_samples))
+
+        if n_samples < n_folds:
+            print("Too few samples (%d) for %d-fold cross-validation" % (
+                n_samples,
+                n_folds))
+            continue
+
+        kfold_iterator = enumerate(
+            KFold(n_samples, n_folds=n_folds, shuffle=True))
+
+        for fold_number, (train_indices, test_indices) in kfold_iterator:
+            train_peptides = [peptides[i] for i in train_indices]
+            train_affinities = [affinities[i] for i in train_indices]
+            test_peptide_set = set([peptides[i] for i in test_indices])
+            # copy the affinity data for all alleles other than this one
+            fold_affinity_dict = {
+                allele_key: affinity_dict
+                for (allele_key, affinity_dict)
+                in allele_to_peptide_to_affinity.items()
+                if allele_key != allele
+            }
+            # include an affinity dictionary for this allele which
+            # only uses training data
+            fold_affinity_dict[allele] = {
+                train_peptide: train_affinity
+                for (train_peptide, train_affinity)
+                in zip(train_peptides, train_affinities)
+            }
+            allele_similarities, overlap_counts, overlap_weights = \
+                compute_allele_similarities(
+                    allele_to_peptide_to_affinity=fold_affinity_dict,
+                    min_weight=0.1)
+            this_allele_similarities = allele_similarities[allele]
+
+            yield (
+                allele,
+                dataset,
+                fold_number,
+                this_allele_similarities,
+                test_peptide_set
+            )
+
+
+def evaluate_synthetic_data(
+        allele_to_peptide_to_affinity,
+        smoothing_coef,
+        exponent,
+        max_ic50,
+        n_folds=4):
+    """
+    Use cross-validation over entries of each allele to determine the predictive
+    accuracy of data synthesis on known affinities.
+ """ + + peptide_to_affinities = create_reverse_lookup_from_simple_dicts( + allele_to_peptide_to_affinity) + + taus = defaultdict(list) + f1_scores = defaultdict(list) + aucs = defaultdict(list) + + for (allele, allele_dataset, fold_num, fold_sims, fold_test_peptides) in \ + generate_cross_validation_datasets( + allele_to_peptide_to_affinity, + n_folds=n_folds): + # create a peptide -> list[(allele, affinity, weight)] dictionary + # restricted only to the peptides for which we want to test accuracy + test_reverse_lookup = { + peptide: triplets + for (peptide, triplets) in peptide_to_affinities.items() + if peptide in fold_test_peptides + } + + synthetic_values = synthesize_affinities_for_single_allele( + similarities=fold_sims, + peptide_to_affinities=test_reverse_lookup, + smoothing=smoothing_coef, + exponent=exponent, + exclude_alleles=[allele]) + + synthetic_peptide_set = set(synthetic_values.keys()) + # set of peptides for which we have both true and synthetic + # affinity values + combined_peptide_set = synthetic_peptide_set.intersection( + fold_test_peptides) + + combined_peptide_list = list(sorted(combined_peptide_set)) + + if len(combined_peptide_list) < 2: + print("Too few peptides in combined set %s for fold %d/%d of %s" % ( + combined_peptide_list, + fold_num + 1, + n_folds, + allele)) + continue + + synthetic_affinity_list = [ + synthetic_values[peptide] + for peptide in combined_peptide_list + ] + + true_affinity_list = [ + allele_to_peptide_to_affinity[allele][peptide] + for peptide in combined_peptide_list + ] + assert len(true_affinity_list) == len(synthetic_affinity_list) + if all(x == true_affinity_list[0] for x in true_affinity_list): + continue + + tau, _ = stats.kendalltau( + synthetic_affinity_list, + true_affinity_list) + assert not np.isnan(tau) + print("==> %s (CV fold %d/%d) tau = %f" % ( + allele, + fold_num + 1, + n_folds, + tau)) + taus[allele].append(tau) + + true_ic50s = max_ic50 ** (1.0 - np.array(true_affinity_list)) + predicted_ic50s = max_ic50 ** (1.0 - np.array(synthetic_affinity_list)) + + true_binding_label = true_ic50s <= 500 + if true_binding_label.all() or not true_binding_label.any(): + # can't compute AUC or F1 without both negative and positive cases + continue + auc = sklearn.metrics.roc_auc_score( + true_binding_label, + synthetic_affinity_list) + print("==> %s (CV fold %d/%d) AUC = %f" % ( + allele, + fold_num + 1, + n_folds, + auc)) + aucs[allele].append(auc) + + predicted_binding_label = predicted_ic50s <= 500 + if predicted_binding_label.all() or not predicted_binding_label.any(): + # can't compute F1 without both positive and negative predictions + continue + f1_score = sklearn.metrics.f1_score( + true_binding_label, + predicted_binding_label) + print("==> %s (CV fold %d/%d) F1 = %f" % ( + allele, + fold_num + 1, + n_folds, + f1_score)) + f1_scores[allele].append(f1_score) + + return taus, aucs, f1_scores + + +if __name__ == "__main__": + args = parser.parse_args() + print(args) + + allele_to_peptide_to_affinity = load_allele_dicts( + args.binding_data_csv, + max_ic50=args.max_ic50, + only_human=args.only_human, + regression_output=True) + + n_binding_values = sum( + len(allele_dict) + for allele_dict in + allele_to_peptide_to_affinity.values() + ) + print("Loaded %d binding values for %d alleles" % ( + n_binding_values, len(allele_to_peptide_to_affinity))) + + results = {} + for smoothing_coef in args.smoothing_coefs: + for exponent in args.similarity_exponents: + taus, aucs, f1_scores = evaluate_synthetic_data( + 
allele_to_peptide_to_affinity=allele_to_peptide_to_affinity, + smoothing_coef=smoothing_coef, + exponent=exponent, + max_ic50=args.max_ic50) + allele_keys = list(sorted(taus.keys())) + tau_allele_means = [ + np.mean(taus[allele]) + for allele in allele_keys + if taus[allele] + ] + auc_allele_means = [ + np.mean(aucs[allele]) + for allele in allele_keys + if aucs[allele] + ] + + f1_score_means = [ + np.mean(f1_scores[allele]) + for allele in allele_keys + if f1_scores[allele] + ] + + median_tau = np.median(tau_allele_means) + median_f1 = np.median(f1_score_means) + median_auc = np.median(auc_allele_means) + print("\n\n::::::\n") + print( + "Exp=%f, Coef=%f, tau=%0.4f, AUC = %0.4f, F1 = %0.4f" % ( + exponent, + smoothing_coef, + median_tau, + median_auc, + median_f1)) + print("\n^^^^^^\n") + + scores = (median_tau, median_auc, median_f1) + results[(exponent, smoothing_coef)] = scores + + print("===") + + def combine_item_scores(x): + return x[1][0] * x[1][1] * x[1][2] + for ((best_exponent, best_coef), (median_tau, median_auc, median_f1)) in \ + sorted(results.items(), key=combine_item_scores): + print("-- exponent = %f, coef = %f (tau=%0.4f, AUC=%0.4f, F1=%0.4f)" % ( + best_exponent, + best_coef, + median_tau, + median_auc, + median_f1)) + + ((best_exponent, best_coef), (median_tau, median_auc, median_f1)) = max( + results.items(), + key=combine_item_scores) + print("Best exponent = %f, coef = %f (tau=%0.4f, AUC=%0.4f, F1=%0.4f)" % ( + best_exponent, + best_coef, + median_tau, + median_auc, + median_f1)) diff --git a/experiments/common.py b/experiments/common.py new file mode 100644 index 0000000000000000000000000000000000000000..95800f82d9b7e9c9bee9fa56c805f960d08725a4 --- /dev/null +++ b/experiments/common.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from collections import defaultdict +import pandas as pd + + +def curry_dictionary(key_pair_dict, default_value=0.0): + """ + Transform dictionary from pairs of keys to dict -> dict -> float + """ + result = defaultdict(dict) + for (a, b), value in key_pair_dict.items(): + result[a][b] = value + return result + + +def uncurry_dictionary(curried_dict): + """ + Transform dictionary from (key_a -> key_b -> float) to + (key_a, key_b) -> float + """ + result = {} + for a, a_dict in curried_dict.items(): + for b, value in a_dict.items(): + result[(a, b)] = value + return result + + +def matrix_to_dictionary(sims, allele_list): + sims_dict = {} + for i in range(sims.shape[0]): + a = allele_list[i] + for j in range(sims.shape[1]): + b = allele_list[j] + sims_dict[a, b] = sims[i, j] + return sims_dict + + +def load_csv_binding_data_as_dict( + csv_path, + mhc_column_name="mhc", + peptide_column_name="sequence", + ic50_column_name="ic50"): + """ + Given a path to a CSV file containing peptide-MHC binding data, + load it as a dictionary mapping alleles to a dictionary peptide->IC50 + """ + df = pd.read_csv(csv_path) + print("-- Read %d rows from %s" % (len(df), csv_path)) + return { + allele: { + peptide: ic50 + for (peptide, ic50) + in zip(group[peptide_column_name], group[ic50_column_name]) + } + for allele, group in df.groupby(mhc_column_name) + } diff --git a/experiments/compute-allele-similarities.py b/experiments/compute-allele-similarities.py old mode 100644 new mode 100755 index 290e4a7ab50a9b2fb07d99b39f30c8e923d2bc5b..833bfd6a67513927cebb2d9a74081b24933d1750 --- a/experiments/compute-allele-similarities.py +++ b/experiments/compute-allele-similarities.py @@ -1,10 +1,29 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
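# Illustrative sketch (editorial example, assumes the common.py module above is
# importable): the two dictionary layouts these scripts convert between --
# pair-keyed ((allele_a, allele_b) -> value) and "curried"
# (allele_a -> allele_b -> value).
from common import curry_dictionary, uncurry_dictionary

pair_keyed = {
    ("HLA-A0201", "HLA-A0203"): 0.9,
    ("HLA-A0201", "HLA-B0702"): 0.2,
}
curried = curry_dictionary(pair_keyed)
assert curried["HLA-A0201"]["HLA-A0203"] == 0.9
assert uncurry_dictionary(curried) == pair_keyed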
+ import argparse -import fancyimpute -import numpy as np import mhcflurry from dataset_paths import PETERS2009_CSV_PATH +from allele_similarities import ( + compute_partial_similarities_from_peptide_overlap, + fill_in_similarities, + save_csv +) parser = argparse.ArgumentParser() @@ -13,204 +32,96 @@ parser.add_argument( default=PETERS2009_CSV_PATH) parser.add_argument( - "--fill-missing-values", - action="store_true", - default=False) + "--min-overlap-weight", + default=0.05, + help="Minimum overlap weight between pair of alleles", + type=float) +parser.add_argument( + "--max-ic50", + default=50000.0, + type=float) parser.add_argument( - "--min-overlap-weight", - default=2.0, - help="Minimum overlap weight between pair of alleles") + "--only-human", + default=False, + action="store_true") parser.add_argument( - "--max-ic50", - default=50000.0) + "--raw-similarities-output-path", + help="CSV file which contains incomplete allele similarities") parser.add_argument( - "--output-csv", - required=True) - -def binary_encode(X, n_indices=20): - n_cols = X.shape[1] - X_encode = np.zeros( (len(X), n_indices * n_cols), dtype=float) - for i in range(len(X)): - for col_idx in range(n_cols): - X_encode[i, col_idx*n_indices + X[i, col_idx]] = True - return X_encode - -def prepare_incomplete_matrix(datasets_dict, prefix=None, allele_name_length=5): - if allele_name_length: - datasets_dict = {k:d for (k,d) in datasets_dict.items() if len(k) == allele_name_length} - if prefix: - datasets_dict = {k:d for (k,d) in datasets_dict.items() if k.startswith(prefix)} - if len(datasets_dict) == 0: - raise ValueError("No alleles matched criteria") - all_peptides = set([]) - allele_list = [] - for k,d in sorted(datasets_dict.items()): - print(k, len(d.peptides)) - allele_list.append(k) - for p in d.peptides: - all_peptides.add(p) - peptide_list = list(sorted(all_peptides)) - allele_order = {a:i for (i,a) in enumerate(allele_list)} - peptide_order = {p:i for (i,p) in enumerate(peptide_list)} - n_alleles = len(allele_order) - n_peptides = len(peptide_order) - incomplete_affinity_matrix = np.ones((n_alleles, n_peptides), dtype=float) * np.nan - for k,d in datasets_dict.items(): - if k not in allele_order: - continue - i = allele_order[k] - for p,y in zip(d.peptides, d.Y): - j = peptide_order[p] - incomplete_affinity_matrix[i,j] = y - print("Total # alleles = %d, peptides = %d" % (n_alleles, n_peptides)) - return incomplete_affinity_matrix, allele_order, peptide_order - -def get_extra_data(allele, train_datasets, expanded_predictions): - original_dataset = train_datasets[allele] - original_peptides = set(original_dataset.peptides) - expanded_allele_affinities = expanded_predictions[allele] - extra_affinities = {k: v for (k, v) in expanded_allele_affinities.items() if k not in original_peptides} - extra_peptides = list(extra_affinities.keys()) - extra_values = list(extra_affinities.values()) - extra_X = mhcflurry.data_helpers.index_encoding(extra_peptides, peptide_length=9) - extra_Y = np.array(extra_values) - return extra_X, extra_Y - -def WHO_KNOWS_WHAT(): - - - smoothing = 0.005 - exponent = 2 - all_predictions = defaultdict(dict) - for allele, allele_idx in allele_order.items(): - for peptide in peptide_order.keys(): - predictions = reverse_lookups[peptide] - total_value = 0.0 - total_denom = 0.0 - for (other_allele, y) in predictions: - key = (allele,other_allele) - if key not in completed_sims_dict: - continue - sim = completed_sims_dict[key] - weight = sim ** exponent - total_value += weight * y - total_denom += 
weight - if total_denom > 0.0: - all_predictions[allele][peptide] = total_value / (smoothing + total_denom) - - -def data_augmentation(X, Y, extra_X, extra_Y, - fraction=0.5, - niters=10, - extra_sample_weight=0.1, - nb_epoch=50, - nn=True, - hidden_layer_size=5): - n = len(Y) - aucs = [] - f1s = [] - n_originals = [] - for _ in range(niters): - mask = np.random.rand(n) <= fraction - X_train = X[mask] - X_test = X[~mask] - Y_train = Y[mask] - Y_test = Y[~mask] - test_ic50 = 20000 ** (1-Y_test) - test_label = test_ic50 <= 500 - if test_label.all() or not test_label.any(): - continue - n_original = mask.sum() - print("Keeping %d original training samples" % n_original) - X_train_combined = np.vstack([X_train, extra_X]) - Y_train_combined = np.concatenate([Y_train, extra_Y]) - print("Combined shape: %s" % (X_train_combined.shape,)) - assert len(X_train_combined) == len(Y_train_combined) - # initialize weights to count synthesized and actual data equally - # but weight on synthesized points will decay across training epochs - weight = np.ones(len(Y_train_combined)) - if nn: - model = mhcflurry.feedforward.make_embedding_network( - layer_sizes=[hidden_layer_size], - embedding_output_dim=10, - activation="tanh") - for i in range(nb_epoch): - # decay weight as training progresses - weight[n_original:] = extra_sample_weight if extra_sample_weight is not None else 1.0 / (i+1)**2 - model.fit( - X_train_combined, - Y_train_combined, - sample_weight=weight, - shuffle=True, - nb_epoch=1, - verbose=0) - pred = model.predict(X_test) - else: - model = sklearn.linear_model.Ridge(alpha=5) - X_train_combined_binary = binary_encode(X_train_combined) - X_test_binary = binary_encode(X_test) - model.fit(X_train_combined_binary, Y_train_combined, sample_weight=weight) - pred = model.predict(X_test_binary) - pred_ic50 = 20000 ** (1-pred) - pred_label = pred_ic50 <= 500 - mse = sklearn.metrics.mean_squared_error(Y_test, pred) - auc = sklearn.metrics.roc_auc_score(test_label, pred) - - if pred_label.all() or not pred_label.any(): - f1 = 0 - else: - f1 = sklearn.metrics.f1_score(test_label, pred_label) - print("MSE=%0.4f, AUC=%0.4f, F1=%0.4f" % (mse, auc, f1)) - n_originals.append(n_original) - aucs.append(auc) - f1s.append(f1) - return aucs, f1s, n_originals + "--complete-similarities-output-path", + help="CSV file which contains allele similarities after matrix completion") + +parser.add_argument( + "--raw-heatmap-output-path", + help="PNG file to save heatmap of incomplete similarities matrix") + +parser.add_argument( + "--complete-heatmap-output-path", + help="PNG file to save heatmap of complete similarities matrix") + + +def print_dataset_sizes(allele_groups): + print("---\nDataset Sizes\n---") + for (allele_name, g) in sorted(allele_groups.items()): + print("%s: total=%d, 8mer=%d, 9mer=%d, 10mer=%d, 11mer=%d" % ( + allele_name, + len(g), + sum(len(k) == 8 for k in g.keys()), + sum(len(k) == 9 for k in g.keys()), + sum(len(k) == 10 for k in g.keys()), + sum(len(k) == 11 for k in g.keys()), + )) + print("---") if __name__ == "__main__": args = parser.parse_args() - - datasets = mhcflurry.data_helpers.load_data( + print(args) + df, peptide_column_name = mhcflurry.data.load_dataframe( args.binding_data_csv, - binary_encoding=True, - max_ic50=args.max_ic50) - - sims = {} - overlaps = {} - weights = {} - for allele_name_a, da in train_datasets.items(): - if len(allele_name_a) != 5: - continue - y_dict_a = {peptide: y for (peptide,y) in zip(da.peptides, da.Y)} - seta = set(da.peptides) - for allele_name_b, db in 
train_datasets.items(): - if len(allele_name_b) != 5: - continue - y_dict_b = {peptide: y for (peptide,y) in zip(db.peptides, db.Y)} - setb = set(db.peptides) - intersection = seta.intersection(setb) - overlaps[a,b] = len(intersection) - total = 0.0 - weight = 0.0 - for peptide in intersection: - ya = y_dict_a[peptide] - yb = y_dict_b[peptide] - minval = min(ya, yb) - maxval = max(ya, yb) - total += minval - weight += maxval - weights[a,b] = weight - if weight > min_weight: - sims[allele_name_a, allele_name_b] = total / weight - else: - sims[allele_name_a, allele_name_b] = np.nan - - if args.fill_missing_values: - sims_array = np.zeros((n_alleles, n_alleles), dtype=float) - for i, a in enumerate(allele_names): - for j, b in enumerate(allele_names): - sims_array[i,j] = sims[(a,b)] + max_ic50=args.max_ic50, + only_human=args.only_human) + allele_groups = { + allele_name: { + row[peptide_column_name]: row["regression_output"] + for (_, row) in group.iterrows() + } + for (allele_name, group) in df.groupby("mhc") + } + print_dataset_sizes(allele_groups) + + raw_sims_dict, overlap_counts, overlap_weights = \ + compute_partial_similarities_from_peptide_overlap( + allele_groups, + min_weight=args.min_overlap_weight) + + if args.raw_similarities_output_path: + save_csv( + args.raw_similarities_output_path, + raw_sims_dict, + overlap_counts, + overlap_weights) + + complete_sims_dict = fill_in_similarities( + curried_raw_sims_dict=raw_sims_dict, + allele_to_peptide_to_affinity=allele_groups, + raw_sims_heatmap_path=args.raw_heatmap_output_path, + complete_sims_heatmap_path=args.complete_heatmap_output_path) + + n_total_entries = sum(len(row) for row in complete_sims_dict.values()) + n_original_entries = sum(len(row) for row in raw_sims_dict.values()) + print("-- Added %d/%d allele similarities" % ( + n_total_entries - n_original_entries, + n_total_entries + )) + + if args.complete_similarities_output_path: + save_csv( + args.complete_similarities_output_path, + complete_sims_dict, + overlap_counts, + overlap_weights) diff --git a/experiments/cross_validation_affinity_data.py b/experiments/cross_validation_affinity_data.py new file mode 100644 index 0000000000000000000000000000000000000000..d01fc6d93a30fdeb2fc974394fee3bfd31d58c60 --- /dev/null +++ b/experiments/cross_validation_affinity_data.py @@ -0,0 +1,68 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
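# A minimal sketch of the similarity pipeline used by the script above: compute
# partial similarities from overlapping peptides, then complete the missing
# pairs. Toy data and toy peptide names; assumes allele_similarities.py and its
# fancyimpute dependency are importable (NuclearNormMinimization may be slow).
from allele_similarities import (
    compute_partial_similarities_from_peptide_overlap,
    fill_in_similarities,
)

toy_affinities = {
    "HLA-A0201": {"SIINFEKLM": 0.9, "AAAWYLWEV": 0.7, "QLLPQWLTV": 0.2},
    "HLA-A0203": {"SIINFEKLM": 0.8, "AAAWYLWEV": 0.6},
    "HLA-B0702": {"QLLPQWLTV": 0.3},
}
raw_sims, counts, weights = compute_partial_similarities_from_peptide_overlap(
    toy_affinities, min_weight=0.1)
complete_sims = fill_in_similarities(
    curried_raw_sims_dict=raw_sims,
    allele_to_peptide_to_affinity=toy_affinities)
print(complete_sims["HLA-A0203"]["HLA-B0702"])  # imputed; no shared peptides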
+ +from sklearn.cross_validation import KFold + + +def generate_cross_validation_datasets( + allele_to_peptide_to_affinity, + n_folds=4): + for allele, dataset in sorted(allele_to_peptide_to_affinity.items()): + peptides = list(dataset.keys()) + affinities = list(dataset.values()) + n_samples = len(peptides) + print("Generating similarities for folds of %s data (n=%d)" % ( + allele, + n_samples)) + + if n_samples < n_folds: + print("Too few samples (%d) for %d-fold cross-validation" % ( + n_samples, + n_folds)) + continue + + kfold_iterator = enumerate( + KFold(n_samples, n_folds=n_folds, shuffle=True)) + + for fold_number, (train_indices, test_indices) in kfold_iterator: + train_peptides = [peptides[i] for i in train_indices] + train_affinities = [affinities[i] for i in train_indices] + test_peptide_set = set([peptides[i] for i in test_indices]) + # copy the affinity data for all alleles other than this one + fold_affinity_dict = { + allele_key: affinity_dict + for (allele_key, affinity_dict) + in allele_to_peptide_to_affinity.items() + if allele_key != allele + } + # include an affinity dictionary for this allele which + # only uses training data + fold_affinity_dict[allele] = { + train_peptide: train_affinity + for (train_peptide, train_affinity) + in zip(train_peptides, train_affinities) + } + allele_similarities, overlap_counts, overlap_weights = \ + compute_allele_similarities( + allele_to_peptide_to_affinity=fold_affinity_dict, + min_weight=0.1) + this_allele_similarities = allele_similarities[allele] + + yield ( + allele, + dataset, + fold_number, + this_allele_similarities, + test_peptide_set + ) \ No newline at end of file diff --git a/experiments/matrix-completion-accuracy.py b/experiments/matrix-completion-accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..0fef8576f2cc836411972aff280d25c829730430 --- /dev/null +++ b/experiments/matrix-completion-accuracy.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
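# A small sketch of the per-fold bookkeeping in generate_cross_validation_datasets
# above: every other allele keeps its full data, while the held-out allele is
# restricted to the training peptides of the current fold (toy data, a
# hypothetical fold split, no KFold call here).
toy_affinities = {
    "HLA-A0201": {"SIINFEKLM": 0.9, "AAAWYLWEV": 0.7, "QLLPQWLTV": 0.2},
    "HLA-B0702": {"QLLPQWLTV": 0.3},
}
held_out_allele = "HLA-A0201"
train_peptides = {"SIINFEKLM", "AAAWYLWEV"}  # hypothetical training fold
fold_affinity_dict = {
    allele: affinities
    for (allele, affinities) in toy_affinities.items()
    if allele != held_out_allele
}
fold_affinity_dict[held_out_allele] = {
    peptide: affinity
    for (peptide, affinity) in toy_affinities[held_out_allele].items()
    if peptide in train_peptides
}
assert "QLLPQWLTV" not in fold_affinity_dict[held_out_allele]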
+
+import argparse
+
+from fancyimpute import (
+    SoftImpute,
+    IterativeSVD,
+    SimilarityWeightedAveraging,
+    KNN,
+    MICE,
+    BiScaler,
+    SimpleFill,
+)
+import numpy as np
+import pandas as pd
+
+from dataset_paths import PETERS2009_CSV_PATH
+from score_set import ScoreSet
+from matrix_completion_helpers import (
+    load_data,
+    evaluate_predictions,
+    stratified_cross_validation
+)
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    "--binding-data-csv",
+    default=PETERS2009_CSV_PATH)
+
+parser.add_argument(
+    "--max-ic50",
+    default=50000.0,
+    type=float)
+
+parser.add_argument(
+    "--save-incomplete-affinity-matrix",
+    default=None,
+    help="Path to CSV which will contain the incomplete affinity matrix")
+
+parser.add_argument(
+    "--only-human",
+    default=False,
+    action="store_true")
+
+parser.add_argument(
+    "--output-file",
+    default="matrix-completion-accuracy-results.csv")
+
+parser.add_argument(
+    "--normalize-columns",
+    default=False,
+    action="store_true",
+    help="Center and rescale columns of affinity matrix")
+
+
+parser.add_argument(
+    "--normalize-rows",
+    default=False,
+    action="store_true",
+    help="Center and rescale rows of affinity matrix")
+
+parser.add_argument(
+    "--min-observations-per-peptide",
+    type=int,
+    default=2,
+    help="Drop peptide entries with fewer than this number of measured affinities")
+
+parser.add_argument(
+    "--n-folds",
+    default=10,
+    type=int,
+    help="Number of cross-validation folds")
+
+parser.add_argument(
+    "--verbose",
+    default=False,
+    action="store_true")
+
+
+def create_imputation_methods(
+        verbose=False,
+        clip_imputed_values=False,
+        knn_print_interval=20,
+        knn_params=[1, 3, 5],
+        softimpute_params=[1, 5, 10],
+        svd_params=[5, 10, 20]):
+    min_value = 0 if clip_imputed_values else None
+    max_value = 1 if clip_imputed_values else None
+    result_dict = {
+        "meanFill": SimpleFill("mean"),
+        "zeroFill": SimpleFill("zero"),
+        "mice": MICE(
+            n_burn_in=5,
+            n_imputations=25,
+            min_value=min_value,
+            max_value=max_value,
+            verbose=verbose),
+        "similarityWeightedAveraging": SimilarityWeightedAveraging(
+            orientation="columns",
+            verbose=verbose),
+    }
+    for threshold in softimpute_params:
+        result_dict["softImpute-%d" % threshold] = SoftImpute(
+            threshold,
+            verbose=verbose,
+            min_value=min_value,
+            max_value=max_value)
+    for rank in svd_params:
+        result_dict["svdImpute-%d" % rank] = IterativeSVD(
+            rank,
+            verbose=verbose,
+            min_value=min_value,
+            max_value=max_value)
+    for k in knn_params:
+        result_dict["knnImpute-%d" % k] = KNN(
+            k,
+            orientation="columns",
+            verbose=verbose,
+            print_interval=knn_print_interval)
+    return result_dict
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    print(args)
+    imputation_methods = create_imputation_methods(
+        verbose=args.verbose,
+        clip_imputed_values=not (args.normalize_rows or args.normalize_columns),
+    )
+    print("Imputation methods: %s" % imputation_methods)
+
+    X, peptide_list, allele_list = load_data(
+        binding_data_csv=args.binding_data_csv,
+        max_ic50=args.max_ic50,
+        only_human=args.only_human,
+        min_observations_per_allele=args.n_folds,
+        min_observations_per_peptide=args.min_observations_per_peptide)
+    observed_mask = np.isfinite(X)
+    print("Loaded binding data, shape: %s, n_observed=%d/%d (%0.2f%%)" % (
+        X.shape,
+        observed_mask.sum(),
+        X.size,
+        100.0 * observed_mask.sum() / X.size))
+    if args.save_incomplete_affinity_matrix:
+        print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix)
+        df = pd.DataFrame(X, columns=allele_list, index=peptide_list)
+        df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide")
+
+    scores = ScoreSet()
+    kfold = stratified_cross_validation(
+        X=X,
+        observed_mask=observed_mask,
+        n_folds=args.n_folds)
+    for fold_idx, (X_fold, ok_mesh, test_coords, X_test_vector) in enumerate(kfold):
+        X_fold_reduced = X_fold[ok_mesh]
+        biscaler = BiScaler(
+            scale_rows=args.normalize_rows,
+            center_rows=args.normalize_rows,
+            scale_columns=args.normalize_columns,
+            center_columns=args.normalize_columns)
+        X_fold_reduced_scaled = biscaler.fit_transform(X=X_fold_reduced)
+        for (method_name, solver) in sorted(imputation_methods.items()):
+            print("CV fold %d/%d, running %s" % (
+                fold_idx + 1,
+                args.n_folds,
+                method_name))
+            X_completed_reduced_scaled = solver.complete(X_fold_reduced_scaled)
+            X_completed_reduced = biscaler.inverse_transform(
+                X_completed_reduced_scaled)
+            X_completed = np.zeros_like(X)
+            X_completed[ok_mesh] = X_completed_reduced
+            y_pred = X_completed[test_coords]
+            mae, tau, auc, f1_score = evaluate_predictions(
+                y_true=X_test_vector, y_pred=y_pred, max_ic50=args.max_ic50)
+            scores.add_many(
+                method_name,
+                mae=mae,
+                tau=tau,
+                f1_score=f1_score,
+                auc=auc)
+    scores.to_csv(args.output_file)
diff --git a/experiments/matrix-completion-hyperparameter-search.py b/experiments/matrix-completion-hyperparameter-search.py
new file mode 100755
index 0000000000000000000000000000000000000000..3d224bb8e7808cced4d5dd7324904a6633368f59
--- /dev/null
+++ b/experiments/matrix-completion-hyperparameter-search.py
@@ -0,0 +1,507 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
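# Worked sketch of the affinity transform used throughout these experiments:
# regression targets live in [0, 1] and map back to IC50 (nM) via
# ic50 = max_ic50 ** (1 - y), so y = 1 - log(ic50) / log(max_ic50).
# With max_ic50 = 50000, the 500 nM binder threshold corresponds to y ~= 0.426.
import numpy as np

max_ic50 = 50000.0
y = np.array([1.0, 0.4256, 0.0])
ic50 = max_ic50 ** (1.0 - y)  # -> approximately [1, 500, 50000] nM
assert np.allclose(1.0 - np.log(ic50) / np.log(max_ic50), y)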
+ +""" +For each allele, perform 5-fold cross validation to + (1) complete the matrix of affinities to generate auxilliary training data + (2) train multiple hyperparameter to find the best hyperparameters for + each network +""" + +import argparse +from collections import defaultdict + +from fancyimpute import MICE, KNN, SimpleFill, IterativeSVD, SoftImpute +import numpy as np +import pandas as pd +from mhcflurry.peptide_encoding import fixed_length_index_encoding +from mhcflurry.amino_acid import ( + common_amino_acids, + amino_acids_with_unknown, +) +from mhcflurry import Class1BindingPredictor +from sklearn.cross_validation import StratifiedKFold + +from dataset_paths import PETERS2009_CSV_PATH + +from matrix_completion_helpers import load_data, evaluate_predictions + +from arg_parsing import parse_int_list, parse_float_list, parse_string_list + +from matrix_completion_helpers import prune_data + +parser = argparse.ArgumentParser() + +parser.add_argument( + "--binding-data-csv", + default=PETERS2009_CSV_PATH) + +parser.add_argument( + "--max-ic50", + default=50000.0, + type=float) + +parser.add_argument( + "--save-incomplete-affinity-matrix", + default=None, + help="Path to CSV which will contains the incomplete affinity matrix") + +parser.add_argument( + "--only-human", + default=False, + action="store_true") + +parser.add_argument( + "--output-file", + default="hyperparameter-matrix-completion-results.csv") + +parser.add_argument( + "--n-folds", + default=5, + type=int, + help="Number of cross-validation folds") + +parser.add_argument( + "--max-training-peptide-length", + type=int, + default=15) + +parser.add_argument( + "--verbose", + default=False, + action="store_true") + +parser.add_argument( + "--embedding-dim-sizes", + default=[10, 20, 40], + type=parse_int_list) + +parser.add_argument( + "--first-hidden-layer-sizes", + default=[25, 50, 100], + type=parse_int_list) + + +parser.add_argument( + "--second-hidden-layer-sizes", + default=[0], + type=parse_int_list) + + +parser.add_argument( + "--dropouts", + default=[0.0], + type=parse_float_list) + +parser.add_argument( + "--activation-functions", + default=["tanh"], + type=lambda s: [si.strip() for si in s.split(",")]) + +parser.add_argument( + "--training-epochs", + type=int, + default=100) + +parser.add_argument( + "--impute", + default="mice", + help="Use {'mice', 'knn', 'meanfill', 'none', 'svt', 'svd'} for imputing pre-training data") + +parser.add_argument("--batch-size", type=int, default=64) + +parser.add_argument( + "--unknown-amino-acids", + default=False, + action="store_true", + help="When expanding 8mers into 9mers use 'X' instead of all possible AAs") + +parser.add_argument( + "--pretrain-only-9mers", + default=False, + action="store_true", + help="Exclude expanded samples from e.g. 
8mers or 10mers") + +parser.add_argument( + "--alleles", + default=None, + type=parse_string_list, + help="Only evaluate these alleles (by default use all in the dataset)") + +parser.add_argument("--min-samples-per-cv-fold", type=int, default=5) + + +def print_length_distribution(peptides, values, name): + print("Length distribution for %s (n=%d):" % (name, len(peptides))) + grouped_affinity_dict = defaultdict(list) + for p, v in zip(peptides, values): + grouped_affinity_dict[len(p)].append(v) + for length, affinities in sorted(grouped_affinity_dict.items()): + print("%d => %d (mean affinity %0.4f)" % ( + length, + len(affinities), + np.mean(affinities))) + + +def print_full_dataset_length_distribution( + pMHC_affinity_matrix, + observed_mask, + row_peptide_sequences): + row_idx, col_idx = np.where(observed_mask) + all_observed_peptides = [] + all_observed_values = [] + + for i, j in zip(row_idx, col_idx): + all_observed_peptides.append(row_peptide_sequences[i]) + all_observed_values.append(pMHC_affinity_matrix[i, j]) + + print_length_distribution(all_observed_peptides, all_observed_values, "ALL") + +if __name__ == "__main__": + args = parser.parse_args() + print(args) + + # initially load all the data, we're going to later prune it for matrix + # completion + pMHC_affinity_matrix, peptide_list, allele_list = load_data( + binding_data_csv=args.binding_data_csv, + max_ic50=args.max_ic50, + only_human=args.only_human, + min_observations_per_peptide=1, + min_observations_per_allele=1) + observed_mask = np.isfinite(pMHC_affinity_matrix) + print_full_dataset_length_distribution( + pMHC_affinity_matrix=pMHC_affinity_matrix, + observed_mask=observed_mask, + row_peptide_sequences=peptide_list) + + n_observed_per_allele = observed_mask.sum(axis=0) + + print("Loaded binding data, shape: %s, n_observed=%d/%d (%0.2f%%)" % ( + pMHC_affinity_matrix.shape, + observed_mask.sum(), + pMHC_affinity_matrix.size, + 100.0 * observed_mask.sum() / pMHC_affinity_matrix.size)) + + if args.verbose: + print("\n----\n# observed per allele:") + for allele_idx, allele in sorted(enumerate(allele_list), key=lambda x: x[1]): + print("--- %s: %d" % (allele, n_observed_per_allele[allele_idx])) + + if args.save_incomplete_affinity_matrix: + print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix) + df = pd.DataFrame(pMHC_affinity_matrix, columns=allele_list, index=peptide_list) + df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide") + + if args.output_file: + output_file = open(args.output_file, "w") + fields = [ + "allele", + "cv_fold", + "peptide_count", + "sample_count", + "dropout_probability", + "embedding_dim_size", + "hidden_layer_size1", + "hidden_layer_size2", + "activation", + "mae", + "tau", + "auc", + "f1" + ] + header_line = ",".join(fields) + output_file.write(header_line + "\n") + else: + output_file = None + if args.unknown_amino_acids: + index_encoding = amino_acids_with_unknown.index_encoding + else: + index_encoding = common_amino_acids.index_encoding + + impute_method_name = args.impute.lower() + if impute_method_name.startswith("mice"): + imputer = MICE(n_burn_in=5, n_imputations=20, min_value=0, max_value=1) + elif impute_method_name.startswith("knn"): + imputer = KNN(k=1, orientation="columns", print_interval=10) + elif impute_method_name.startswith("svd"): + imputer = IterativeSVD(rank=10) + elif impute_method_name.startswith("svt"): + imputer = SoftImpute() + elif impute_method_name.startswith("mean"): + imputer = SimpleFill("mean") + elif impute_method_name == 
"none": + imputer = None + else: + raise ValueError("Invalid imputation method: %s" % impute_method_name) + + predictors = {} + initial_weights = {} + initial_optimizer_states = {} + + def generate_predictors(): + """ + Generator of all possible predictors generated by combinations of + hyperparameters. + + To avoid recompiling and initializing a lot of neural networks + just save all the models up front along with their initial weight matrices + """ + for dropout in args.dropouts: + for embedding_dim_size in args.embedding_dim_sizes: + for hidden_layer_size1 in args.first_hidden_layer_sizes: + for hidden_layer_size2 in args.second_hidden_layer_sizes: + for activation in args.activation_functions: + key = "%0.2f,%d,%d,%d,%s" % ( + dropout, + embedding_dim_size, + hidden_layer_size1, + hidden_layer_size2, + activation + ) + if key not in predictors: + layer_sizes = [hidden_layer_size1] + if hidden_layer_size2: + layer_sizes.append(hidden_layer_size2) + + predictor = Class1BindingPredictor.from_hyperparameters( + embedding_output_dim=embedding_dim_size, + layer_sizes=layer_sizes, + activation=activation, + output_activation="sigmoid", + dropout_probability=dropout, + verbose=args.verbose, + allow_unknown_amino_acids=args.unknown_amino_acids, + embedding_input_dim=21 if args.unknown_amino_acids else 20, + ) + weights = predictor.model.get_weights() + opt_state = predictor.model.optimizer.get_state() + predictors[key] = predictor + initial_weights[key] = weights + initial_optimizer_states[key] = opt_state + else: + predictor = predictors[key] + weights = initial_weights[key] + opt_state = initial_optimizer_states[key] + # reset the predictor to its initial condition + predictor.model.set_weights([ + w.copy() for w in weights]) + predictor.model.optimizer.set_state([ + s.copy() for s in opt_state]) + yield (key, predictor) + + min_samples_per_allele = args.min_samples_per_cv_fold * args.n_folds + for allele_idx, allele in enumerate(allele_list): + if args.alleles and not any( + pattern in allele for pattern in args.alleles): + # if user specifies an allele list then skip anything which isn't included + continue + if n_observed_per_allele[allele_idx] < min_samples_per_allele: + print("-- Skipping allele %s which only has %d samples (need %d)" % ( + allele, + n_observed_per_allele[allele_idx], + min_samples_per_allele)) + continue + column = pMHC_affinity_matrix[:, allele_idx] + observed_row_indices = np.where(observed_mask[:, allele_idx])[0] + + # drop indices which are for peptides that are extremely long, + # like a 26-mer + observed_row_indices = np.array([ + i + for i in observed_row_indices + if len(peptide_list[i]) <= args.max_training_peptide_length + ]) + observed_values = column[observed_row_indices] + std = np.std(observed_values) + if std < 0.001: + print("-- Skipping allele %s due to insufficient variance in affinities" % ( + allele)) + continue + + print( + "Evaluating allele %s (n=%d)" % ( + allele, + len(observed_row_indices))) + ic50_values = args.max_ic50 ** (1 - observed_values) + + below_500nM = ic50_values <= 500 + if below_500nM.all(): + print("-- Skipping %s due to all negative predictions" % allele) + continue + if below_500nM.sum() < args.n_folds: + print("-- Skipping %s due to insufficient positive examples (%d)" % ( + allele, + below_500nM.sum())) + continue + + observed_peptides = [peptide_list[i] for i in observed_row_indices] + print_length_distribution(observed_peptides, observed_values, name=allele) + + # k-fold cross validation stratified to keep an even balance 
of low + # vs. high-binding peptides in each fold + for fold_idx, (train_indices, test_indices) in enumerate(StratifiedKFold( + y=below_500nM, + n_folds=args.n_folds, + shuffle=True)): + + train_peptides_fold = [observed_peptides[i] for i in train_indices] + train_values_fold = [observed_values[i] for i in train_indices] + train_dict_fold = {k: v for (k, v) in zip(train_peptides_fold, train_values_fold)} + + test_peptides = [observed_peptides[i] for i in test_indices] + test_values = [observed_values[i] for i in test_indices] + test_dict = {k: v for (k, v) in zip(test_peptides, test_values)} + if imputer is None: + X_pretrain = np.array([], dtype=int).reshape((0, 9)) + Y_pretrain = np.array([], dtype=float) + pretrain_sample_weights = np.array([], dtype=float) + else: + # drop the test peptides from the full matrix and then + # run completion on it to get synthesized affinities + pMHC_affinities_fold_incomplete = pMHC_affinity_matrix.copy() + test_indices_among_all_rows = observed_row_indices[test_indices] + pMHC_affinities_fold_incomplete[ + test_indices_among_all_rows, allele_idx] = np.nan + + # drop peptides with fewer than 2 measurements and alleles + # with fewer than 10 peptides + pMHC_affinities_fold_pruned, pruned_peptides_fold, pruned_alleles_fold = \ + prune_data( + X=pMHC_affinities_fold_incomplete, + peptide_list=peptide_list, + allele_list=allele_list, + min_observations_per_peptide=2, + min_observations_per_allele=args.min_samples_per_cv_fold) + + pMHC_affinities_fold_pruned_imputed = imputer.complete( + pMHC_affinities_fold_pruned) + + if args.pretrain_only_9mers: + # if we're expanding 8mers to >100 9mers + # then the pretraining dataset becomes + # unmanageably large, so let's only use 9mers for pre-training + + # In the future: we'll use an neural network that + # takes multiple input lengths + ninemer_mask = np.array([len(p) == 9 for p in pruned_peptides_fold]) + ninemer_indices = np.where(ninemer_mask)[0] + pMHC_affinities_fold_pruned_imputed = \ + pMHC_affinities_fold_pruned_imputed[ninemer_indices] + pruned_peptides_fold = [pruned_peptides_fold[i] for i in ninemer_indices] + + # since we may have dropped some of the columns in the completed + # pMHC matrix need to find which column corresponds to the + # same name we're currently predicting + if allele in pruned_alleles_fold: + pMHC_fold_allele_idx = pruned_alleles_fold.index(allele) + pMHC_allele_values = pMHC_affinities_fold_pruned_imputed[ + :, pMHC_fold_allele_idx] + if pMHC_allele_values.std() == 0: + print("WARNING: unexpected zero-variance in pretraining affinity values") + else: + print( + "WARNING: Couldn't find allele %s in pre-training matrix" % allele) + column = pMHC_affinities_fold_incomplete[:, allele_idx] + column_mean = np.nanmean(column) + print("-- Setting pre-training target value to nanmean = %f" % column_mean) + pMHC_allele_values = np.ones(len(pruned_peptides_fold)) * column_mean + + assert len(pruned_peptides_fold) == len(pMHC_allele_values) + + # dictionary mapping peptides to imputed affinity values + pretrain_dict = { + pi: yi + for (pi, yi) + in zip(pruned_peptides_fold, pMHC_allele_values) + } + + X_pretrain, pretrain_row_peptides, pretrain_counts = \ + fixed_length_index_encoding( + peptides=pruned_peptides_fold, + desired_length=9, + allow_unknown_amino_acids=args.unknown_amino_acids) + pretrain_sample_weights = 1.0 / np.array(pretrain_counts) + Y_pretrain = np.array( + [pretrain_dict[p] for p in pretrain_row_peptides]) + + X_train, training_row_peptides, training_counts = \ + 
fixed_length_index_encoding( + peptides=train_peptides_fold, + desired_length=9, + allow_unknown_amino_acids=args.unknown_amino_acids) + training_sample_weights = 1.0 / np.array(training_counts) + Y_train = np.array([train_dict_fold[p] for p in training_row_peptides]) + + n_pretrain = len(X_pretrain) + n_train_unique = len(train_peptides_fold) + n_train = len(X_train) + for key, predictor in generate_predictors(): + print("\n-----") + print( + ("Training CV fold %d/%d for %s " + "(# peptides = %d, # samples=%d, # pretrain samples=%d)" + " with parameters: %s") % ( + fold_idx + 1, + args.n_folds, + allele, + n_train_unique, + n_train, + n_pretrain, + key)) + print("-----") + predictor.fit( + X=X_train, + Y=Y_train, + sample_weights=training_sample_weights, + X_pretrain=X_pretrain, + Y_pretrain=Y_pretrain, + pretrain_sample_weights=pretrain_sample_weights, + n_training_epochs=args.training_epochs, + verbose=args.verbose, + batch_size=args.batch_size) + y_pred = predictor.predict_peptides(test_peptides) + if args.verbose: + print("-- mean(Y) = %f, mean(Y_pred) = %f" % ( + Y_train.mean(), + y_pred.mean())) + mae, tau, auc, f1_score = evaluate_predictions( + y_true=test_values, + y_pred=y_pred, + max_ic50=args.max_ic50) + + cv_fold_field_values = [ + allele, + str(fold_idx), + str(n_train_unique), + str(n_train), + ] + accuracy_field_values = [ + "%0.4f" % mae, + "%0.4f" % tau, + "%0.4f" % auc, + "%0.4f" % f1_score + ] + output_line = ( + ",".join(cv_fold_field_values) + + "," + key + + "," + ",".join(accuracy_field_values) + + "\n" + ) + print("CV fold result: %s" % output_line) + if output_file: + output_file.write(output_line) + output_file.flush() diff --git a/experiments/matrix_completion_helpers.py b/experiments/matrix_completion_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..bb6d803e7a509e6582d78822536c522b0a753e6a --- /dev/null +++ b/experiments/matrix_completion_helpers.py @@ -0,0 +1,205 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
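# Small sketch of the np.ix_ masking pattern used by the matrix-completion
# cross-validation (see stratified_cross_validation below and the accuracy
# script above): rows/columns left with no observations in a fold are dropped,
# and completed values are scattered back into the full-shape matrix.
import numpy as np

X = np.array([[0.9, np.nan],
              [np.nan, 0.2],
              [np.nan, np.nan]])
ok_rows = np.where(np.isfinite(X).sum(axis=1) > 0)[0]
ok_cols = np.where(np.isfinite(X).sum(axis=0) > 0)[0]
ok_mesh = np.ix_(ok_rows, ok_cols)
X_reduced = X[ok_mesh]          # 2x2 submatrix without the all-NaN row
X_back = np.full_like(X, np.nan)
X_back[ok_mesh] = X_reduced     # scatter back into the original shape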
+
+import logging
+
+from fancyimpute.dictionary_helpers import (
+    dense_matrix_from_nested_dictionary,
+    transpose_nested_dictionary,
+)
+from mhcflurry.data import load_allele_dicts
+import sklearn.metrics
+from sklearn.cross_validation import StratifiedKFold
+from scipy import stats
+import numpy as np
+
+
+def evaluate_predictions(
+        y_true,
+        y_pred,
+        max_ic50):
+    """
+    Return mean absolute error, Kendall tau, AUC and F1 score
+    """
+    if len(y_pred) != len(y_true):
+        raise ValueError("y_pred must have same number of elements as y_true")
+
+    mae = np.mean(np.abs(y_true - y_pred))
+
+    # if all predictions are the same
+    if (y_pred[0] == y_pred).all():
+        logging.warn("All predicted values were the same: %f" % y_pred[0])
+        return (mae, 0, 0.5, 0)
+
+    tau, _ = stats.kendalltau(
+        y_pred,
+        y_true)
+    if np.isnan(tau):
+        logging.warn("Value for Kendall's tau was NaN (n=%d)" % (len(y_true),))
+        tau = 0.0
+
+    true_ic50s = max_ic50 ** (1.0 - np.array(y_true))
+    predicted_ic50s = max_ic50 ** (1.0 - np.array(y_pred))
+
+    true_binding_label = true_ic50s <= 500
+    if true_binding_label.all():
+        logging.warn("All true labels are binders, can't compute AUC")
+        return (mae, tau, 0.5, 0)
+    elif not true_binding_label.any():
+        logging.warn("No true labels are binders, can't compute AUC")
+        # can't compute AUC or F1 without both negative and positive cases
+        return (mae, tau, 0.5, 0)
+
+    auc = sklearn.metrics.roc_auc_score(true_binding_label, y_pred)
+
+    predicted_binding_label = predicted_ic50s <= 500
+    if predicted_binding_label.all():
+        logging.warn("All predicted labels are binders, can't compute F1")
+        return (mae, tau, auc, 0)
+    elif not predicted_binding_label.any():
+        logging.warn("No predicted labels are binders, can't compute F1")
+        # can't compute F1 without both positive and negative predictions
+        return (mae, tau, auc, 0)
+
+    f1_score = sklearn.metrics.f1_score(
+        true_binding_label,
+        predicted_binding_label)
+
+    return mae, tau, auc, f1_score
+
+
+def prune_data(
+        X,
+        peptide_list,
+        allele_list,
+        min_observations_per_peptide=1,
+        min_observations_per_allele=1):
+    observed_mask = np.isfinite(X)
+    n_observed_per_peptide = observed_mask.sum(axis=1)
+    too_few_peptide_observations = (
+        n_observed_per_peptide < min_observations_per_peptide)
+    if too_few_peptide_observations.any():
+        drop_peptide_indices = np.where(too_few_peptide_observations)[0]
+        keep_peptide_indices = np.where(~too_few_peptide_observations)[0]
+        print("Dropping %d peptides with <%d observations" % (
+            len(drop_peptide_indices),
+            min_observations_per_peptide))
+        X = X[keep_peptide_indices]
+        observed_mask = observed_mask[keep_peptide_indices]
+        peptide_list = [peptide_list[i] for i in keep_peptide_indices]
+
+    n_observed_per_allele = observed_mask.sum(axis=0)
+    too_few_allele_observations = (
+        n_observed_per_allele < min_observations_per_allele)
+    if too_few_allele_observations.any():
+        drop_allele_indices = np.where(too_few_allele_observations)[0]
+        keep_allele_indices = np.where(~too_few_allele_observations)[0]
+
+        print("Dropping %d alleles with <%d observations: %s" % (
+            len(drop_allele_indices),
+            min_observations_per_allele,
+            [allele_list[i] for i in drop_allele_indices]))
+        X = X[:, keep_allele_indices]
+        observed_mask = observed_mask[:, keep_allele_indices]
+        allele_list = [allele_list[i] for i in keep_allele_indices]
+    return X, peptide_list, allele_list
+
+
+def load_data(
+        binding_data_csv,
+        max_ic50,
+        only_human=False,
+        min_observations_per_allele=1,
+        min_observations_per_peptide=1):
allele_to_peptide_to_affinity = load_allele_dicts( + binding_data_csv, + max_ic50=max_ic50, + only_human=only_human, + regression_output=True) + peptide_to_allele_to_affinity = transpose_nested_dictionary( + allele_to_peptide_to_affinity) + n_binding_values = sum( + len(allele_dict) + for allele_dict in + allele_to_peptide_to_affinity.values() + ) + print("Loaded %d binding values for %d alleles" % ( + n_binding_values, + len(allele_to_peptide_to_affinity))) + X, peptide_list, allele_list = \ + dense_matrix_from_nested_dictionary(peptide_to_allele_to_affinity) + return prune_data( + X, + peptide_list, + allele_list, + min_observations_per_peptide=min_observations_per_peptide, + min_observations_per_allele=min_observations_per_allele) + + +def index_counts(indices): + max_index = indices.max() + counts = np.zeros(max_index + 1, dtype=int) + for index in indices: + counts[index] += 1 + return counts + + +def stratified_cross_validation(X, observed_mask, n_folds=10): + n_observed = observed_mask.sum() + + (observed_peptide_index, observed_allele_index) = np.where(observed_mask) + observed_indices = np.ravel_multi_index( + (observed_peptide_index, observed_allele_index), + dims=observed_mask.shape) + + assert len(observed_indices) == n_observed + + observed_allele_counts = observed_mask.sum(axis=0) + print("# observed per allele: %s" % (observed_allele_counts,)) + assert (index_counts(observed_allele_index) == observed_allele_counts).all() + + kfold = StratifiedKFold( + observed_allele_index, + n_folds=n_folds, + shuffle=True) + + for (_, indirect_test_indices) in kfold: + test_linear_indices = observed_indices[indirect_test_indices] + test_coords = np.unravel_index( + test_linear_indices, + dims=observed_mask.shape) + + test_allele_counts = index_counts(test_coords[1]) + allele_fractions = test_allele_counts / observed_allele_counts.astype(float) + print("Fraction of each allele in this CV fold: %s" % (allele_fractions,)) + + X_test_vector = X[test_coords] + + X_fold = X.copy() + X_fold[test_coords] = np.nan + + empty_row_mask = np.isfinite(X_fold).sum(axis=1) == 0 + ok_row_mask = ~empty_row_mask + ok_row_indices = np.where(ok_row_mask)[0] + + empty_col_mask = np.isfinite(X_fold).sum(axis=0) == 0 + ok_col_mask = ~empty_col_mask + ok_col_indices = np.where(ok_col_mask)[0] + + ok_mesh = np.ix_(ok_row_indices, ok_col_indices) + + print("Dropping %d empty rows, %d empty columns" % ( + empty_row_mask.sum(), + empty_col_mask.sum())) + yield (X_fold, ok_mesh, test_coords, X_test_vector) diff --git a/experiments/model-selection.py b/experiments/model-selection.py index c46d735d1255bce7d2c98fdc113e71473dd26632..df52f7e30918d7ca3f88e2fb64cf60ecf1419946 100755 --- a/experiments/model-selection.py +++ b/experiments/model-selection.py @@ -143,6 +143,8 @@ parser.add_argument( help="Comma separated list of optimization methods") + + if __name__ == "__main__": args = parser.parse_args() configs = generate_all_model_configs( diff --git a/experiments/synthesize-augmented-dataset.py b/experiments/synthesize-augmented-dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..10cf705f868415901a53c66ecc4bc6a42f4f86a4 --- /dev/null +++ b/experiments/synthesize-augmented-dataset.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict +import argparse + +import pandas as pd + +from dataset_paths import PETERS2009_CSV_PATH +from allele_similarities import compute_allele_similarities +from synthetic_data import ( + create_reverse_lookup_from_simple_dicts, + synthesize_affinities_for_all_alleles, + load_sims_dict +) + +from mhcflurry.data import load_allele_dicts + +parser = argparse.ArgumentParser() + +parser.add_argument( + "--binding-data-csv", + default=PETERS2009_CSV_PATH) + +parser.add_argument( + "--output-csv", + help="Path for CSV containing synthesized dataset of affinities", + required=True) + +parser.add_argument( + "--max-ic50", + default=50000.0, + type=float) + +parser.add_argument( + "--only-human", + default=False, + action="store_true") + +parser.add_argument( + "--allele-similarity-csv", + required=False) + +parser.add_argument( + "--smoothing-coef", + default=0.001, + type=float, + help="Smoothing value used for peptides with low weight across alleles") + +parser.add_argument( + "--similarity-exponent", + default=2.0, + type=float, + help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent") + + +if __name__ == "__main__": + args = parser.parse_args() + print(args) + + allele_to_peptide_to_affinity = load_allele_dicts( + args.binding_data_csv, + max_ic50=args.max_ic50, + only_human=args.only_human, + regression_output=True) + + print("Loaded binding data for %d alleles" % ( + len(allele_to_peptide_to_affinity),)) + + reverse_lookup = create_reverse_lookup_from_simple_dicts( + allele_to_peptide_to_affinity) + print("Created reverse lookup dictionary for %d peptides" % len(reverse_lookup)) + + if args.allele_similarity_csv: + sims_dict = load_sims_dict( + args.allele_similarity_csv, + allele_pair_keys=False) + allele_names = list(sims_dict.keys()) + print("Loaded similarities between %d alleles from %s" % ( + len(allele_names), + args.allele_similarity_csv)) + else: + sims_dict, _, _ = \ + compute_allele_similarities( + allele_to_peptide_to_affinity, + min_weight=0.1) + + synthetic_data = synthesize_affinities_for_all_alleles( + peptide_to_affinities=reverse_lookup, + pairwise_allele_similarities=sims_dict, + smoothing=args.smoothing_coef, + exponent=args.similarity_exponent, + allele_pair_keys=False) + + synthetic_data_dict = OrderedDict([ + ("mhc", []), + ("sequence", []), + ("ic50", []), + ]) + + for allele, allele_entries in sorted(synthetic_data.items()): + print("%s (n=%d)" % (allele, len(allele_entries))) + for (peptide, regression_value) in allele_entries.items(): + synthetic_data_dict["mhc"].append(allele) + synthetic_data_dict["sequence"].append(peptide) + ic50 = args.max_ic50 ** (1.0 - regression_value) + synthetic_data_dict["ic50"].append(ic50) + + synthetic_data_df = pd.DataFrame(synthetic_data_dict) + + print("Created dataset with %d synthesized pMHC affinities" % ( + len(synthetic_data_df),)) + + synthetic_data_df.to_csv(args.output_csv, index=False) diff --git a/experiments/synthetic_data.py b/experiments/synthetic_data.py new file mode 100644 index 
0000000000000000000000000000000000000000..130b8e2ff73fd0642871aedc9a6c31e156c0ceea --- /dev/null +++ b/experiments/synthetic_data.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Helper functions for creating synthetic pMHC affinities from allele similarities +and true binding data +""" + +from collections import defaultdict + +import pandas as pd + +from common import curry_dictionary + + +def load_sims_dict(csv_path, allele_pair_keys=True): + """ + Read in allele-to-allele similarities as DataFrame + """ + sims_df = pd.read_csv(csv_path) + # extract dictionary mapping pairs of alleles to similarity + sims_dict = { + (row["allele_A"], row["allele_B"]): row["similarity"] + for (_, row) + in sims_df.iterrows() + } + if not allele_pair_keys: + return curry_dictionary(sims_dict) + else: + return sims_dict + + +def synthesize_affinities_for_single_allele( + similarities, + peptide_to_affinities, + smoothing=0.005, + exponent=2.0, + exclude_alleles=[]): + """ + Parameters + ---------- + similarities : dict + Dictionary mapping allele names to their similarity to the + allele of interest. + + peptide_to_affinities : dict + Dictionary mapping peptide sequence to list of triplets containing + the following fields: + - allele + - log-scaled affinity normalized to 0.0 = no binding, 1.0 = strong + - sample weight + + smoothing : float + + exponent : float + + exclude_alleles : collection of allele names + Don't use these alleles for synthesizing new affinities + + Returns dictionary mapping peptide sequences to affinities. + """ + assert isinstance(similarities, dict), \ + "Wrong type for similarities: %s" % type(similarities) + results = {} + for peptide, affinities in peptide_to_affinities.items(): + total = 0.0 + denom = 0.0 + for entry in affinities: + + if len(entry) == 2: + (allele, y) = entry + sample_weight = 1.0 + else: + assert len(entry) == 3 + (allele, y, sample_weight) = entry + if allele in exclude_alleles: + continue + sim = similarities.get(allele, 0) + if sim == 0: + continue + combined_weight = sim ** exponent * sample_weight + total += combined_weight * y + denom += combined_weight + if denom > 0.0: + results[peptide] = total / (smoothing + denom) + return results + + +def create_reverse_lookup_from_allele_data_objects(allele_datasets): + """ + Given a dictionary mapping each allele name to an AlleleData object, + create reverse-lookup dictionary mapping each peptide to a list of triplets: + [(allele, regression_output, weight), ...] 
+ """ + peptide_affinities_dict = defaultdict(list) + for allele, dataset in allele_datasets.items(): + for peptide, y, weight in zip( + dataset.peptides, dataset.Y, dataset.weights): + entry = (allele, y, weight) + peptide_affinities_dict[peptide].append(entry) + return peptide_affinities_dict + + +def create_reverse_lookup_from_simple_dicts(affinities): + """ + Create a lookup table from peptides to lists of (allele, affinity, weight) + """ + reverse_lookup = defaultdict(list) + for allele, affinity_dict in affinities.items(): + for (peptide, affinity) in affinity_dict.items(): + reverse_lookup[peptide].append((allele, affinity, 1.0)) + return reverse_lookup + + +def synthesize_affinities_for_all_alleles( + peptide_to_affinities, + pairwise_allele_similarities, + allele_pair_keys=True, + smoothing=0.005, + exponent=2.0): + """ + peptide_to_affinities : dict + Maps each peptide to list of either + (allele, affinity) or (allele, affinity, weight) + + pairwise_allele_similarities : dict + Dictionary from allele -> allele -> value between 0..1 + + smoothing : float + + exponent : float + """ + if allele_pair_keys: + pairwise_allele_similarities = curry_dictionary( + pairwise_allele_similarities) + + all_predictions = {} + + allele_names = set(pairwise_allele_similarities.keys()) + for allele in allele_names: + all_predictions[allele] = synthesize_affinities_for_single_allele( + similarities=pairwise_allele_similarities[allele], + peptide_to_affinities=peptide_to_affinities, + smoothing=smoothing, + exponent=exponent) + return all_predictions diff --git a/experiments/train-models-with-synthetic-pretraining.py b/experiments/train-models-with-synthetic-pretraining.py new file mode 100644 index 0000000000000000000000000000000000000000..9d0d050e7b906b47488e32c12aea29a12125ac0c --- /dev/null +++ b/experiments/train-models-with-synthetic-pretraining.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python +# +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
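The synthesis rule in synthesize_affinities_for_single_allele above is a smoothed, similarity-weighted average: each observation of a peptide on a neighboring allele contributes sim ** exponent * sample_weight, and the smoothing term in the denominator pulls peptides with little supporting weight toward zero. A worked example with invented alleles and values:

# peptide observed on two other alleles: (allele, regression value, weight)
observations = [("HLA-A0205", 0.8, 1.0), ("HLA-B5701", 0.3, 1.0)]
similarities = {"HLA-A0205": 0.9, "HLA-B5701": 0.2}
exponent, smoothing = 2.0, 0.001

total = 0.0
denom = 0.0
for allele, y, weight in observations:
    combined = similarities[allele] ** exponent * weight
    total += combined * y
    denom += combined

synthesized = total / (smoothing + denom)
# (0.81 * 0.8 + 0.04 * 0.3) / (0.001 + 0.85) is roughly 0.776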
+ +import argparse + +import numpy as np + +import mhcflurry +from mhcflurry.data import ( + load_allele_datasets, + encode_peptide_to_affinity_dict, +) + + +from arg_parsing import parse_int_list, parse_float_list +from dataset_paths import PETERS2009_CSV_PATH +from common import load_csv_binding_data_as_dict +from training_helpers import ( + kfold_cross_validation_of_model_params_with_synthetic_data, + average_prediction_scores_list +) + +parser = argparse.ArgumentParser() + +parser.add_argument( + "--binding-data-csv", + default=PETERS2009_CSV_PATH) + +parser.add_argument( + "--synthetic-data-csv", + required=True, + help="CSV with {mhc, sequence, ic50} columns of synthetic data") + +parser.add_argument( + "--max-ic50", + default=50000.0, + type=float) + +parser.add_argument( + "--embedding-dim-sizes", + default=[4, 8, 16, 32, 64], + type=parse_int_list) + +parser.add_argument( + "--hidden-layer-sizes", + default=[4, 8, 16, 32, 64, 128, 256], + type=parse_int_list) + +parser.add_argument( + "--dropouts", + default=[0.0, 0.25, 0.5], + type=parse_float_list) + +parser.add_argument( + "--activation-functions", + default=["tanh", "relu"], + type=lambda s: [si.strip() for si in s.split(",")]) + +parser.add_argument( + "--training-epochs", + type=int, + default=250) + +parser.add_argument("--log-file") + + +def get_extra_data(allele, train_datasets, expanded_predictions): + original_dataset = train_datasets[allele] + original_peptides = set(original_dataset.peptides) + expanded_allele_affinities = expanded_predictions[allele] + extra_affinities = { + k: v + for (k, v) in expanded_allele_affinities.items() + if k not in original_peptides + } + extra_peptides = list(extra_affinities.keys()) + extra_values = list(extra_affinities.values()) + extra_X = mhcflurry.data_helpers.index_encoding(extra_peptides, peptide_length=9) + extra_Y = np.array(extra_values) + return extra_X, extra_Y + + +def rescale_ic50(ic50, max_ic50): + log_ic50 = np.log(ic50) / np.log(args.max_ic50) + return max(0.0, min(1.0, 1.0 - log_ic50)) + + +def load_synthetic_data(csv_path, max_ic50): + synthetic_allele_to_peptide_to_ic50_dict = load_csv_binding_data_as_dict( + csv_path) + return { + allele: { + peptide: rescale_ic50(ic50, max_ic50=max_ic50) + for (peptide, ic50) + in allele_dict.items() + } + for (allele, allele_dict) + in synthetic_allele_to_peptide_to_ic50_dict.items() + } + + +if __name__ == "__main__": + args = parser.parse_args() + print(args) + + print("Loading binding data from %s" % args.binding_data_csv) + allele_datasets = load_allele_datasets( + args.binding_data_csv, + max_ic50=args.max_ic50, + only_human=False) + print("Loading synthetic data from %s" % args.synthetic_data_csv) + + synthetic_affinities = load_synthetic_data( + csv_path=args.synthetic_data_csv, + max_ic50=args.max_ic50) + + combined_allele_set = set(allele_datasets.keys()).union( + synthetic_affinities.keys()) + + combined_allele_list = list(sorted(combined_allele_set)) + + if args.log_file: + logfile = open(args.log_file, "w") + logfile.write("allele,n_samples,n_unique,n_synth,") + else: + logfile = None + logfile_needs_header = True + + for allele in combined_allele_list: + synthetic_allele_dict = synthetic_affinities[allele] + (_, _, Counts_synth, X_synth, _, Y_synth) = \ + encode_peptide_to_affinity_dict(synthetic_allele_dict) + synthetic_sample_weights = 1.0 / Counts_synth + scores = {} + source_peptides = allele_datasets[allele].original_peptides + X_original = allele_datasets[allele].X_index + Y_original = allele_datasets[allele].Y 
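        # per AlleleData (see mhcflurry/data.py): X_index is the index encoding
        # of the expanded 9-mers, Y holds regression targets in [0, 1], and
        # .weights (passed to cross-validation below) are 1 / substring_count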
+ n_samples = len(X_original) + n_unique_samples = len(set(source_peptides)) + n_synth_samples = len(synthetic_sample_weights) + for dropout in args.dropouts: + for embedding_dim_size in args.embedding_dim_sizes: + for hidden_layer_size in args.hidden_layer_sizes: + for activation in args.activation_functions: + params = ( + ("dropout_probability", dropout), + ("embedding_dim_size", embedding_dim_size), + ("hidden_layer_size", hidden_layer_size), + ("activation", activation), + ) + + print( + "Evaluating allele %s (n=%d, unique=%d): %s" % ( + allele, + n_samples, + n_unique_samples, + params)) + fold_scores = \ + kfold_cross_validation_of_model_params_with_synthetic_data( + X_original=X_original, + Y_original=Y_original, + source_peptides_original=source_peptides, + X_synth=X_synth, + Y_synth=Y_synth, + original_sample_weights=allele_datasets[allele].weights, + synthetic_sample_weights=synthetic_sample_weights, + n_training_epochs=args.training_epochs, + max_ic50=args.max_ic50, + **dict(params)) + average_scores = average_prediction_scores_list(fold_scores) + if logfile: + if logfile_needs_header: + for param_name, _ in params: + logfile.write("%s," % param_name) + for score_name in average_scores._fields: + logfile.write("%s," % score_name) + logfile.write("param_id\n") + logfile_needs_header = False + logfile.write("%s,%d,%d,%d," % ( + allele, + n_samples, + n_unique_samples, + n_synth_samples)) + for _, param_value in params: + logfile.write("%s," % param_value) + for score_name in average_scores._fields: + score_value = average_scores.__dict__[score_name] + logfile.write("%0.4f," % score_value) + logfile.write("%d\n" % len(scores)) + logfile.flush() + + scores[params] = average_scores + print("%s => %s" % (params, average_scores)) + if logfile: + logfile.close() diff --git a/experiments/training_helpers.py b/experiments/training_helpers.py index 0b9396247a7d4988084ba442c04e8da56cafff11..17773cdd9c7428ff5dc1370c7698c49b37b6bfc1 100644 --- a/experiments/training_helpers.py +++ b/experiments/training_helpers.py @@ -12,13 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
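For reference, the log file assembled in the loop above ends up with one row per allele and hyperparameter combination; a hypothetical header and row (values invented) would look roughly like:

# allele,n_samples,n_unique,n_synth,dropout_probability,embedding_dim_size,hidden_layer_size,activation,tau,auc,f1,accuracy,mae,param_id
# HLA-A0201,1385,1104,52341,0.25,32,64,relu,0.5120,0.9012,0.7844,0.8133,0.1132,0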
-from collections import OrderedDict +from collections import OrderedDict, namedtuple +import logging +from itertools import groupby import numpy as np -from sklearn.metrics import roc_auc_score +from sklearn import metrics +from sklearn.cross_validation import LabelKFold +from scipy import stats +import mhcflurry from mhcflurry.common import normalize_allele_name -from mhcflurry.data_helpers import indices_to_hotshot_encoding +from mhcflurry.data import indices_to_hotshot_encoding + + +PredictionScores = namedtuple("PredictionScores", "tau auc f1 accuracy mae") def encode_allele_dataset( @@ -81,27 +89,54 @@ def encode_allele_datasets( return (X_dict, Y_log_ic50_dict, ic50_dict) -def f1_score(true_label, label_pred): - tp = (true_label & label_pred).sum() - fp = ((~true_label) & label_pred).sum() - fn = (true_label & (~label_pred)).sum() - recall = (tp / float(tp + fn)) if (tp + fn) > 0 else 0.0 - precision = (tp / float(tp + fp)) if (tp + fp) > 0 else 0.0 - if (precision + recall) > 0: - return (2 * precision * recall) / (precision + recall) - else: - return 0.0 - - -def score_predictions(predicted_log_ic50, true_label, max_ic50): - """Computes accuracy, AUC, and F1 score of predictions""" - auc = roc_auc_score(true_label, predicted_log_ic50) - ic50_pred = max_ic50 ** (1.0 - predicted_log_ic50) - label_pred = (ic50_pred <= 500) - same_mask = true_label == label_pred +def score_predictions( + predicted_log_ic50, + true_log_ic50, + max_ic50): + """Computes Kendall-tau, AUC, F1 score, and accuracy of predictions + + Parameters + ---------- + predicted_log_ic50 : np.array + + true_log_ic50 : np.array + + max_ic50 : float + + Returns PredictionScores object with fields (tau, auc, f1, accuracy) + """ + mae = np.abs(predicted_log_ic50 - true_log_ic50).mean() + tau, _ = stats.kendalltau(predicted_log_ic50, true_log_ic50) + if np.isnan(tau): + logging.warn("Kendall tau was NaN!") + tau = 0.0 + + true_ic50s = max_ic50 ** (1.0 - np.array(true_log_ic50)) + predicted_ic50s = max_ic50 ** (1.0 - np.array(predicted_log_ic50)) + + true_binding_label = true_ic50s <= 500 + if true_binding_label.all() or not true_binding_label.any(): + logging.warn( + ("Can't compute AUC, F1, accuracy without both" + " negative and positive ground truth labels")) + return PredictionScores( + tau=tau, + auc=0.5, + f1=0.0, + accuracy=0.0, + mae=mae) + + auc = metrics.roc_auc_score(true_binding_label, predicted_log_ic50) + predicted_binding_label = predicted_ic50s <= 500 + f1_score = metrics.f1_score(true_binding_label, predicted_binding_label) + same_mask = true_binding_label == predicted_binding_label accuracy = np.mean(same_mask) - f1 = f1_score(true_label, label_pred) - return accuracy, auc, f1 + return PredictionScores( + tau=tau, + auc=auc, + f1=f1_score, + accuracy=accuracy, + mae=mae) def train_model_and_return_scores( @@ -109,7 +144,7 @@ def train_model_and_return_scores( X_train, log_ic50_train, X_test, - binder_label_test, + log_ic50_test, n_training_epochs, minibatch_size, max_ic50): @@ -120,8 +155,339 @@ def train_model_and_return_scores( verbose=0, batch_size=minibatch_size) pred = model.predict(X_test).flatten() - accuracy, auc, f1_score = score_predictions( + return score_predictions( predicted_log_ic50=pred, - true_label=binder_label_test, + true_log_ic50=log_ic50_test, max_ic50=max_ic50) - return (accuracy, auc, f1_score) + + +def train_model_with_synthetic_data( + model, + n_training_epochs, + X_original, + Y_original, + X_synth, + Y_synth, + original_sample_weights=None, + synthetic_sample_weights=None): + 
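    """
    Fit the model on the concatenation of real and synthetic samples.

    At each epoch the synthetic samples' weights are scaled by exp(-epoch);
    once their total contribution drops below 0.1% of the combined sample
    weight, training continues on the real data alone. With equal starting
    weight totals that happens after roughly seven epochs, since
    exp(-7) is about 0.0009.
    """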
n_actual_samples, n_actual_dims = X_original.shape + n_synth_samples, n_synth_dims = X_synth.shape + if original_sample_weights is None: + original_sample_weights = np.ones_like(Y_original) + + if synthetic_sample_weights is None: + synthetic_sample_weights = np.ones_like(Y_synth) + + total_synth_weights = synthetic_sample_weights.sum() + total_original_weights = original_sample_weights.sum() + print("Mean Y=%f, Y_synth=%f, weight=%f, weight_synth=%f" % ( + np.mean(Y_original), + np.mean(Y_synth), + np.mean(original_sample_weights), + np.mean(synthetic_sample_weights))) + + combined_weights = np.concatenate([ + original_sample_weights, + synthetic_sample_weights + ]) + + assert n_actual_dims == n_synth_dims, \ + "Mismatch between # of actual dims %d and synthetic dims %d" % ( + n_actual_dims, n_synth_dims) + + X_combined = np.vstack([X_original, X_synth]) + n_combined_samples = n_actual_samples + n_synth_samples + + assert X_combined.shape[0] == n_combined_samples, \ + "Expected %d samples but got data array with shape %s" % ( + n_actual_samples + n_synth_samples, X_combined.shape) + + assert len(combined_weights) == n_combined_samples, \ + "Expected combined_weights to have length %d but got shape = %s" % ( + n_combined_samples, + combined_weights.shape) + + Y_combined = np.concatenate([Y_original, Y_synth]) + assert Y_combined.min() >= 0, \ + "Y should not contain negative numbers! Y.min() = %f" % ( + Y_combined.min(),) + assert Y_combined.max() <= 1, \ + "Y should have max value 1.0, got Y.max() = %f" % ( + Y_combined.max(),) + + for epoch in range(n_training_epochs): + # weights for synthetic points can be shrunk as: + # ~ 1 / (1+epoch)**2 + # or + # e ** -epoch + decay_factor = np.exp(-epoch) + # if the contribution of synthetic samples is less than a + # thousandth of the actual data, then stop using it + synth_contribution = total_synth_weights * decay_factor + synth_fraction_contribution = synth_contribution / ( + synth_contribution + total_original_weights) + # only use synthetic data if it contributes at least 1/1000th of + # sample weight + use_synth_data = synth_fraction_contribution > 0.001 + if use_synth_data: + combined_weights[n_actual_samples:] = ( + synthetic_sample_weights * decay_factor) + model.fit( + X_combined, + Y_combined, + sample_weight=combined_weights, + nb_epoch=1, + verbose=0) + else: + model.fit( + X_original, + Y_original, + sample_weight=original_sample_weights, + nb_epoch=1, + verbose=0) + + Y_pred = model.predict(X_original) + training_mse = ((Y_original - Y_pred) ** 2).mean() + + synth_data_percent = ( + ((1.0 - synth_fraction_contribution) * 100) + if use_synth_data + else 100 + ) + real_data_percent = ( + (synth_fraction_contribution * 100) + if use_synth_data else 0 + ) + print( + ("-- Epoch %d/%d real data weight %0.2f%%," + " synth data weight %0.2f%%," + " Training MSE %0.4f") % ( + epoch + 1, + n_training_epochs, + real_data_percent, + synth_data_percent, + training_mse)) + + +def peptide_group_indices(original_peptide_sequences): + """ + Given a list of peptide sequences, some of which are identical, + generate a sequence of index lists for the occurrence of distinct + entries. 
+ """ + indices_and_sequences = enumerate(original_peptide_sequences) + for _, subiter in groupby(indices_and_sequences, lambda pair: pair[1]): + yield [idx for (idx, _) in subiter] + + +def collapse_peptide_group_affinities( + predictions, + true_values, + original_peptide_sequences, + combine_multiple_predictions_fn=np.median): + """ + Given predictions and true log-transformed IC50 values for 9mers which may + have been either elongated or shortened from peptide sequences of other + lengths, collapse the log-transformed IC50 values to a single value + per original peptide sequence. + """ + assert len(predictions) == len(true_values), \ + ("Expected predictions (%d elements) to have same length" + " as true values (%d elements)") % (len(predictions), len(true_values)) + assert len(predictions) == len(original_peptide_sequences), \ + ("Expected predictions (%d elements) to have same length" + " as peptide sequences (%d elements)") % ( + len(predictions), len(original_peptide_sequences)) + + collapsed_predictions = [] + collapsed_true_values = [] + for peptide_indices in peptide_group_indices(original_peptide_sequences): + if len(peptide_indices) == 1: + idx = peptide_indices[0] + collapsed_predictions.append(predictions[idx]) + collapsed_true_values.append(true_values[idx]) + else: + collapsed_predictions.append( + combine_multiple_predictions_fn(predictions[peptide_indices])) + collapsed_true_values.append( + combine_multiple_predictions_fn( + true_values[peptide_indices])) + return collapsed_predictions, collapsed_true_values + + +def kfold_cross_validation_of_model_fn_with_synthetic_data( + make_model_fn, + n_training_epochs, + max_ic50, + X_train, + Y_train, + source_peptides_train, + X_synth, + Y_synth, + original_sample_weights, + synthetic_sample_weights, + n_cross_validation_folds, + combine_multiple_predictions_fn=np.median, + random_seed=0): + """ + Given a function which generates fresh copies of a model, use k-fold + cross validation (stratified by the list of source peptide sequences) to + evaluate the predictive performance of this model type. + + Returns a list of pairs, the first element is a trained model and the second + element is a PredictionScores object with the following fields: + ("tau", "f1", "auc", "accuracy") + """ + assert len(X_train) == len(Y_train) + assert len(source_peptides_train) == len(X_train) + # randomly shuffle the training data first + indices = np.arange(len(X_train)) + np.random.seed(random_seed) + np.random.shuffle(indices) + X_train, Y_train, weights_train = \ + X_train[indices], Y_train[indices], original_sample_weights[indices] + source_peptides_train = [source_peptides_train[i] for i in indices] + + # we need to do cross-validation carefully since some of our training + # samples may be extracted from the same longer peptide. + # To avoid training vs. test contamination we do k-fold cross validation + # stratified by the "source" peptides of each sample. 
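    # Illustration (toy labels): with source peptides
    #   ["AAAWAAAAV", "AAAWAAAAV", "SIINFEKLM", "KLVVVGAGV", ...]
    # LabelKFold keeps both "AAAWAAAAV" rows on the same side of every split,
    # so a model is never tested on a 9-mer extracted from a peptide it was
    # trained on.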
+ cv_iterator = enumerate( + LabelKFold( + labels=source_peptides_train, + n_folds=n_cross_validation_folds)) + results = [] + for fold_number, (fold_train_index, fold_test_index) in cv_iterator: + model = make_model_fn() + X_train_fold, X_test_fold = \ + X_train[fold_train_index], X_train[fold_test_index] + weights_train_fold = weights_train[fold_train_index] + Y_train_fold, Y_test_fold = \ + Y_train[fold_train_index], Y_train[fold_test_index] + peptides_train_fold = [ + source_peptides_train[i] + for i in fold_train_index + ] + peptides_test_fold = [ + source_peptides_train[i] + for i in fold_test_index + ] + + train_model_with_synthetic_data( + model=model, + n_training_epochs=n_training_epochs, + X_original=X_train_fold, + Y_original=Y_train_fold, + X_synth=X_synth, + Y_synth=Y_synth, + original_sample_weights=weights_train_fold, + synthetic_sample_weights=synthetic_sample_weights) + Y_pred = model.predict(X_test_fold).flatten() + # since some 'epitopes' are actually substrings of longer peptides + # we need to coalesce those predictions by calling the function + # `combine_multiple_predictions_fn` on the set of predictions + # associated with one longer peptide + collapsed_predictions, collapsed_true_values = \ + collapse_peptide_group_affinities( + predictions=Y_pred, + true_values=Y_test_fold, + original_peptide_sequences=peptides_test_fold, + combine_multiple_predictions_fn=combine_multiple_predictions_fn) + scores = score_predictions( + predicted_log_ic50=collapsed_predictions, + true_log_ic50=collapsed_true_values, + max_ic50=max_ic50) + print("::: CV fold %d/%d (n_samples=%d, n_unique=%d): %s\n\n" % ( + fold_number + 1, + n_cross_validation_folds, + len(peptides_train_fold), + len(set(peptides_train_fold)), + scores)) + results.append((model, scores)) + return results + + +def average_prediction_scores_list(model_scores_list): + """ + Given a list of (model, PredictionScores) pairs, + returns a single PredictionScores object whose fields are + the average across the list. + """ + n = len(model_scores_list) + return PredictionScores( + tau=sum(x.tau for (_, x) in model_scores_list) / n, + auc=sum(x.auc for (_, x) in model_scores_list) / n, + f1=sum(x.f1 for (_, x) in model_scores_list) / n, + accuracy=sum(x.accuracy for (_, x) in model_scores_list) / n) + + +def kfold_cross_validation_of_model_params_with_synthetic_data( + X_original, + Y_original, + source_peptides_original, + X_synth=None, + Y_synth=None, + original_sample_weights=None, + synthetic_sample_weights=None, + n_training_epochs=150, + n_cross_validation_folds=5, + embedding_dim_size=16, + hidden_layer_size=50, + dropout_probability=0.0, + activation="tanh", + max_ic50=50000.0): + """ + Returns: + - PredictionScores object with average Kendall-tau, AUC, F1, + and accuracy across all cross-validation folds. 
+ - list of all (model, PredictionScores) objects from cross-validation + folds + """ + n_unique_samples = len(set(source_peptides_original)) + + if n_cross_validation_folds > n_unique_samples: + n_cross_validation_folds = n_unique_samples + + if original_sample_weights is None: + original_sample_weights = np.ones(len(X_original), dtype=float) + + # if synthetic data is missing then make an empty array so all the + # downstream code still works + if X_synth is None: + X_synth = np.array([[]]) + + # if only the target values for synthetic data are missing then + # treat all synthetic entries as negative examples + if Y_synth is None: + Y_synth = np.zeros(len(X_synth), dtype=float) + + if synthetic_sample_weights is None: + synthetic_sample_weights = np.ones(len(X_synth), dtype=float) + + def make_model(): + return mhcflurry.feedforward.make_embedding_network( + peptide_length=9, + embedding_input_dim=20, + embedding_output_dim=embedding_dim_size, + layer_sizes=[hidden_layer_size], + activation=activation, + init="lecun_uniform", + loss="mse", + output_activation="sigmoid", + dropout_probability=dropout_probability, + optimizer=None, + learning_rate=0.001) + + model_scores_list = kfold_cross_validation_of_model_fn_with_synthetic_data( + make_model_fn=make_model, + n_training_epochs=n_training_epochs, + max_ic50=max_ic50, + X_train=X_original, + Y_train=Y_original, + source_peptides_train=source_peptides_original, + X_synth=X_synth, + Y_synth=Y_synth, + original_sample_weights=original_sample_weights, + synthetic_sample_weights=synthetic_sample_weights, + n_cross_validation_folds=n_cross_validation_folds) + return model_scores_list diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py index c1a908084f56c3d68d61ac181d2bc2d0ac642b35..4ab1d343a2cb6462e2e50506c00e727a731fd75f 100644 --- a/mhcflurry/__init__.py +++ b/mhcflurry/__init__.py @@ -13,15 +13,19 @@ # limitations under the License. from . import paths -from . import data_helpers +from . import data from . import feedforward from . import common -from .mhc1_binding_predictor import Mhc1BindingPredictor +from . import peptide_encoding +from . import amino_acid +from .class1_binding_predictor import Class1BindingPredictor __all__ = [ "paths", - "data_helpers", + "data", "feedforward", + "peptide_encoding", + "amino_acid", "common", - "Mhc1BindingPredictor" -] \ No newline at end of file + "Class1BindingPredictor" +] diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py index c7607d4d712ff1929056e99836ef2aa06696a44e..f40225612f6373c6609b3aed60fa23de14c2c90b 100644 --- a/mhcflurry/amino_acid.py +++ b/mhcflurry/amino_acid.py @@ -12,7 +12,81 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-amino_acids = { +import numpy as np + + +class Alphabet(object): + """ + Used to track the order of amino acids used for peptide encodings + """ + + def __init__(self, **kwargs): + self.letters_to_names = {} + for (k, v) in kwargs.items(): + self.add(k, v) + + def add(self, letter, name): + assert letter not in self.letters_to_names + assert len(letter) == 1 + self.letters_to_names[letter] = name + + def letters(self): + return list(sorted(self.letters_to_names.keys())) + + def names(self): + return [self.letters_to_names[k] for k in self.letters()] + + def index_dict(self): + return {c: i for (i, c) in enumerate(self.letters())} + + def copy(self): + return Alphabet(**self.letters_to_names) + + def __getitem__(self, k): + return self.letters_to_names[k] + + def __setitem__(self, k, v): + self.add(k, v) + + def index_encoding_list(self, peptides): + index_dict = self.index_dict() + return [ + [index_dict[amino_acid] for amino_acid in peptide] + for peptide in peptides + ] + + def index_encoding(self, peptides, peptide_length): + """ + Encode a set of equal length peptides as a matrix of their + amino acid indices. + """ + X = np.zeros((len(peptides), peptide_length), dtype=int) + index_dict = self.index_dict() + for i, peptide in enumerate(peptides): + for j, amino_acid in enumerate(peptide): + X[i, j] = index_dict[amino_acid] + return X + + def hotshot_encoding( + self, + peptides, + peptide_length): + """ + Encode a set of equal length peptides as a binary matrix, + where each letter is transformed into a length 20 vector with a single + element that is 1 (and the others are 0). + """ + shape = (len(peptides), peptide_length, 20) + index_dict = self.index_dict() + X = np.zeros(shape, dtype=bool) + for i, peptide in enumerate(peptides): + for j, amino_acid in enumerate(peptide): + k = index_dict[amino_acid] + X[i, j, k] = 1 + return X + + +common_amino_acids = Alphabet(**{ "A": "Alanine", "R": "Arginine", "N": "Asparagine", @@ -33,8 +107,9 @@ amino_acids = { "W": "Tryptophan", "Y": "Tyrosine", "V": "Valine", -} +}) +common_amino_acid_letters = common_amino_acids.letters() -amino_acid_letters = list(sorted(amino_acids.keys())) -amino_acid_names = [amino_acids[k] for k in amino_acid_letters] -amino_acid_letter_indices = {c: i for (i, c) in enumerate(amino_acid_letters)} +amino_acids_with_unknown = common_amino_acids.copy() +amino_acids_with_unknown.add("X", "Unknown") +amino_acids_with_unknown_letters = amino_acids_with_unknown.letters() diff --git a/mhcflurry/class1.py b/mhcflurry/class1.py deleted file mode 100644 index aa099d624d9c0a77ae461e4bab94a68063209f5c..0000000000000000000000000000000000000000 --- a/mhcflurry/class1.py +++ /dev/null @@ -1,24 +0,0 @@ -import pandas as pd -import os - -from .paths import CLASS1_MODEL_DIRECTORY -from .mhc1_binding_predictor import Mhc1BindingPredictor - -def predict(alleles, peptides): - allele_dataframes = [] - for allele in alleles: - model = Mhc1BindingPredictor(allele=allele) - df = model.predict_peptides(peptides) - allele_dataframes.append(df) - return pd.concat(allele_dataframes) - -def supported_alleles(): - alleles = [] - for filename in os.listdir(CLASS1_MODEL_DIRECTORY): - allele = filename.replace(".hdf", "") - if len(allele) < 5: - # skipping serotype names like A2 or B7 - continue - allele = "HLA-%s*%s:%s" % (allele[0], allele[1:3], allele[3:]) - alleles.append(allele) - return alleles diff --git a/mhcflurry/class1_binding_predictor.py b/mhcflurry/class1_binding_predictor.py new file mode 100644 index 
0000000000000000000000000000000000000000..7ad212f9cd4a42f1e62fb0aa05650340df204776 --- /dev/null +++ b/mhcflurry/class1_binding_predictor.py @@ -0,0 +1,424 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Allele specific MHC Class I binding affinity predictor +""" +from __future__ import ( + print_function, + division, + absolute_import, +) +import logging +from os import listdir, remove +from os.path import exists, join + +import json + +import numpy as np +from keras.models import model_from_config + + +from .common import normalize_allele_name +from .paths import CLASS1_MODEL_DIRECTORY +from .feedforward import make_embedding_network +from .predictor_base import PredictorBase + +from .class1_allele_specific_hyperparameters import MAX_IC50 + +_allele_predictor_cache = {} + +class Class1BindingPredictor(PredictorBase): + def __init__( + self, + model, + name=None, + max_ic50=MAX_IC50, + allow_unknown_amino_acids=False, + verbose=False): + PredictorBase.__init__( + self, + name=name, + max_ic50=max_ic50, + allow_unknown_amino_acids=allow_unknown_amino_acids, + verbose=verbose) + self.name = name + self.model = model + + @classmethod + def from_disk( + cls, + model_json_path, + weights_hdf_path=None, + name=None, + max_ic50=MAX_IC50): + """ + Load model from stored JSON representation of network and + (optionally) load weights from HDF5 file. + """ + if not exists(model_json_path): + raise ValueError("Model file %s (name = %s) not found" % ( + model_json_path, name,)) + + with open(model_json_path, "r") as f: + config_dict = json.load(f) + + model = model_from_config(config_dict) + + if weights_hdf_path: + if not exists(weights_hdf_path): + raise ValueError( + "Missing model weights file %s (name = %s)" % ( + weights_hdf_path, name)) + + model.load_weights(weights_hdf_path) + + return cls.__init__( + model=model, + max_ic50=max_ic50, + name=name) + + @classmethod + def from_hyperparameters( + cls, + name=None, + max_ic50=MAX_IC50, + peptide_length=9, + embedding_input_dim=20, + embedding_output_dim=20, + layer_sizes=[50], + activation="tanh", + init="lecun_uniform", + loss="mse", + output_activation="sigmoid", + dropout_probability=0, + learning_rate=0.001, + **kwargs): + """ + Create untrained predictor with the given hyperparameters. + """ + model = make_embedding_network( + peptide_length=peptide_length, + embedding_input_dim=embedding_input_dim, + embedding_output_dim=embedding_output_dim, + layer_sizes=layer_sizes, + activation=activation, + init=init, + loss=loss, + output_activation=output_activation, + dropout_probability=dropout_probability, + learning_rate=learning_rate) + return cls( + name=name, + max_ic50=max_ic50, + model=model, + **kwargs) + + def _combine_training_data( + self, + X, + Y, + sample_weights, + X_pretrain, + Y_pretrain, + pretrain_sample_weights, + verbose=False): + """ + Make sure the shapes of given training and pre-training data + conform with each other. 
Then concatenate the pre-training and the + training data. + + Returns (X_combined, Y_combined, weights_combined, n_pretrain_samples) + """ + X = np.asarray(X) + Y = np.asarray(Y) + + if verbose: + print("X.shape = %s" % (X.shape,)) + + n_samples, n_dims = X.shape + + if len(Y) != n_samples: + raise ValueError("Mismatch between len(X) = %d and len(Y) = %d" % ( + n_samples, len(Y))) + + if Y.min() < 0: + raise ValueError("Minimum value of Y can't be negative, got %f" % ( + Y.min())) + if Y.max() > 1: + raise ValueError("Maximum value of Y can't be greater than 1, got %f" % ( + Y.max())) + + if sample_weights is None: + sample_weights = np.ones_like(Y) + else: + sample_weights = np.asarray(sample_weights) + + if len(sample_weights) != n_samples: + raise ValueError( + "Length of sample_weights (%d) doesn't match number of samples (%d)" % ( + len(sample_weights), + n_samples)) + + if X_pretrain is None or Y_pretrain is None: + X_pretrain = np.empty((0, n_dims), dtype=X.dtype) + Y_pretrain = np.empty((0,), dtype=Y.dtype) + else: + X_pretrain = np.asarray(X_pretrain) + Y_pretrain = np.asarray(Y_pretrain) + + if verbose: + print("X_pretrain.shape = %s" % (X_pretrain.shape,)) + n_pretrain_samples, n_pretrain_dims = X_pretrain.shape + if n_pretrain_dims != n_dims: + raise ValueError( + "# of dims for pretraining data (%d) doesn't match X.shape[1] = %d" % ( + n_pretrain_dims, n_dims)) + + if len(Y_pretrain) != n_pretrain_samples: + raise ValueError( + "length of Y_pretrain (%d) != length of X_pretrain (%d)" % ( + len(Y_pretrain), + len(X_pretrain))) + + if len(Y_pretrain) > 0 and Y_pretrain.min() < 0: + raise ValueError("Minimum value of Y_pretrain can't be negative, got %f" % ( + Y.min())) + if len(Y_pretrain) > 0 and Y_pretrain.max() > 1: + raise ValueError("Maximum value of Y_pretrain can't be greater than 1, got %f" % ( + Y.max())) + + if pretrain_sample_weights is None: + pretrain_sample_weights = np.ones_like(Y_pretrain) + else: + pretrain_sample_weights = np.asarray(pretrain_sample_weights) + if verbose: + print("sample weights mean = %f, pretrain weights mean = %f" % ( + sample_weights.mean(), + pretrain_sample_weights.mean())) + X_combined = np.vstack([X_pretrain, X]) + Y_combined = np.concatenate([Y_pretrain, Y]) + combined_weights = np.concatenate([ + pretrain_sample_weights, + sample_weights, + ]) + return X_combined, Y_combined, combined_weights, n_pretrain_samples + + def fit( + self, + X, + Y, + sample_weights=None, + X_pretrain=None, + Y_pretrain=None, + pretrain_sample_weights=None, + n_training_epochs=200, + verbose=False, + batch_size=128): + """ + Train predictive model from index encoding of fixed length 9mer peptides. + + Parameters + ---------- + X : array + Training data with shape (n_samples, n_dims) + + Y : array + Training labels with shape (n_samples,) + + sample_weights : array + Weight of each training sample with shape (n_samples,) + + X_pretrain : array + Extra samples used for soft pretraining of the predictor, + should have same number of dimensions as X. During training the weights + of these samples will decay after each epoch. + + Y_pretrain : array + Labels for extra samples, shape + + pretrain_sample_weights : array + Initial weights for the rows of X_pretrain. If not specified then + initialized to ones. 
+ + n_training_epochs : int + + verbose : bool + + batch_size : int + """ + X_combined, Y_combined, combined_weights, n_pretrain = \ + self._combine_training_data( + X, Y, sample_weights, X_pretrain, Y_pretrain, pretrain_sample_weights, + verbose=verbose) + + total_pretrain_sample_weight = combined_weights[:n_pretrain].sum() + total_train_sample_weight = combined_weights[n_pretrain:].sum() + total_combined_sample_weight = ( + total_pretrain_sample_weight + total_train_sample_weight) + + if self.verbose: + print("-- Total pretrain weight = %f (%f%%), sample weight = %f (%f%%)" % ( + total_pretrain_sample_weight, + 100 * total_pretrain_sample_weight / total_combined_sample_weight, + total_train_sample_weight, + 100 * total_train_sample_weight / total_combined_sample_weight)) + + for epoch in range(n_training_epochs): + # weights for synthetic points can be shrunk as: + # ~ 1 / (1+epoch)**2 + # or + # e ** -epoch + decay_factor = np.exp(-epoch) + # if the contribution of synthetic samples is less than a + # thousandth of the actual data, then stop using it + pretrain_contribution = total_pretrain_sample_weight * decay_factor + pretrain_fraction_contribution = ( + pretrain_contribution / total_combined_sample_weight) + + use_pretrain_data = pretrain_fraction_contribution > 0.001 + + # only use synthetic data if it contributes at least 1/1000th of + # sample weight + if verbose: + + real_data_percent = ( + ((1.0 - pretrain_fraction_contribution) * 100) + if use_pretrain_data + else 100 + ) + pretrain_data_percent = ( + (pretrain_fraction_contribution * 100) + if use_pretrain_data else 0 + ) + print( + ("-- Epoch %d/%d decay=%f, real data weight=%0.2f%%," + " synth data weight=%0.2f%% (use_pretrain=%s)") % ( + epoch + 1, + n_training_epochs, + decay_factor, + real_data_percent, + pretrain_data_percent, + use_pretrain_data)) + + if use_pretrain_data: + combined_weights[:n_pretrain] *= decay_factor + self.model.fit( + X_combined, + Y_combined, + sample_weight=combined_weights, + nb_epoch=1, + verbose=0, + batch_size=batch_size) + else: + self.model.fit( + X_combined[n_pretrain:], + Y_combined[n_pretrain:], + sample_weight=combined_weights[n_pretrain:], + nb_epoch=1, + verbose=0, + batch_size=batch_size) + + def to_disk(self, model_json_path, weights_hdf_path, overwrite=False): + if exists(model_json_path) and overwrite: + logging.info( + "Removing existing model JSON file '%s'" % ( + model_json_path,)) + remove(model_json_path) + + if exists(model_json_path): + logging.warn( + "Model JSON file '%s' already exists" % (model_json_path,)) + else: + logging.info( + "Saving model file %s (name=%s)" % (model_json_path, self.name)) + with open(model_json_path, "w") as f: + f.write(self.model.to_json()) + + if exists(weights_hdf_path) and overwrite: + logging.info( + "Removing existing model weights HDF5 file '%s'" % ( + weights_hdf_path,)) + remove(weights_hdf_path) + + if exists(weights_hdf_path): + logging.warn( + "Model weights HDF5 file '%s' already exists" % ( + weights_hdf_path,)) + else: + logging.info( + "Saving model weights HDF5 file %s (name=%s)" % ( + weights_hdf_path, self.name)) + self.model.save_weights(weights_hdf_path) + + @classmethod + def from_allele_name( + cls, + allele_name, + model_directory=CLASS1_MODEL_DIRECTORY, + max_ic50=MAX_IC50): + if not exists(model_directory) or len(listdir(model_directory)) == 0: + raise ValueError( + "No MHC prediction models found in %s" % (model_directory,)) + + allele_name = normalize_allele_name(allele_name) + key = (allele_name, model_directory, 
max_ic50) + if key in _allele_predictor_cache: + return _allele_predictor_cache[key] + + if not exists(model_directory) or len(listdir(model_directory)) == 0: + raise ValueError( + "No MHC prediction models found in %s" % (model_directory,)) + + model_json_filename = allele_name + ".json" + model_json_path = join(model_directory, model_json_filename) + + weights_hdf_filename = allele_name + ".hdf" + weights_hdf_path = join(model_directory, weights_hdf_filename) + + predictor = cls.from_disk( + model_json_path=model_json_path, + weights_hdf_path=weights_hdf_path, + name=allele_name, + max_ic50=max_ic50) + + _allele_predictor_cache[key] = predictor + return predictor + + @classmethod + def supported_alleles(cls, model_directory=CLASS1_MODEL_DIRECTORY): + alleles_with_weights = set([]) + alleles_with_models = set([]) + for filename in listdir(model_directory): + if filename.endswith(".hdf"): + alleles_with_weights.add(filename.replace(".hdf", "")) + elif filename.endswith(".json"): + alleles_with_models.add(filename.replace(".json", "")) + alleles = alleles_with_models.intersection(alleles_with_weights) + return list(sorted(alleles)) + + def __repr__(self): + return "Class1BindingPredictor(name=%s, model=%s, max_ic50=%f)" % ( + self.name, + self.model, + self.max_ic50) + + def __str__(self): + return repr(self) + + def predict_encoded(self, X): + max_expected_index = 20 if self.allow_unknown_amino_acids else 19 + assert X.max() <= max_expected_index, \ + "Got index %d in peptide encoding" % (X.max(),) + return self.model.predict(X, verbose=False).flatten() diff --git a/mhcflurry/common.py b/mhcflurry/common.py index 9333786437dfd8a9b422d2277f3cd199dea1a3ce..6ab2f9fd64aa394bd0c2a95adbdcdbeb0d8b08b4 100644 --- a/mhcflurry/common.py +++ b/mhcflurry/common.py @@ -17,7 +17,6 @@ from __future__ import ( division, absolute_import, ) -from .amino_acid import amino_acid_letters def parse_int_list(s): @@ -27,35 +26,37 @@ def parse_int_list(s): def split_uppercase_sequences(s): return [part.strip().upper() for part in s.split(",")] +MHC_PREFIXES = [ + "HLA", + "H-2", + "Mamu", + "Patr", + "Gogo", + "ELA", +] -def normalize_allele_name(allele_name): + +def normalize_allele_name(allele_name, default_prefix="HLA"): """ - Only works for mouse, human, and rhesus monkey alleles. + Only works for a small number of species. TODO: use the same logic as mhctools for MHC name parsing. 
Possibly even worth its own small repo called something like "mhcnames" """ allele_name = allele_name.upper() - if allele_name.startswith("MAMU"): - prefix = "Mamu-" - elif allele_name.startswith("H-2") or allele_name.startswith("H2"): - prefix = "H-2-" - else: - prefix = "" # old school HLA-C serotypes look like "Cw" allele_name = allele_name.replace("CW", "C") - patterns = [ - "HLA-", - "H-2", - "H2", - "MAMU", - "-", - "*", - ":" - ] - for pattern in patterns: + + prefix = default_prefix + for candidate in MHC_PREFIXES: + if (allele_name.startswith(candidate.upper()) or + allele_name.startswith(candidate.replace("-", "").upper())): + prefix = candidate + allele_name = allele_name[len(prefix):] + break + for pattern in MHC_PREFIXES + ["-", "*", ":"]: allele_name = allele_name.replace(pattern, "") - return "%s%s" % (prefix, allele_name) + return "%s%s" % (prefix + "-" if prefix else "", allele_name) def split_allele_names(s): @@ -64,37 +65,3 @@ def split_allele_names(s): for part in s.split(",") ] - - -def expand_9mer_peptides(peptides, length): - """ - Expand non-9mer peptides using methods from - Accurate approximation method for prediction of class I MHC - affinities for peptides of length 8, 10 and 11 using prediction - tools trained on 9mers. - by Lundegaard et. al. - http://bioinformatics.oxfordjournals.org/content/24/11/1397 - """ - assert len(peptides) > 0 - if length < 8: - raise ValueError("Invalid peptide length: %d (%s)" % ( - length, peptides[0])) - elif length == 9: - return peptides - elif length == 8: - # extend each peptide by inserting every possible amino acid - # between base-1 positions 4-8 - return [ - peptide[:i] + extra_amino_acid + peptide[i:] - for peptide in peptides - for i in range(3, 8) - for extra_amino_acid in amino_acid_letters - ] - else: - # drop interior residues between base-1 positions 4 to last - n_skip = length - 9 - return [ - peptide[:i] + peptide[i + n_skip:] - for peptide in peptides - for i in range(3, 9) - ] diff --git a/mhcflurry/data.py b/mhcflurry/data.py new file mode 100644 index 0000000000000000000000000000000000000000..895edecdfe9db51affda0233f1defc59549eaafe --- /dev/null +++ b/mhcflurry/data.py @@ -0,0 +1,398 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
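A few examples of what the rewritten normalize_allele_name above produces, worked through from the code (a sketch, not an exhaustive specification of supported names):

from mhcflurry.common import normalize_allele_name

normalize_allele_name("HLA-A*02:01")   # -> "HLA-A0201"
normalize_allele_name("A*02:01")       # -> "HLA-A0201" (default prefix applied)
normalize_allele_name("HLA-Cw*07:02")  # -> "HLA-C0702" (old "Cw" serotype spelling)
normalize_allele_name("Mamu-A*01")     # -> "Mamu-A01"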
+ +from __future__ import ( + print_function, + division, + absolute_import, +) +from collections import namedtuple, defaultdict + +import pandas as pd +import numpy as np + +from .common import normalize_allele_name +from .amino_acid import common_amino_acids +from .peptide_encoding import ( + indices_to_hotshot_encoding, + fixed_length_from_many_peptides +) + +index_encoding = common_amino_acids.index_encoding + +AlleleData = namedtuple( + "AlleleData", + [ + "X_index", # index-based featue encoding of fixed length peptides + "X_binary", # binary encoding of fixed length peptides + "Y", # regression encoding of IC50 (log scaled between 0..1) + "peptides", # list of fixed length peptide string + "ic50", # IC50 value associated with each entry + "original_peptides", # original peptides may be of different lengths + "original_lengths", # len(original_peptide) + "substring_counts", # how many substrings were extracted from + # each original peptide string + "weights", # 1.0 / count + ]) + + +def _infer_csv_separator(filename): + """ + Determine if file is separated by comma, tab, or whitespace. + Default to whitespace if the others are not detected. + + Returns (sep, delim_whitespace) + """ + for candidate in [",", "\t"]: + with open(filename, "r") as f: + for line in f: + if line.startswith("#"): + continue + if candidate in line: + return candidate, False + return None, True + + +def load_dataframe( + filename, + peptide_length=None, + max_ic50=50000.0, + sep=None, + species_column_name="species", + allele_column_name="mhc", + peptide_column_name=None, + peptide_length_column_name="peptide_length", + ic50_column_name="meas", + only_human=True): + """ + Load a dataframe of peptide-MHC affinity measurements + + filename : str + TSV filename with columns: + - 'species' + - 'mhc' + - 'peptide_length' + - 'sequence' + - 'meas' + + peptide_length : int, optional + Which length peptides to use (default=load all lengths) + + max_ic50 : float + Treat IC50 scores above this value as all equally bad + (transform them to 0.0 in the regression output) + + sep : str, optional + Separator in CSV file, default is to let Pandas infer + + peptide_column_name : str, optional + Default behavior is to try {"sequence", "peptide", "peptide_sequence"} + + only_human : bool + Only load entries from human MHC alleles + + Returns DataFrame augmented with extra columns: + - "log_ic50" : log(ic50) / log(max_ic50) + - "regression_output" : 1.0 - log(ic50)/log(max_ic50), limited to [0,1] + """ + if sep is None: + sep, delim_whitespace = _infer_csv_separator(filename) + else: + delim_whitespace = False + df = pd.read_csv( + filename, + sep=sep, + delim_whitespace=delim_whitespace, + engine="c") + # hack: get around variability of column naming by checking if + # the peptide_column_name is actually present and if not try "peptide" + if peptide_column_name is None: + columns = set(df.keys()) + for candidate in ["sequence", "peptide", "peptide_sequence"]: + if candidate in columns: + peptide_column_name = candidate + break + if peptide_column_name is None: + raise ValueError( + "Couldn't find peptide column name, candidates: %s" % ( + columns)) + if only_human: + human_mask = df[species_column_name] == "human" + df = df[human_mask] + if peptide_length is not None: + length_mask = df[peptide_length_column_name] == peptide_length + df = df[length_mask] + + df[allele_column_name] = df[allele_column_name].map(normalize_allele_name) + ic50 = np.array(df[ic50_column_name]) + log_ic50 = np.log(ic50) / np.log(max_ic50) + 
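    # Worked example of the transform (numbers only for intuition):
    # with max_ic50 = 50000,
    #   ic50 = 1 nM    -> log_ic50 = 0.0    -> regression_output = 1.0
    #   ic50 = 500 nM  -> log_ic50 ~= 0.574 -> regression_output ~= 0.426
    #   ic50 >= 50000  -> log_ic50 >= 1.0   -> clamped to 0.0 below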
df["log_ic50"] = log_ic50 + regression_output = 1.0 - log_ic50 + # clamp to values between 0, 1 + regression_output = np.maximum(regression_output, 0.0) + regression_output = np.minimum(regression_output, 1.0) + df["regression_output"] = regression_output + return df, peptide_column_name + + +def load_allele_dicts( + filename, + max_ic50=50000.0, + regression_output=False, + sep=None, + species_column_name="species", + allele_column_name="mhc", + peptide_column_name=None, + peptide_length_column_name="peptide_length", + ic50_column_name="meas", + only_human=True, + min_allele_size=1): + """ + Parsing CSV of binding data into dictionary of dictionaries. + The outer key is an allele name, the inner key is a peptide sequence, + and the inner value is an IC50 or log-transformed value between [0,1] + """ + binding_df, peptide_column_name = load_dataframe( + filename=filename, + max_ic50=max_ic50, + sep=sep, + species_column_name=species_column_name, + allele_column_name=allele_column_name, + peptide_column_name=peptide_column_name, + peptide_length_column_name=peptide_length_column_name, + ic50_column_name=ic50_column_name, + only_human=only_human) + # map peptides to either the raw IC50 or rescaled log IC50 depending + # on the `regression_output` parameter + output_column_name = ( + "regression_output" + if regression_output + else ic50_column_name + ) + return { + allele_name: { + peptide: output + for (peptide, output) + in zip(group[peptide_column_name], group[output_column_name]) + } + for (allele_name, group) + in binding_df.groupby(allele_column_name) + if len(group) >= min_allele_size + } + + +def encode_peptide_to_affinity_dict( + peptide_to_affinity_dict, + peptide_length=9, + flatten_binary_encoding=True): + """ + Given a dictionary mapping from peptide sequences to affinity values, + returns tuple with the following fields: + - kmer_peptides: fixed length peptide strings + - original_peptides: variable length peptide strings + - counts: how many fixed length peptides were made from this original + - X_index: index encoding of fixed length peptides + - X_binary: binary encoding of fixed length peptides + - Y: affinity values associated with original peptides + """ + raw_peptides = list(sorted(peptide_to_affinity_dict.keys())) + kmer_peptides, original_peptides, counts = \ + fixed_length_from_many_peptides( + peptides=raw_peptides, + desired_length=peptide_length, + start_offset_shorten=0, + end_offset_shorten=0, + start_offset_extend=0, + end_offset_extend=0) + n_samples = len(kmer_peptides) + assert n_samples == len(original_peptides), \ + "Mismatch between # of samples (%d) and # of peptides (%d)" % ( + n_samples, len(original_peptides)) + assert n_samples == len(counts), \ + "Mismatch between # of samples (%d) and # of counts (%d)" % ( + n_samples, len(counts)) + + X_index = index_encoding(kmer_peptides, peptide_length) + X_binary = indices_to_hotshot_encoding(X_index, n_indices=20) + + assert X_binary.shape[0] == X_index.shape[0], \ + ("Mismatch between number of samples for index encoding (%d)" + " vs. 
binary encoding (%d)") % ( + X_binary.shape[0], + X_index.shape[0]) + + if flatten_binary_encoding: + # collapse 3D input into 2D matrix + n_binary_features = peptide_length * 20 + X_binary = X_binary.reshape((n_samples, n_binary_features)) + + # easier to work with counts when they're an array instead of list + counts = np.array(counts) + + Y = np.array([peptide_to_affinity_dict[p] for p in original_peptides]) + assert n_samples == len(Y), \ + "Mismatch between # peptides %d and # regression outputs %d" % ( + n_samples, len(Y)) + return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y) + + +def load_allele_datasets( + filename, + peptide_length=9, + use_multiple_peptide_lengths=True, + max_ic50=50000.0, + flatten_binary_encoding=True, + sep=None, + species_column_name="species", + allele_column_name="mhc", + peptide_column_name=None, + peptide_length_column_name="peptide_length", + ic50_column_name="meas", + only_human=True): + """ + Loads an IEDB dataset, extracts "hot-shot" encoding of fixed length peptides + and log-transforms the IC50 measurement. Returns dictionary mapping allele + names to AlleleData objects (containing fields X, Y, ic50) + + Parameters + ---------- + filename : str + TSV filename with columns: + - 'species' + - 'mhc' + - 'peptide_length' + - 'sequence' + - 'meas' + + peptide_length : int + Which length peptides to use (default=9) + + use_multiple_peptide_lengths : bool + If a peptide is shorter than `peptide_length`, expand it into many + peptides of the appropriate length by inserting all combinations of + amino acids. Similarly, if a peptide is longer than `peptide_length`, + shorten it by deleting stretches of contiguous amino acids at all + peptide positions. + + max_ic50 : float + Treat IC50 scores above this value as all equally bad + (transform them to 0.0 in the rescaled output) + + flatten_binary_encoding : bool + If False, returns a (n_samples, peptide_length, 20) matrix, otherwise + returns the 2D flattened version of the same data. 
+ + sep : str, optional + Separator in CSV file, default is to let Pandas infer + + peptide_column_name : str, optional + Default behavior is to try {"sequence", "peptide", "peptide_sequence"} + + only_human : bool + Only load entries from human MHC alleles + """ + df, peptide_column_name = load_dataframe( + filename=filename, + max_ic50=max_ic50, + sep=sep, + peptide_length=peptide_length, + species_column_name=species_column_name, + allele_column_name=allele_column_name, + peptide_column_name=peptide_column_name, + peptide_length_column_name=peptide_length_column_name, + ic50_column_name=ic50_column_name, + only_human=only_human) + + allele_groups = {} + for allele, group in df.groupby(allele_column_name): + assert allele not in allele_groups, \ + "Duplicate datasets for %s" % allele + + raw_peptides = group[peptide_column_name] + + # filter lengths in case user wants to drop peptides that are longer + # or shorter than the desired fixed length + if not use_multiple_peptide_lengths: + drop_mask = raw_peptides.str.len() != peptide_length + group = group[~drop_mask] + raw_peptides = raw_peptides[~drop_mask] + + # convert from a Pandas column to a list, since that's easier to + # interact with later + raw_peptides = list(raw_peptides) + + # create dictionaries of outputs from which we can look up values + # after peptides have been expanded + ic50_dict = { + peptide: ic50 + for (peptide, ic50) + in zip(raw_peptides, group[ic50_column_name]) + } + + Y_dict = { + peptide: y + for (peptide, y) + in zip(raw_peptides, group["regression_output"]) + } + + (kmer_peptides, original_peptides, counts, X_index, X_binary, Y) = \ + encode_peptide_to_affinity_dict( + Y_dict, + peptide_length=peptide_length, + flatten_binary_encoding=flatten_binary_encoding) + + ic50 = np.array([ic50_dict[p] for p in original_peptides]) + + assert len(kmer_peptides) == len(ic50), \ + "Mismatch between # of peptides %d and # IC50 outputs %d" % ( + len(kmer_peptides), len(ic50)) + + allele_groups[allele] = AlleleData( + X_index=X_index, + X_binary=X_binary, + Y=Y, + ic50=ic50, + peptides=kmer_peptides, + original_peptides=original_peptides, + original_lengths=[len(peptide) for peptide in original_peptides], + substring_counts=counts, + weights=1.0 / counts) + return allele_groups + + +def collapse_multiple_peptide_entries(allele_datasets): + """ + If an (allele, peptide) pair occurs multiple times then reduce it + to a single entry by taking the weighted average of affinity values. + + Returns a dictionary of alleles, each of which maps to a dictionary of + peptides that map to affinity values. 
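+
+    Worked example (hypothetical numbers, for illustration only): if the same
+    peptide occurs twice for an allele with normalized affinities 0.8
+    (weight 1.0) and 0.5 (weight 0.5), the collapsed value is
+    (0.8 * 1.0 + 0.5 * 0.5) / (1.0 + 0.5) = 0.7.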
+ """ + allele_to_peptide_to_affinity = {} + for (allele, dataset) in allele_datasets.items(): + multiple_affinities = defaultdict(list) + for (peptide, normalized_affinity, weight) in zip( + dataset.peptides, dataset.Y, dataset.weights): + multiple_affinities[peptide].append((normalized_affinity, weight)) + weighted_averages = {} + for peptide, affinity_weight_tuples in multiple_affinities.items(): + denom = 0.0 + sum_weighted_affinities = 0.0 + for affinity, weight in affinity_weight_tuples: + sum_weighted_affinities += affinity * weight + denom += weight + if denom > 0: + weighted_averages[peptide] = sum_weighted_affinities / denom + allele_to_peptide_to_affinity[allele] = weighted_averages + return allele_to_peptide_to_affinity diff --git a/mhcflurry/data_helpers.py b/mhcflurry/data_helpers.py deleted file mode 100644 index 9fd8834dba39c38e946b8f164d60e0482eb1c09c..0000000000000000000000000000000000000000 --- a/mhcflurry/data_helpers.py +++ /dev/null @@ -1,256 +0,0 @@ -# Copyright (c) 2015. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import ( - print_function, - division, - absolute_import, -) -from collections import namedtuple - -import pandas as pd -import numpy as np - -from .common import normalize_allele_name -from .amino_acid import amino_acid_letter_indices - -AlleleData = namedtuple("AlleleData", "X Y peptides ic50") - - -def hotshot_encoding(peptides, peptide_length): - """ - Encode a set of equal length peptides as a binary matrix, - where each letter is transformed into a length 20 vector with a single - element that is 1 (and the others are 0). - """ - shape = (len(peptides), peptide_length, 20) - X = np.zeros(shape, dtype=bool) - for i, peptide in enumerate(peptides): - for j, amino_acid in enumerate(peptide): - k = amino_acid_letter_indices[amino_acid] - X[i, j, k] = 1 - return X - - -def index_encoding(peptides, peptide_length): - """ - Encode a set of equal length peptides as a vector of their - amino acid indices. - """ - X = np.zeros((len(peptides), peptide_length), dtype=int) - for i, peptide in enumerate(peptides): - for j, amino_acid in enumerate(peptide): - X[i, j] = amino_acid_letter_indices[amino_acid] - return X - - -def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0): - """ - Given an (n_samples, peptide_length) integer matrix - convert it to a binary encoding of shape: - (n_samples, peptide_length * n_indices) - """ - (n_samples, peptide_length) = X.shape - if not n_indices: - n_indices = X.max() - first_index_value + 1 - - X_binary = np.zeros((n_samples, peptide_length * n_indices), dtype=bool) - for i, row in enumerate(X): - for j, xij in enumerate(row): - X_binary[i, n_indices * j + xij - first_index_value] = 1 - return X_binary.astype(float) - - -def _infer_csv_separator(filename): - """ - Determine if file is separated by comma, tab, or whitespace. - Default to whitespace if the others are not detected. 
- - Returns (sep, delim_whitespace) - """ - for candidate in [",", "\t"]: - with open(filename, "r") as f: - for line in f: - if line.startswith("#"): - continue - if candidate in line: - return candidate, False - return None, True - - -def load_dataframe( - filename, - peptide_length=None, - max_ic50=50000.0, - sep=None, - species_column_name="species", - allele_column_name="mhc", - peptide_column_name=None, - peptide_length_column_name="peptide_length", - ic50_column_name="meas", - only_human=True): - """ - Load a dataframe of peptide-MHC affinity measurements - - filename : str - TSV filename with columns: - - 'species' - - 'mhc' - - 'peptide_length' - - 'sequence' - - 'meas' - - peptide_length : int, optional - Which length peptides to use (default=load all lengths) - - max_ic50 : float - Treat IC50 scores above this value as all equally bad - (transform them to 0.0 in the regression output) - - sep : str, optional - Separator in CSV file, default is to let Pandas infer - - peptide_column_name : str, optional - Default behavior is to try {"sequence", "peptide", "peptide_sequence"} - - only_human : bool - Only load entries from human MHC alleles - - Returns DataFrame augmented with extra columns: - - "log_ic50" : log(ic50) / log(max_ic50) - - "regression_output" : 1.0 - log(ic50)/log(max_ic50), limited to [0,1] - """ - if sep is None: - sep, delim_whitespace = _infer_csv_separator(filename) - else: - delim_whitespace = False - df = pd.read_csv( - filename, - sep=sep, - delim_whitespace=delim_whitespace, - engine="c") - # hack: get around variability of column naming by checking if - # the peptide_column_name is actually present and if not try "peptide" - if peptide_column_name is None: - columns = set(df.keys()) - for candidate in ["sequence", "peptide", "peptide_sequence"]: - if candidate in columns: - peptide_column_name = candidate - break - if peptide_column_name is None: - raise ValueError( - "Couldn't find peptide column name, candidates: %s" % ( - columns)) - if only_human: - human_mask = df[species_column_name] == "human" - df = df[human_mask] - if peptide_length is not None: - length_mask = df[peptide_length_column_name] == peptide_length - df = df[length_mask] - - df[allele_column_name] = df[allele_column_name].map(normalize_allele_name) - ic50 = np.array(df[ic50_column_name]) - log_ic50 = np.log(ic50) / np.log(max_ic50) - df["log_ic50"] = log_ic50 - regression_output = 1.0 - log_ic50 - # clamp to values between 0, 1 - regression_output = np.maximum(regression_output, 0.0) - regression_output = np.minimum(regression_output, 1.0) - df["regression_output"] = regression_output - return df - - -def load_allele_datasets( - filename, - peptide_length=9, - max_ic50=5000.0, - binary_encoding=True, - flatten_binary_encoding=True, - sep=None, - species_column_name="species", - allele_column_name="mhc", - peptide_column_name=None, - peptide_length_column_name="peptide_length", - ic50_column_name="meas", - only_human=True): - """ - Loads an IEDB dataset, extracts "hot-shot" encoding of fixed length peptides - and log-transforms the IC50 measurement. 
Returns dictionary mapping allele - names to AlleleData objects (containing fields X, Y, ic50) - - Parameters - ---------- - filename : str - TSV filename with columns: - - 'species' - - 'mhc' - - 'peptide_length' - - 'sequence' - - 'meas' - - peptide_length : int - Which length peptides to use (default=9) - - max_ic50 : float - Treat IC50 scores above this value as all equally bad - (transform them to 0.0 in the rescaled output) - - binary_encoding : bool - Encode amino acids of each peptide as indices or binary vectors - - flatten_features : bool - If False, returns a (n_samples, peptide_length, 20) matrix, otherwise - returns the 2D flattened version of the same data. - - sep : str, optional - Separator in CSV file, default is to let Pandas infer - - peptide_column_name : str, optional - Default behavior is to try {"sequence", "peptide", "peptide_sequence"} - - only_human : bool - Only load entries from human MHC alleles - """ - df = load_dataframe( - filename=filename, - max_ic50=max_ic50, - sep=sep, - peptide_length=peptide_length, - species_column_name=species_column_name, - allele_column_name=allele_column_name, - peptide_column_name=peptide_column_name, - peptide_length_column_name=peptide_length_column_name, - ic50_column_name=ic50_column_name, - only_human=only_human) - - allele_groups = {} - for allele, group in df.groupby(allele_column_name): - ic50 = np.array(group[ic50_column_name]) - Y = np.array(group["regression_output"]) - peptides = list(group[peptide_column_name]) - if binary_encoding: - X = hotshot_encoding(peptides, peptide_length=peptide_length) - if flatten_binary_encoding: - # collapse 3D input into 2D matrix - X = X.reshape((X.shape[0], peptide_length * 20)) - else: - X = index_encoding(peptides, peptide_length=peptide_length) - assert allele not in allele_groups, \ - "Duplicate datasets for %s" % allele - allele_groups[allele] = AlleleData( - X=X, - Y=Y, - ic50=ic50, - peptides=peptides) - return allele_groups diff --git a/mhcflurry/ensemble.py b/mhcflurry/ensemble.py new file mode 100644 index 0000000000000000000000000000000000000000..a0373607d68a7680e2905c3b23d053a6fe0a89b6 --- /dev/null +++ b/mhcflurry/ensemble.py @@ -0,0 +1,27 @@ +# Copyright (c) 2016. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
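+
+"""
+Grouping of multiple trained models under a single name.
+
+`Ensemble.from_directory` is currently a stub: it lists the files in
+`directory_path` but does not yet construct any models from them.
+"""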
+ +import os + +class Ensemble(object): + def __init__(self, models, name=None): + self.name = name + self.models = models + + @classmethod + def from_directory(cls, directory_path): + files = os.listdir(directory_path) + + + diff --git a/mhcflurry/feedforward.py b/mhcflurry/feedforward.py index 7f09c73def7612508d33f44981663b7f8aa785f9..f2c7876bc520d6f6513b4922335293fba07aabfe 100644 --- a/mhcflurry/feedforward.py +++ b/mhcflurry/feedforward.py @@ -128,9 +128,10 @@ def make_hotshot_network( output_activation="sigmoid", dropout_probability=0.0, optimizer=None, - learning_rate=0.001): + learning_rate=0.001, + n_amino_acids=20): return make_network( - input_size=peptide_length * 20, + input_size=peptide_length * n_amino_acids, layer_sizes=layer_sizes, activation=activation, init=init, diff --git a/mhcflurry/mhc1_binding_predictor.py b/mhcflurry/mhc1_binding_predictor.py deleted file mode 100644 index ffde0cbeeb04c00b58735b33fd8dc3b18f9bb17b..0000000000000000000000000000000000000000 --- a/mhcflurry/mhc1_binding_predictor.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2015. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Allele specific MHC Class I binding affinity predictor -""" -from __future__ import ( - print_function, - division, - absolute_import, -) -from os import listdir -from os.path import exists, join -from itertools import groupby -import json - -import numpy as np -import pandas as pd -from keras.models import model_from_config - -from .class1_allele_specific_hyperparameters import MAX_IC50 -from .data_helpers import index_encoding, normalize_allele_name -from .paths import CLASS1_MODEL_DIRECTORY -from .common import expand_9mer_peptides - -_allele_model_cache = {} - - -class Mhc1BindingPredictor(object): - def __init__( - self, - allele, - model_directory=CLASS1_MODEL_DIRECTORY, - max_ic50=MAX_IC50): - self.max_ic50 = max_ic50 - if not exists(model_directory) or len(listdir(model_directory)) == 0: - raise ValueError( - "No MHC prediction models found in %s" % (model_directory,)) - original_allele_name = allele - allele = self.allele = normalize_allele_name(allele) - if self.allele not in _allele_model_cache: - json_filename = self.allele + ".json" - json_path = join(model_directory, json_filename) - if not exists(json_path): - raise ValueError("Unsupported allele: %s" % ( - original_allele_name,)) - - hdf_filename = self.allele + ".hdf" - hdf_path = join(model_directory, hdf_filename) - - if not exists(hdf_path): - raise ValueError("Missing model weights for allele %s" % ( - original_allele_name,)) - - with open(json_path, "r") as f: - config_dict = json.load(f) - self.model = model_from_config(config_dict) - self.model.load_weights(hdf_path) - _allele_model_cache[self.allele] = self.model - self.model.compile(loss="mse", optimizer="rmsprop") - else: - self.model = _allele_model_cache[self.allele] - - def __repr__(self): - return "Mhc1BindingPredictor(allele=%s, model_directory=%s)" % ( - self.allele, - self.model_directory) - - def 
__str__(self): - return repr(self) - - def _log_to_ic50(self, log_value): - """ - Convert neural network output to IC50 values between 0.0 and - self.max_ic50 (typically 5000, 20000 or w0) - """ - return self.max_ic50 ** (1.0 - log_value) - - def _predict_9mer_peptides(self, peptides): - """ - Predict binding affinity for 9mer peptides - """ - if any(len(peptide) != 9 for peptide in peptides): - raise ValueError("Can only predict 9mer peptides") - X = index_encoding(peptides, peptide_length=9) - return self.model.predict(X, verbose=False).flatten() - - def _predict_9mer_peptides_ic50(self, peptides): - log_y = self._predict_9mer_peptides(peptides) - return self._log_to_ic50(log_y) - - def predict_peptides(self, peptides): - column_names = [ - "Allele", - "Peptide", - "Prediction", - ] - results = {} - for column_name in column_names: - results[column_name] = [] - - for length, group_peptides in groupby(peptides, lambda x: len(x)): - group_peptides = list(group_peptides) - expanded_peptides = expand_9mer_peptides(group_peptides, length) - n_group = len(group_peptides) - n_expanded = len(expanded_peptides) - expansion_factor = int(n_expanded / n_group) - raw_y = self._predict_9mer_peptides(expanded_peptides) - median_y = np.zeros(n_group) - # take the median of each group of log(IC50) values - for i in range(n_group): - start = i * expansion_factor - end = (i + 1) * expansion_factor - median_y[i] = np.median(raw_y[start:end]) - ic50 = self._log_to_ic50(median_y) - results["Allele"].extend([self.allele] * n_group) - results["Peptide"].extend(group_peptides) - results["Prediction"].extend(ic50) - return pd.DataFrame(results, columns=column_names) diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..d07a41b8407179e62e4d8aebd39784be4c62b6b6 --- /dev/null +++ b/mhcflurry/peptide_encoding.py @@ -0,0 +1,299 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import logging + +import numpy as np + +from .amino_acid import common_amino_acids, amino_acids_with_unknown + +common_amino_acid_letters = common_amino_acids.letters() + + +def all_kmers(k, alphabet=common_amino_acid_letters): + """ + Generates all k-mer peptide sequences + + Parameters + ---------- + k : int + + alphabet : str | list of characters + """ + alphabets = [alphabet] * k + return [ + "".join(combination) + for combination + in itertools.product(*alphabets) + ] + + +class CombinatorialExplosion(Exception): + pass + + +def extend_peptide( + peptide, + desired_length, + start_offset, + end_offset, + insert_amino_acid_letters=common_amino_acid_letters): + """Extend peptide by inserting every possible amino acid combination + if we're trying to e.g. turn an 8mer into 9mers. + + Parameters + ---------- + peptide : str + + desired_length : int + + start_offset : int + How many characters (from the position before the start of the string) + to skip before inserting characters. 
+
+
+    end_offset : int
+        Last character position from the end where we insert new characters,
+        where 0 is the position after the last character.
+
+    insert_amino_acid_letters : str | list of characters
+    """
+    n = len(peptide)
+    assert n < desired_length, \
+        "%s (length = %d) is too long! Must be shorter than %d" % (
+            peptide, n, desired_length)
+    n_missing = desired_length - n
+    if n_missing > 3:
+        raise CombinatorialExplosion(
+            "Cannot transform %s of length %d into a %d-mer peptide" % (
+                peptide, n, desired_length))
+    return [
+        peptide[:i] + extra + peptide[i:]
+        for i in range(start_offset, n - end_offset + 1)
+        for extra in all_kmers(
+            n_missing,
+            alphabet=insert_amino_acid_letters)
+    ]
+
+
+def shorten_peptide(
+        peptide,
+        desired_length,
+        start_offset,
+        end_offset,
+        insert_amino_acid_letters=common_amino_acid_letters):
+    """Shorten peptide if trying to e.g. turn a 10mer into 9mers
+
+    Parameters
+    ----------
+
+    peptide : str
+
+    desired_length : int
+
+    start_offset : int
+
+    end_offset : int
+
+    insert_amino_acid_letters : str | list of characters
+    """
+    n = len(peptide)
+    assert n > desired_length, \
+        "%s (length = %d) is too short! Must be longer than %d" % (
+            peptide, n, desired_length)
+    n_skip = n - desired_length
+    assert n_skip > 0, \
+        "Expected length of peptide %s %d to be greater than %d" % (
+            peptide, n, desired_length)
+    end_range = n - end_offset - n_skip + 1
+    return [
+        peptide[:i] + peptide[i + n_skip:]
+        for i in range(start_offset, end_range)
+    ]
+
+
+def fixed_length_from_many_peptides(
+        peptides,
+        desired_length,
+        start_offset_extend=2,
+        end_offset_extend=1,
+        start_offset_shorten=2,
+        end_offset_shorten=0,
+        insert_amino_acid_letters=common_amino_acid_letters):
+    """
+    Create a set of fixed-length k-mer peptides from a collection of varying
+    length peptides.
+
+    Shorter peptides are filled in using all possible amino acids at any
+    insertion site between start_offset_extend and -end_offset_extend,
+    where start_offset_extend=0 represents insertions before the string
+    and end_offset_extend=0 represents insertions after the string's ending.
+
+    Longer peptides are shortened by deleting contiguous residues, starting
+    from start_offset_shorten and ending with -end_offset_shorten. Unlike
+    peptide extensions, the offsets for shortening a peptide range between
+    the first and last positions (rather than between the positions *before*
+    the string starts and the position *after*).
+
+    We can recreate the methods from:
+    Accurate approximation method for prediction of class I MHC
+    affinities for peptides of length 8, 10 and 11 using prediction
+    tools trained on 9mers.
+    by Lundegaard et al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397)
+    with the following settings:
+        - desired_length = 9
+        - start_offset_extend = 3
+        - end_offset_extend = 2
+        - start_offset_shorten = 3
+        - end_offset_shorten = 1
+
+    Returns three lists:
+        - a list of fixed length peptides (all of length `desired_length`)
+        - a list of the original peptides from which subsequences were
+          contracted or lengthened
+        - a list of integers indicating the number of fixed length peptides
+          generated from each variable length peptide.
+ + Example: + kmers, original, counts = fixed_length_from_many_peptides( + peptides=["ABC", "A"] + desired_length=2, + start_offset_extend=0, + end_offset_extend=0, + start_offset_shorten=0, + end_offset_shorten=0, + insert_amino_acid_letters="ABC") + kmers == ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"] + original == ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"] + counts == [3, 3, 3, 6, 6, 6, 6, 6, 6] + + Parameters + ---------- + peptide : list of str + + desired_length : int + + start_offset_extend : int + + end_offset_extend : int + + start_offset_shorten : int + + end_offset_shorten : int + + insert_amino_acid_letters : str | list of characters + """ + all_fixed_length_peptides = [] + original_peptide_sequences = [] + counts = [] + for peptide in peptides: + n = len(peptide) + if n == desired_length: + fixed_length_peptides = [peptide] + elif n < desired_length: + try: + fixed_length_peptides = extend_peptide( + peptide=peptide, + desired_length=desired_length, + start_offset=start_offset_extend, + end_offset=end_offset_extend, + insert_amino_acid_letters=insert_amino_acid_letters) + except CombinatorialExplosion: + logging.warn( + "Peptide %s is too short to be extended to length %d" % ( + peptide, desired_length)) + continue + else: + fixed_length_peptides = shorten_peptide( + peptide=peptide, + desired_length=desired_length, + start_offset=start_offset_shorten, + end_offset=end_offset_shorten, + insert_amino_acid_letters=insert_amino_acid_letters) + + n_fixed_length = len(fixed_length_peptides) + all_fixed_length_peptides.extend(fixed_length_peptides) + original_peptide_sequences.extend([peptide] * n_fixed_length) + counts.extend([n_fixed_length] * n_fixed_length) + return all_fixed_length_peptides, original_peptide_sequences, counts + + +def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0): + """ + Given an (n_samples, peptide_length) integer matrix + convert it to a binary encoding of shape: + (n_samples, peptide_length * n_indices) + """ + (n_samples, peptide_length) = X.shape + if not n_indices: + n_indices = X.max() - first_index_value + 1 + X_binary = np.zeros((n_samples, peptide_length * n_indices), dtype=bool) + for i, row in enumerate(X): + for j, xij in enumerate(row): + X_binary[i, n_indices * j + xij - first_index_value] = 1 + return X_binary.astype(float) + + +def fixed_length_index_encoding( + peptides, + desired_length, + start_offset_shorten=0, + end_offset_shorten=0, + start_offset_extend=0, + end_offset_extend=0, + allow_unknown_amino_acids=False): + """ + Take peptides of varying lengths, chop them into substrings of fixed + length and apply index encoding to these substrings. + + If a string is longer than the desired length, then it's reduced to + the desired length by deleting characters at all possible positions. When + positions at the start or end of a string should be exempt from deletion + then the number of exempt characters can be controlled via + `start_offset_shorten` and `end_offset_shorten`. + + If a string is shorter than the desired length then it is filled + with all possible characters of the alphabet at all positions. The + parameters `start_offset_extend` and `end_offset_extend` control whether + certain positions are excluded from insertion. The positions are + in a "inter-residue" coordinate system, where `start_offset_extend` = 0 + refers to the position *before* the start of a peptide and, similarly, + `end_offset_extend` = 0 refers to the position *after* the peptide. 
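+
+    For example (illustration only): encoding an 8mer with desired_length=9
+    and allow_unknown_amino_acids=True inserts the unknown residue "X" at
+    each of the 9 possible positions, yielding 9 feature vectors that all
+    point back to the same original peptide and each carry a count of 9.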
+
+    Returns feature matrix X, a list of original peptides for each feature
+    vector, and a list of integer counts indicating how many rows share a
+    particular original peptide. When two rows are expanded out of a single
+    original peptide, they will both have a count of 2. These counts can
+    be useful for down-weighting the importance of multiple feature vectors
+    which originate from the same sample.
+    """
+    if allow_unknown_amino_acids:
+        insert_letters = ["X"]
+        index_encoding = amino_acids_with_unknown.index_encoding
+    else:
+        insert_letters = common_amino_acid_letters
+        index_encoding = common_amino_acids.index_encoding
+
+    fixed_length, original, counts = fixed_length_from_many_peptides(
+        peptides=peptides,
+        desired_length=desired_length,
+        start_offset_shorten=start_offset_shorten,
+        end_offset_shorten=end_offset_shorten,
+        start_offset_extend=start_offset_extend,
+        end_offset_extend=end_offset_extend,
+        insert_amino_acid_letters=insert_letters)
+    X = index_encoding(fixed_length, desired_length)
+    return X, original, counts
diff --git a/mhcflurry/predict.py b/mhcflurry/predict.py
new file mode 100644
index 0000000000000000000000000000000000000000..1aef5dcafb05b51b0126cfdf728fef435984c040
--- /dev/null
+++ b/mhcflurry/predict.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from collections import OrderedDict
+
+import pandas as pd
+
+from .paths import CLASS1_MODEL_DIRECTORY
+from .mhc1_binding_predictor import Mhc1BindingPredictor
+
+def predict(alleles, peptides):
+    allele_dataframes = OrderedDict()
+    for allele in alleles:
+        model = Mhc1BindingPredictor(allele=allele)
+        result_df = model.predict_peptides(peptides)
+        allele_dataframes[allele] = result_df
+    return pd.concat(list(allele_dataframes.values()))
diff --git a/mhcflurry/predictor_base.py b/mhcflurry/predictor_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..9939f55361a56af08458f4586a98bf463d0f33cc
--- /dev/null
+++ b/mhcflurry/predictor_base.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2016. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
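+
+"""
+Shared behavior for all predictors: converting normalized network outputs
+back into IC50 values (`log_to_ic50`) and expanding variable-length peptides
+into fixed-length 9mer index encodings before prediction (`encode_peptides`).
+"""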
+ +from collections import defaultdict + +import numpy as np + +from .peptide_encoding import fixed_length_index_encoding +from .amino_acid import ( + amino_acids_with_unknown, + common_amino_acids +) + + +class PredictorBase(object): + """ + Base class for all mhcflurry predictors (including the Ensemble class) + """ + + def __init__( + self, + name, + max_ic50, + allow_unknown_amino_acids, + verbose): + self.name = name + self.max_ic50 = max_ic50 + self.allow_unknown_amino_acids = allow_unknown_amino_acids + self.verbose = verbose + + def log_to_ic50(self, log_value): + """ + Convert neural network output to IC50 values between 0.0 and + self.max_ic50 (typically 5000, 20000 or 50000) + """ + return self.max_ic50 ** (1.0 - log_value) + + def encode_9mer_peptides(self, peptides): + if self.allow_unknown_amino_acids: + return amino_acids_with_unknown.index_encoding(peptides, 9) + else: + return common_amino_acids.index_encoding(peptides, 9) + + def encode_peptides(self, peptides): + """ + Parameters + ---------- + peptides : str list + Peptide strings of any length + + Encode peptides of any length into fixed length vectors. + Returns 2d array of encoded peptides and 1d array indicating the + original peptide index for each row. + """ + indices = [] + encoded_matrices = [] + for i, peptide in enumerate(peptides): + matrix, _, _ = fixed_length_index_encoding( + peptides=[peptide], + desired_length=9, + allow_unknown_amino_acids=self.allow_unknown_amino_acids) + encoded_matrices.append(matrix) + indices.extend([i] * len(matrix)) + combined_matrix = np.concatenate(encoded_matrices) + index_array = np.array(indices) + expected_shape = (len(index_array), 9) + assert combined_matrix.shape == expected_shape, \ + "Expected shape %s but got %s" % (expected_shape, combined_matrix.shape) + return combined_matrix, index_array + + def predict_9mer_peptides(self, peptides): + """ + Predict binding affinity for 9mer peptides + """ + if any(len(peptide) != 9 for peptide in peptides): + raise ValueError("Can only predict 9mer peptides") + X, _ = self.encode_peptides(peptides) + return self.predict_encoded(X) + + def predict_9mer_peptides_ic50(self, peptides): + return self.log_to_ic50(self.predict_9mer_peptides(peptides)) + + def predict_peptides_ic50(self, peptides): + """ + Predict IC50 affinities for peptides of any length + """ + return self.log_to_ic50( + self.predict_peptides(peptides)) + + def predict_encoded(self, X): + raise ValueError("Not yet implemented for %s!" % ( + self.__class__.__name__,)) + + def predict_peptides( + self, + peptides, + combine_fn=np.mean): + """ + Given a list of peptides of any length, returns an array of predicted + normalized affinity values. Unlike IC50, a higher value here + means a stronger affinity. Peptides of lengths other than 9 are + transformed into a set of 9mers either by deleting or inserting + amino acid characters. The prediction for a single peptide will be + the average of expanded 9mers. 
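+
+        For example (illustration only), an 8mer that expands into several
+        9mers is reported as the mean of those 9mer predictions under the
+        default combine_fn=np.mean, while combine_fn=np.max would instead
+        keep the strongest predicted value.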
+ """ + input_matrix, original_peptide_indices = self.encode_peptides(peptides) + # non-9mer peptides get multiple predictions, which are then combined + # with the combine_fn argument + multiple_predictions_dict = defaultdict(list) + fixed_length_predictions = self.predict_encoded(input_matrix) + for i, yi in enumerate(fixed_length_predictions): + original_peptide_index = original_peptide_indices[i] + original_peptide = peptides[original_peptide_index] + multiple_predictions_dict[original_peptide].append(yi) + + combined_predictions_dict = { + p: combine_fn(ys) + for (p, ys) in multiple_predictions_dict.items() + } + return np.array([combined_predictions_dict[p] for p in peptides]) diff --git a/mhcflurry/score_set.py b/mhcflurry/score_set.py new file mode 100644 index 0000000000000000000000000000000000000000..5f833a3c8715eb0e444b70ebd8ad414365da1f3e --- /dev/null +++ b/mhcflurry/score_set.py @@ -0,0 +1,104 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import ( + print_function, + division, + absolute_import, +) +from collections import OrderedDict + +import numpy as np + + +class ScoreSet(object): + """ + Useful for keeping a collection of score dictionaries + which map name->score type->list of values. 
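+
+    Example usage (illustrative only, names are made up):
+
+        scores = ScoreSet(index="allele")
+        scores.add_many("HLA-A0201", auc=0.93, f1=0.81)
+        scores.add_many("HLA-A0201", auc=0.90, f1=0.78)
+        scores.to_csv("scores.csv")  # one row per group, mean and std per score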
+ """ + def __init__(self, verbose=True, index="name"): + self.groups = {} + self.verbose = verbose + if isinstance(index, (list, tuple)): + index = ",".join("%s" % item for item in index) + self.index = index + + def add_many(self, group, **kwargs): + for (k, v) in sorted(kwargs.items()): + self.add(group, k, v) + + def add(self, group, score_type, value): + if isinstance(group, (list, tuple)): + group = ",".join("%s" % item for item in group) + if group not in self.groups: + self.groups[group] = {} + if score_type not in self.groups[group]: + self.groups[group][score_type] = [] + self.groups[group][score_type].append(value) + if self.verbose: + print("--> %s:%s %0.4f" % (group, score_type, value)) + + def score_types(self): + result = set([]) + for (g, d) in sorted(self.groups.items()): + for score_type in sorted(d.keys()): + result.add(score_type) + return list(sorted(result)) + + def _reduce_scores(self, reduce_fn): + score_types = self.score_types() + return { + group: + OrderedDict([ + (score_type, reduce_fn(score_dict[score_type])) + for score_type + in score_types + ]) + for (group, score_dict) + in self.groups.items() + } + + def averages(self): + return self._reduce_scores(np.mean) + + def stds(self): + return self._reduce_scores(np.std) + + def to_csv(self, filename): + with open(filename, "w") as f: + header_list = [self.index] + score_types = self.score_types() + for score_type in score_types: + header_list.append(score_type) + header_list.append(score_type + "_std") + + header_line = ",".join(header_list) + "\n" + if self.verbose: + print(header_line) + f.write(header_line) + + score_averages = self.averages() + score_stds = self.stds() + + for name in sorted(score_averages.keys()): + line_elements = [name] + for score_type in score_types: + line_elements.append( + "%0.4f" % score_averages[name][score_type]) + line_elements.append( + "%0.4f" % score_stds[name][score_type]) + line = ",".join(line_elements) + "\n" + if self.verbose: + print(line) + f.write(line) diff --git a/scripts/create-combined-class1-dataset.py b/scripts/create-combined-class1-dataset.py index 486e8fe3ea90503cafe4c5c0fd9864ce53096196..7d7e5fc2fe826a7fd30795e0f9d2fbf96f4eebc4 100755 --- a/scripts/create-combined-class1-dataset.py +++ b/scripts/create-combined-class1-dataset.py @@ -29,33 +29,39 @@ PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME) parser = argparse.ArgumentParser() -parser.add_argument("--ic50-fraction-tolerance", +parser.add_argument( + "--ic50-fraction-tolerance", default=0.01, type=float, help=( "How much can the IEDB and NetMHCpan IC50 differ and still be" " considered compatible (as a fraction of the NetMHCpan value)")) -parser.add_argument("--min-assay-overlap-size", +parser.add_argument( + "--min-assay-overlap-size", type=int, default=1, help="Minimum number of entries overlapping between IEDB assay and NetMHCpan data") -parser.add_argument("--min-assay-fraction-same", +parser.add_argument( + "--min-assay-fraction-same", type=float, help="Minimum fraction of peptides whose IC50 values agree with the NetMHCpan data", default=0.9) -parser.add_argument("--iedb-pickle-path", +parser.add_argument( + "--iedb-pickle-path", default=IEDB_PICKLE_PATH, help="Path to .pickle file containing dictionary of IEDB assay datasets") -parser.add_argument("--netmhcpan-csv-path", +parser.add_argument( + "--netmhcpan-csv-path", default=PETERS_CSV_PATH, help="Path to CSV with NetMHCpan dataset from 2013 Peters paper") -parser.add_argument("--output-csv-path", +parser.add_argument( + 
"--output-csv-path", default=CLASS1_DATA_CSV_PATH, help="Path to CSV of combined assay results") @@ -137,7 +143,7 @@ if __name__ == "__main__": for (allele, count) in new_allele_counts.most_common(): print("%s: %d" % (allele, count)) print("Combined DataFrame size: %d (+%d)" % ( - len(combined_df), - len(combined_df) - len(nielsen_data))) + len(combined_df), + len(combined_df) - len(nielsen_data))) print("Writing %s..." % args.output_csv_path) combined_df.to_csv(args.output_csv_path, index=False) diff --git a/scripts/mhcflurry-class1-web-server.py b/scripts/mhcflurry-class1-web-server.py index 4fe36f34f444786e24bf8170decc851c0ec83cdc..128fe452c1f115e350ab7805afcb1e795f56a786 100755 --- a/scripts/mhcflurry-class1-web-server.py +++ b/scripts/mhcflurry-class1-web-server.py @@ -29,6 +29,7 @@ from mhcflurry.common import ( ) from mhcflurry.class1 import predict, supported_alleles + parser = argparse.ArgumentParser() parser.add_argument("--host", default="0.0.0.0") @@ -52,6 +53,7 @@ def get_binding_value(): return "ERROR: %s" % e.args[0] return result_df.to_csv(sep="\t", index=False, float_format="%0.4f") + @get('/alleles') def get_supported_alleles(): peptide_lengths = "8,9,10,11,12" @@ -61,6 +63,7 @@ def get_supported_alleles(): ] return "\n".join(strings) + if __name__ == "__main__": args = parser.parse_args() - run(host=args.host, port=args.port, debug=args.debug) + run(host=args.host, port=args.port, debug=args.debug, server="cherrypy") diff --git a/setup.py b/setup.py index f502b92b81f7b25d2b2d3e4fb4353ed39ee3f772..334c4e4818fd0a02682c499c71f89fdad3edb199 100644 --- a/setup.py +++ b/setup.py @@ -58,6 +58,8 @@ if __name__ == '__main__': 'appdirs', 'theano', 'keras', + # using for multi-threaded web server + 'cherrypy' ], long_description=readme, packages=['mhcflurry'], diff --git a/test/test_class1_binding_predictor.py b/test/test_class1_binding_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..c824ce1a8bcf39df50d0611405d4d9c92e251f58 --- /dev/null +++ b/test/test_class1_binding_predictor.py @@ -0,0 +1,115 @@ +import numpy as np + +from mhcflurry import Class1BindingPredictor + + +class Dummy9merIndexEncodingModel(object): + """ + Dummy molde used for testing the pMHC binding predictor. 
+    """
+    def predict(self, X, verbose=False):
+        assert isinstance(X, np.ndarray)
+        assert len(X.shape) == 2
+        n_rows, n_cols = X.shape
+        assert n_cols == 9, "Expected 9mer index input, got %d columns" % (
+            n_cols,)
+        return np.zeros(n_rows, dtype=float)
+
+
+def test_always_zero_9mer_inputs():
+    predictor = Class1BindingPredictor(
+        model=Dummy9merIndexEncodingModel(),
+        allow_unknown_amino_acids=True)
+    test_9mer_peptides = [
+        "SIISIISII",
+        "AAAAAAAAA",
+    ]
+
+    n_expected = len(test_9mer_peptides)
+    y = predictor.predict_peptides(test_9mer_peptides)
+    assert len(y) == n_expected
+    assert np.all(y == 0)
+
+    # call the predict method for 9mers directly
+    y = predictor.predict_9mer_peptides(test_9mer_peptides)
+    assert len(y) == n_expected
+    assert np.all(y == 0)
+
+    ic50 = predictor.predict_peptides_ic50(test_9mer_peptides)
+    assert len(ic50) == n_expected
+    assert np.all(ic50 == predictor.max_ic50), ic50
+
+
+def test_always_zero_8mer_inputs():
+    predictor = Class1BindingPredictor(
+        model=Dummy9merIndexEncodingModel(),
+        allow_unknown_amino_acids=True)
+    test_8mer_peptides = [
+        "SIISIISI",
+        "AAAAAAAA",
+    ]
+
+    n_expected = len(test_8mer_peptides)
+    y = predictor.predict_peptides(test_8mer_peptides)
+    assert len(y) == n_expected
+    assert np.all(y == 0)
+
+    ic50 = predictor.predict_peptides_ic50(test_8mer_peptides)
+    assert len(ic50) == n_expected
+    assert np.all(ic50 == predictor.max_ic50), ic50
+
+
+def test_always_zero_10mer_inputs():
+    predictor = Class1BindingPredictor(
+        model=Dummy9merIndexEncodingModel(),
+        allow_unknown_amino_acids=True)
+    test_10mer_peptides = [
+        "SIISIISIYY",
+        "AAAAAAAAYY",
+    ]
+
+    n_expected = len(test_10mer_peptides)
+    y = predictor.predict_peptides(test_10mer_peptides)
+    assert len(y) == n_expected
+    assert np.all(y == 0)
+
+    ic50 = predictor.predict_peptides_ic50(test_10mer_peptides)
+    assert len(ic50) == n_expected
+    assert np.all(ic50 == predictor.max_ic50), ic50
+
+
+def test_encode_peptides_9mer():
+    predictor = Class1BindingPredictor(
+        model=Dummy9merIndexEncodingModel(),
+        allow_unknown_amino_acids=True)
+    X = predictor.encode_9mer_peptides(["AAASSSYYY"])
+    assert X.shape[0] == 1, X.shape
+    assert X.shape[1] == 9, X.shape
+
+    X, indices = predictor.encode_peptides(["AAASSSYYY"])
+    assert len(indices) == 1
+    assert indices[0] == 0
+    assert X.shape[0] == 1, X.shape
+    assert X.shape[1] == 9, X.shape
+
+
+def test_encode_peptides_8mer():
+    predictor = Class1BindingPredictor(
+        model=Dummy9merIndexEncodingModel(),
+        allow_unknown_amino_acids=True)
+    X, indices = predictor.encode_peptides(["AAASSSYY"])
+    assert len(indices) == 9
+    assert (indices == 0).all()
+    assert X.shape[0] == 9, (X.shape, X)
+    assert X.shape[1] == 9, (X.shape, X)
+
+
+def test_encode_peptides_10mer():
+    predictor = Class1BindingPredictor(
+        model=Dummy9merIndexEncodingModel(),
+        allow_unknown_amino_acids=True)
+    X, indices = predictor.encode_peptides(["AAASSSYYFF"])
+    assert len(indices) == 10
+    assert (indices == 0).all()
+    assert X.shape[0] == 10, (X.shape, X)
+    assert X.shape[1] == 9, (X.shape, X)
diff --git a/test/test_fixed_length_peptides.py b/test/test_fixed_length_peptides.py
new file mode 100644
index 0000000000000000000000000000000000000000..733ef8a8f49737762df93479457c9b38399a5c7d
--- /dev/null
+++ b/test/test_fixed_length_peptides.py
@@ -0,0 +1,109 @@
+from mhcflurry.peptide_encoding import (
+    all_kmers,
+    extend_peptide,
+    shorten_peptide,
+    fixed_length_from_many_peptides,
+)
+from nose.tools import eq_
+
+
+def test_all_kmers():
+    kmers = all_kmers(2, alphabet=["A", "B"])
+
assert len(kmers) == 4, kmers + eq_(set(kmers), {"AA", "AB", "BA", "BB"}) + + +def test_all_kmers_string_alphabet(): + kmers = all_kmers(2, alphabet="AB") + assert len(kmers) == 4, kmers + eq_(set(kmers), {"AA", "AB", "BA", "BB"}) + + +def test_extend_peptide_all_positions(): + # insert 0 or 1 at every position + results = extend_peptide( + "111", + desired_length=4, + start_offset=0, + end_offset=0, + insert_amino_acid_letters="01") + + expected = [ + "0111", + "1111", + "1011", + "1111", + "1101", + "1111", + "1110", + "1111", + ] + eq_(results, expected) + + +def test_shorten_peptide_all_positions(): + # insert 0 or 1 at every position + results = shorten_peptide( + "012", + desired_length=2, + start_offset=0, + end_offset=0, + insert_amino_acid_letters="012") + + expected = [ + "12", + "02", + "01" + ] + eq_(results, expected) + + +def test_shorten_peptide_all_positions_except_first(): + # insert 0 or 1 at every position + results = shorten_peptide( + "012", + desired_length=2, + start_offset=1, + end_offset=0, + insert_amino_acid_letters="012") + + expected = [ + "02", + "01", + ] + eq_(results, expected) + + +def test_shorten_peptide_all_positions_except_last(): + # insert 0 or 1 at every position + results = shorten_peptide( + "012", + desired_length=2, + start_offset=0, + end_offset=1, + insert_amino_acid_letters="012") + + expected = [ + "12", + "02", + ] + eq_(results, expected) + + +def test_fixed_length_from_many_peptides(): + kmers, original, counts = fixed_length_from_many_peptides( + peptides=["ABC", "A"], + desired_length=2, + start_offset_extend=0, + end_offset_extend=0, + start_offset_shorten=0, + end_offset_shorten=0, + insert_amino_acid_letters="ABC") + print(kmers) + print(original) + print(counts) + eq_(len(kmers), len(original)) + eq_(len(kmers), len(counts)) + eq_(kmers, ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"]) + eq_(original, ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"]) + eq_(counts, [3, 3, 3, 6, 6, 6, 6, 6, 6])
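+
+
+def test_fixed_length_from_many_peptides_exact_length():
+    # Illustrative extra case (assumed behavior, based on the implementation
+    # above): peptides that already have the desired length pass through
+    # unchanged with a count of 1.
+    kmers, original, counts = fixed_length_from_many_peptides(
+        peptides=["AB"],
+        desired_length=2,
+        insert_amino_acid_letters="AB")
+    eq_(kmers, ["AB"])
+    eq_(original, ["AB"])
+    eq_(counts, [1])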