Commit bb4701a0 authored by Alex Rubinsteyn

Merge pull request #12 from hammerlab/soft-pretraining

Soft-pretraining and lots of other changes
parents 5a856ce8 d2989115
Showing 3573 additions and 284 deletions
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Helper functions for computing pairwise similarities between alleles
"""
import numpy as np
import seaborn
from fancyimpute import NuclearNormMinimization
from common import matrix_to_dictionary, curry_dictionary
def compute_similarities_for_single_allele_from_peptide_overlap(
allele_name,
allele_to_peptide_to_affinity,
allele_name_length=None,
min_weight=1.0):
"""
Parameters
----------
allele_name : str
allele_to_peptide_to_affinity : dict
Dictionary from allele names to a dictionary of (peptide -> affinity)
Returns three dictionaries, mapping alleles to
- similarity
- number of overlapping peptides
- total weight of overlapping affinities
"""
sims = {}
overlaps = {}
weights = {}
dataset_a = allele_to_peptide_to_affinity[allele_name]
peptide_set_a = set(dataset_a.keys())
for other_allele, dataset_b in allele_to_peptide_to_affinity.items():
if allele_name_length and len(other_allele) != allele_name_length:
continue
peptide_set_b = set(dataset_b.keys())
intersection = peptide_set_a.intersection(peptide_set_b)
overlaps[other_allele] = len(intersection)
total = 0.0
weight = 0.0
for peptide in intersection:
ya = dataset_a[peptide]
yb = dataset_b[peptide]
minval = min(ya, yb)
maxval = max(ya, yb)
total += minval
weight += maxval
weights[other_allele] = weight
if weight > min_weight:
sims[other_allele] = total / weight
return sims, overlaps, weights
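# Illustrative sketch (not part of the commit): the similarity above is a
# weighted min/max ratio over shared peptides. The numbers below are made up.
dataset_a = {"SIINFEKL": 0.9, "GILGFVFTL": 0.2}
dataset_b = {"SIINFEKL": 0.8, "GILGFVFTL": 0.6}
shared = set(dataset_a) & set(dataset_b)
total = sum(min(dataset_a[p], dataset_b[p]) for p in shared)   # 0.8 + 0.2 = 1.0
weight = sum(max(dataset_a[p], dataset_b[p]) for p in shared)  # 0.9 + 0.6 = 1.5
similarity = total / weight  # ~0.67; identical affinity profiles would give 1.0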
def compute_partial_similarities_from_peptide_overlap(
allele_to_peptide_to_affinity,
min_weight=1.0,
allele_name_length=None):
"""
Determine similarity between pairs of alleles by examining
affinity values for overlapping peptides. Returns curried dictionaries
mapping allele -> allele -> float, where the final value is one of:
- similarity between alleles
- # overlapping peptides
- sum weight of overlapping affinities
"""
sims = {}
overlaps = {}
weights = {}
for allele_name in allele_to_peptide_to_affinity.keys():
if allele_name_length and len(allele_name) != allele_name_length:
continue
curr_sims, curr_overlaps, curr_weights = \
compute_similarities_for_single_allele_from_peptide_overlap(
allele_name=allele_name,
allele_to_peptide_to_affinity=allele_to_peptide_to_affinity,
allele_name_length=allele_name_length,
min_weight=min_weight)
sims[allele_name] = curr_sims
overlaps[allele_name] = curr_overlaps
weights[allele_name] = curr_weights
return sims, overlaps, weights
def build_incomplete_similarity_matrix(
allele_to_peptide_to_affinity,
curried_sims_dict):
allele_list = list(sorted(allele_to_peptide_to_affinity.keys()))
allele_order = {
allele_name: i
for (i, allele_name) in enumerate(allele_list)
}
n_alleles = len(allele_list)
sims_incomplete_matrix = np.zeros((n_alleles, n_alleles), dtype=float)
for a, ai in allele_order.items():
for b, bi in allele_order.items():
# if allele pair is missing from similarities dict then
# fill its slot with NaN, indicating missing data
similarity = curried_sims_dict.get(a, {}).get(b, np.nan)
sims_incomplete_matrix[(ai, bi)] = similarity
return allele_list, allele_order, sims_incomplete_matrix
def save_heatmap(matrix, allele_names, filename):
seaborn.set_context("paper")
with seaborn.plotting_context(font_scale=1):
figure = seaborn.plt.figure(figsize=(20, 18))
ax = figure.add_axes()
heatmap = seaborn.heatmap(
data=matrix,
xticklabels=allele_names,
yticklabels=allele_names,
linewidths=0,
annot=False,
ax=ax,
fmt=".2g")
heatmap.set_xticklabels(labels=allele_names, rotation=45)
heatmap.set_yticklabels(labels=allele_names, rotation=45)
figure.savefig(filename)
def fill_in_similarities(
curried_raw_sims_dict,
allele_to_peptide_to_affinity,
raw_sims_heatmap_path=None,
complete_sims_heatmap_path=None,
curried_overlap_weights=None,
scalar_error_tolerance=0.0001):
"""
Given an incomplete dictionary of pairwise allele similarities and
a dictionary of binding data, generate the completed similarities
"""
allele_list, allele_order, sims_matrix = build_incomplete_similarity_matrix(
allele_to_peptide_to_affinity,
curried_sims_dict=curried_raw_sims_dict)
missing = np.isnan(sims_matrix)
if curried_overlap_weights:
error_tolerance = np.ones_like(sims_matrix)
for allele_a, a_dict in curried_overlap_weights.items():
for allele_b, weight in a_dict.items():
i = allele_order[allele_a]
j = allele_order[allele_b]
error_tolerance[i, j] = 2.0 ** -weight
print(
"-- Error tolerance distribution: min %f, max %f, median %f" % (
error_tolerance[~missing].min(),
error_tolerance[~missing].max(),
np.median(error_tolerance[~missing])))
else:
error_tolerance = scalar_error_tolerance
if raw_sims_heatmap_path:
save_heatmap(
sims_matrix,
allele_list,
raw_sims_heatmap_path)
print("Completing %s similarities matrix with %d missing entries" % (
sims_matrix.shape,
missing.sum()))
solver = NuclearNormMinimization(
require_symmetric_solution=True,
min_value=0.0,
max_value=1.0,
error_tolerance=error_tolerance)
sims_complete_matrix = solver.complete(sims_matrix)
if complete_sims_heatmap_path:
save_heatmap(
sims_complete_matrix,
allele_list,
complete_sims_heatmap_path)
complete_sims_dict = curry_dictionary(
matrix_to_dictionary(
sims=sims_complete_matrix,
allele_list=allele_list))
return complete_sims_dict
def save_csv(filename, sims, overlap_counts, overlap_weights):
"""
Save pairwise allele similarities in CSV file.
Assumes the dictionaries sims, overlap_counts, and overlap_weights are
'curried', meaning that their keys are single allele names which map
to nested dictionaries from other allele names to numeric values.
As a type signature, this means the dictionaries are of the form
(allele -> allele -> number)
and not
(allele, allele) -> number
"""
with open(filename, "w") as f:
f.write("allele_A,allele_B,similarity,count,weight\n")
for (a, a_row) in sorted(sims.items()):
a_counts = overlap_counts.get(a, {})
a_weights = overlap_weights.get(a, {})
for (b, similarity) in sorted(a_row.items()):
count = a_counts.get(b, 0)
weight = a_weights.get(b, 0.0)
f.write("%s,%s,%0.4f,%d,%0.4f\n" % (
a,
b,
similarity,
count,
weight))
def compute_allele_similarities(allele_to_peptide_to_affinity, min_weight=0.1):
"""
Compute pairwise allele similarities from binding data and complete the
similarity matrix for allele pairs which lack sufficient overlapping data.
Returns:
- dictionary of allele similarities
- dictionary of # overlapping peptides between alleles
- dictionary of "weight" of overlapping peptides between alleles
"""
assert isinstance(allele_to_peptide_to_affinity, dict), \
"Wrong type, expected dict but got %s" % (
type(allele_to_peptide_to_affinity),)
raw_sims_dict, overlap_counts, overlap_weights = \
compute_partial_similarities_from_peptide_overlap(
allele_to_peptide_to_affinity=allele_to_peptide_to_affinity,
min_weight=min_weight)
complete_sims_dict = fill_in_similarities(
curried_raw_sims_dict=raw_sims_dict,
allele_to_peptide_to_affinity=allele_to_peptide_to_affinity,
curried_overlap_weights=overlap_weights)
return complete_sims_dict, overlap_counts, overlap_weights
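# Hypothetical usage sketch (not in the commit): the partial-similarity helpers
# return curried dictionaries of the form allele -> allele -> value. The allele
# names and affinities below are invented for illustration.
toy = {
    "HLA-A0201": {"SIINFEKL": 0.9, "GILGFVFTL": 0.3, "LLFGYPVYV": 0.7},
    "HLA-A0203": {"SIINFEKL": 0.8, "GILGFVFTL": 0.4},
    "HLA-B0702": {"LLFGYPVYV": 0.2},
}
toy_sims, toy_overlaps, toy_weights = \
    compute_partial_similarities_from_peptide_overlap(toy, min_weight=0.1)
print(toy_sims["HLA-A0201"].get("HLA-A0203"))  # weighted min/max similarity
print(toy_overlaps["HLA-A0201"]["HLA-B0702"])  # 1 shared peptide
# Pairs whose overlap weight falls below min_weight are absent from toy_sims;
# compute_allele_similarities fills those gaps via nuclear norm minimization.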
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Find best smoothing coefficient for creating synthetic data. The smoothing
coefficient adds artificial weight toward low binding affinities, which dominate
in cases where no similar allele has values for some peptide.
We're determining "best" as most predictive of already known binding affinities,
averaged across all given alleles.
"""
import argparse
from collections import defaultdict
from mhcflurry.data import load_allele_dicts
from scipy import stats
import numpy as np
import sklearn.metrics
from sklearn.cross_validation import KFold
from dataset_paths import PETERS2009_CSV_PATH
from synthetic_data import (
synthesize_affinities_for_single_allele,
create_reverse_lookup_from_simple_dicts
)
from allele_similarities import compute_allele_similarities
parser = argparse.ArgumentParser()
parser.add_argument(
"--binding-data-csv",
default=PETERS2009_CSV_PATH)
parser.add_argument(
"--max-ic50",
default=50000.0,
type=float)
parser.add_argument(
"--only-human",
default=False,
action="store_true")
parser.add_argument(
"--smoothing-coefs",
default=[0.05, 0.025, 0.01, 0.005, 0.0025, 0.001, 0.0005, 0.0001, 0],
type=lambda s: [float(si.strip()) for si in s.split(",")],
help="Smoothing value used for peptides with low weight across alleles")
parser.add_argument(
"--similarity-exponents",
default=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
type=lambda s: [float(si.strip()) for si in s.split(",")],
help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent")
def generate_cross_validation_datasets(
allele_to_peptide_to_affinity,
n_folds=4):
for allele, dataset in sorted(allele_to_peptide_to_affinity.items()):
peptides = list(dataset.keys())
affinities = list(dataset.values())
n_samples = len(peptides)
print("Generating similarities for folds of %s data (n=%d)" % (
allele,
n_samples))
if n_samples < n_folds:
print("Too few samples (%d) for %d-fold cross-validation" % (
n_samples,
n_folds))
continue
kfold_iterator = enumerate(
KFold(n_samples, n_folds=n_folds, shuffle=True))
for fold_number, (train_indices, test_indices) in kfold_iterator:
train_peptides = [peptides[i] for i in train_indices]
train_affinities = [affinities[i] for i in train_indices]
test_peptide_set = set([peptides[i] for i in test_indices])
# copy the affinity data for all alleles other than this one
fold_affinity_dict = {
allele_key: affinity_dict
for (allele_key, affinity_dict)
in allele_to_peptide_to_affinity.items()
if allele_key != allele
}
# include an affinity dictionary for this allele which
# only uses training data
fold_affinity_dict[allele] = {
train_peptide: train_affinity
for (train_peptide, train_affinity)
in zip(train_peptides, train_affinities)
}
allele_similarities, overlap_counts, overlap_weights = \
compute_allele_similarities(
allele_to_peptide_to_affinity=fold_affinity_dict,
min_weight=0.1)
this_allele_similarities = allele_similarities[allele]
yield (
allele,
dataset,
fold_number,
this_allele_similarities,
test_peptide_set
)
def evaluate_synthetic_data(
allele_to_peptide_to_affinity,
smoothing_coef,
exponent,
max_ic50,
n_folds=4):
"""
Use cross-validation over entries of each allele to determine the predictive
accuracy of data synthesis on known affinities.
"""
peptide_to_affinities = create_reverse_lookup_from_simple_dicts(
allele_to_peptide_to_affinity)
taus = defaultdict(list)
f1_scores = defaultdict(list)
aucs = defaultdict(list)
for (allele, allele_dataset, fold_num, fold_sims, fold_test_peptides) in \
generate_cross_validation_datasets(
allele_to_peptide_to_affinity,
n_folds=n_folds):
# create a peptide -> list[(allele, affinity, weight)] dictionary
# restricted only to the peptides for which we want to test accuracy
test_reverse_lookup = {
peptide: triplets
for (peptide, triplets) in peptide_to_affinities.items()
if peptide in fold_test_peptides
}
synthetic_values = synthesize_affinities_for_single_allele(
similarities=fold_sims,
peptide_to_affinities=test_reverse_lookup,
smoothing=smoothing_coef,
exponent=exponent,
exclude_alleles=[allele])
synthetic_peptide_set = set(synthetic_values.keys())
# set of peptides for which we have both true and synthetic
# affinity values
combined_peptide_set = synthetic_peptide_set.intersection(
fold_test_peptides)
combined_peptide_list = list(sorted(combined_peptide_set))
if len(combined_peptide_list) < 2:
print("Too few peptides in combined set %s for fold %d/%d of %s" % (
combined_peptide_list,
fold_num + 1,
n_folds,
allele))
continue
synthetic_affinity_list = [
synthetic_values[peptide]
for peptide in combined_peptide_list
]
true_affinity_list = [
allele_to_peptide_to_affinity[allele][peptide]
for peptide in combined_peptide_list
]
assert len(true_affinity_list) == len(synthetic_affinity_list)
if all(x == true_affinity_list[0] for x in true_affinity_list):
continue
tau, _ = stats.kendalltau(
synthetic_affinity_list,
true_affinity_list)
assert not np.isnan(tau)
print("==> %s (CV fold %d/%d) tau = %f" % (
allele,
fold_num + 1,
n_folds,
tau))
taus[allele].append(tau)
true_ic50s = max_ic50 ** (1.0 - np.array(true_affinity_list))
predicted_ic50s = max_ic50 ** (1.0 - np.array(synthetic_affinity_list))
true_binding_label = true_ic50s <= 500
if true_binding_label.all() or not true_binding_label.any():
# can't compute AUC or F1 without both negative and positive cases
continue
auc = sklearn.metrics.roc_auc_score(
true_binding_label,
synthetic_affinity_list)
print("==> %s (CV fold %d/%d) AUC = %f" % (
allele,
fold_num + 1,
n_folds,
auc))
aucs[allele].append(auc)
predicted_binding_label = predicted_ic50s <= 500
if predicted_binding_label.all() or not predicted_binding_label.any():
# can't compute F1 without both positive and negative predictions
continue
f1_score = sklearn.metrics.f1_score(
true_binding_label,
predicted_binding_label)
print("==> %s (CV fold %d/%d) F1 = %f" % (
allele,
fold_num + 1,
n_folds,
f1_score))
f1_scores[allele].append(f1_score)
return taus, aucs, f1_scores
if __name__ == "__main__":
args = parser.parse_args()
print(args)
allele_to_peptide_to_affinity = load_allele_dicts(
args.binding_data_csv,
max_ic50=args.max_ic50,
only_human=args.only_human,
regression_output=True)
n_binding_values = sum(
len(allele_dict)
for allele_dict in
allele_to_peptide_to_affinity.values()
)
print("Loaded %d binding values for %d alleles" % (
n_binding_values, len(allele_to_peptide_to_affinity)))
results = {}
for smoothing_coef in args.smoothing_coefs:
for exponent in args.similarity_exponents:
taus, aucs, f1_scores = evaluate_synthetic_data(
allele_to_peptide_to_affinity=allele_to_peptide_to_affinity,
smoothing_coef=smoothing_coef,
exponent=exponent,
max_ic50=args.max_ic50)
allele_keys = list(sorted(taus.keys()))
tau_allele_means = [
np.mean(taus[allele])
for allele in allele_keys
if taus[allele]
]
auc_allele_means = [
np.mean(aucs[allele])
for allele in allele_keys
if aucs[allele]
]
f1_score_means = [
np.mean(f1_scores[allele])
for allele in allele_keys
if f1_scores[allele]
]
median_tau = np.median(tau_allele_means)
median_f1 = np.median(f1_score_means)
median_auc = np.median(auc_allele_means)
print("\n\n::::::\n")
print(
"Exp=%f, Coef=%f, tau=%0.4f, AUC = %0.4f, F1 = %0.4f" % (
exponent,
smoothing_coef,
median_tau,
median_auc,
median_f1))
print("\n^^^^^^\n")
scores = (median_tau, median_auc, median_f1)
results[(exponent, smoothing_coef)] = scores
print("===")
def combine_item_scores(x):
return x[1][0] * x[1][1] * x[1][2]
for ((best_exponent, best_coef), (median_tau, median_auc, median_f1)) in \
sorted(results.items(), key=combine_item_scores):
print("-- exponent = %f, coef = %f (tau=%0.4f, AUC=%0.4f, F1=%0.4f)" % (
best_exponent,
best_coef,
median_tau,
median_auc,
median_f1))
((best_exponent, best_coef), (median_tau, median_auc, median_f1)) = max(
results.items(),
key=combine_item_scores)
print("Best exponent = %f, coef = %f (tau=%0.4f, AUC=%0.4f, F1=%0.4f)" % (
best_exponent,
best_coef,
median_tau,
median_auc,
median_f1))
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
import pandas as pd
def curry_dictionary(key_pair_dict, default_value=0.0):
"""
Transform a dictionary keyed by (key_a, key_b) pairs into a nested mapping
key_a -> key_b -> float
"""
result = defaultdict(dict)
for (a, b), value in key_pair_dict.items():
result[a][b] = value
return result
def uncurry_dictionary(curried_dict):
"""
Transform dictionary from (key_a -> key_b -> float) to
(key_a, key_b) -> float
"""
result = {}
for a, a_dict in curried_dict.items():
for b, value in a_dict.items():
result[(a, b)] = value
return result
def matrix_to_dictionary(sims, allele_list):
sims_dict = {}
for i in range(sims.shape[0]):
a = allele_list[i]
for j in range(sims.shape[1]):
b = allele_list[j]
sims_dict[a, b] = sims[i, j]
return sims_dict
def load_csv_binding_data_as_dict(
csv_path,
mhc_column_name="mhc",
peptide_column_name="sequence",
ic50_column_name="ic50"):
"""
Given a path to a CSV file containing peptide-MHC binding data,
load it as a dictionary mapping each allele to a dictionary of peptide -> IC50
"""
df = pd.read_csv(csv_path)
print("-- Read %d rows from %s" % (len(df), csv_path))
return {
allele: {
peptide: ic50
for (peptide, ic50)
in zip(group[peptide_column_name], group[ic50_column_name])
}
for allele, group in df.groupby(mhc_column_name)
}
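# Illustrative round trip (not part of the commit): the same similarity values
# in pair-keyed form and in curried (nested) form.
pair_keyed = {("HLA-A0201", "HLA-A0203"): 0.8, ("HLA-A0203", "HLA-A0201"): 0.8}
curried = curry_dictionary(pair_keyed)
assert curried["HLA-A0201"]["HLA-A0203"] == 0.8
assert uncurry_dictionary(curried) == pair_keyed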
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import fancyimpute
import numpy as np
import mhcflurry
from dataset_paths import PETERS2009_CSV_PATH
from allele_similarities import (
compute_partial_similarities_from_peptide_overlap,
fill_in_similarities,
save_csv
)
parser = argparse.ArgumentParser()
@@ -13,204 +32,96 @@ parser.add_argument(
default=PETERS2009_CSV_PATH)
parser.add_argument(
"--fill-missing-values",
action="store_true",
default=False)
"--min-overlap-weight",
default=0.05,
help="Minimum overlap weight between pair of alleles",
type=float)
parser.add_argument(
"--max-ic50",
default=50000.0,
type=float)
parser.add_argument(
"--min-overlap-weight",
default=2.0,
help="Minimum overlap weight between pair of alleles")
"--only-human",
default=False,
action="store_true")
parser.add_argument(
"--max-ic50",
default=50000.0)
"--raw-similarities-output-path",
help="CSV file which contains incomplete allele similarities")
parser.add_argument(
"--output-csv",
required=True)
def binary_encode(X, n_indices=20):
n_cols = X.shape[1]
X_encode = np.zeros( (len(X), n_indices * n_cols), dtype=float)
for i in range(len(X)):
for col_idx in range(n_cols):
X_encode[i, col_idx*n_indices + X[i, col_idx]] = True
return X_encode
def prepare_incomplete_matrix(datasets_dict, prefix=None, allele_name_length=5):
if allele_name_length:
datasets_dict = {k:d for (k,d) in datasets_dict.items() if len(k) == allele_name_length}
if prefix:
datasets_dict = {k:d for (k,d) in datasets_dict.items() if k.startswith(prefix)}
if len(datasets_dict) == 0:
raise ValueError("No alleles matched criteria")
all_peptides = set([])
allele_list = []
for k,d in sorted(datasets_dict.items()):
print(k, len(d.peptides))
allele_list.append(k)
for p in d.peptides:
all_peptides.add(p)
peptide_list = list(sorted(all_peptides))
allele_order = {a:i for (i,a) in enumerate(allele_list)}
peptide_order = {p:i for (i,p) in enumerate(peptide_list)}
n_alleles = len(allele_order)
n_peptides = len(peptide_order)
incomplete_affinity_matrix = np.ones((n_alleles, n_peptides), dtype=float) * np.nan
for k,d in datasets_dict.items():
if k not in allele_order:
continue
i = allele_order[k]
for p,y in zip(d.peptides, d.Y):
j = peptide_order[p]
incomplete_affinity_matrix[i,j] = y
print("Total # alleles = %d, peptides = %d" % (n_alleles, n_peptides))
return incomplete_affinity_matrix, allele_order, peptide_order
def get_extra_data(allele, train_datasets, expanded_predictions):
original_dataset = train_datasets[allele]
original_peptides = set(original_dataset.peptides)
expanded_allele_affinities = expanded_predictions[allele]
extra_affinities = {k: v for (k, v) in expanded_allele_affinities.items() if k not in original_peptides}
extra_peptides = list(extra_affinities.keys())
extra_values = list(extra_affinities.values())
extra_X = mhcflurry.data_helpers.index_encoding(extra_peptides, peptide_length=9)
extra_Y = np.array(extra_values)
return extra_X, extra_Y
def WHO_KNOWS_WHAT():
smoothing = 0.005
exponent = 2
all_predictions = defaultdict(dict)
for allele, allele_idx in allele_order.items():
for peptide in peptide_order.keys():
predictions = reverse_lookups[peptide]
total_value = 0.0
total_denom = 0.0
for (other_allele, y) in predictions:
key = (allele,other_allele)
if key not in completed_sims_dict:
continue
sim = completed_sims_dict[key]
weight = sim ** exponent
total_value += weight * y
total_denom += weight
if total_denom > 0.0:
all_predictions[allele][peptide] = total_value / (smoothing + total_denom)
def data_augmentation(X, Y, extra_X, extra_Y,
fraction=0.5,
niters=10,
extra_sample_weight=0.1,
nb_epoch=50,
nn=True,
hidden_layer_size=5):
n = len(Y)
aucs = []
f1s = []
n_originals = []
for _ in range(niters):
mask = np.random.rand(n) <= fraction
X_train = X[mask]
X_test = X[~mask]
Y_train = Y[mask]
Y_test = Y[~mask]
test_ic50 = 20000 ** (1-Y_test)
test_label = test_ic50 <= 500
if test_label.all() or not test_label.any():
continue
n_original = mask.sum()
print("Keeping %d original training samples" % n_original)
X_train_combined = np.vstack([X_train, extra_X])
Y_train_combined = np.concatenate([Y_train, extra_Y])
print("Combined shape: %s" % (X_train_combined.shape,))
assert len(X_train_combined) == len(Y_train_combined)
# initialize weights to count synthesized and actual data equally
# but weight on synthesized points will decay across training epochs
weight = np.ones(len(Y_train_combined))
if nn:
model = mhcflurry.feedforward.make_embedding_network(
layer_sizes=[hidden_layer_size],
embedding_output_dim=10,
activation="tanh")
for i in range(nb_epoch):
# decay weight as training progresses
weight[n_original:] = extra_sample_weight if extra_sample_weight is not None else 1.0 / (i+1)**2
model.fit(
X_train_combined,
Y_train_combined,
sample_weight=weight,
shuffle=True,
nb_epoch=1,
verbose=0)
pred = model.predict(X_test)
else:
model = sklearn.linear_model.Ridge(alpha=5)
X_train_combined_binary = binary_encode(X_train_combined)
X_test_binary = binary_encode(X_test)
model.fit(X_train_combined_binary, Y_train_combined, sample_weight=weight)
pred = model.predict(X_test_binary)
pred_ic50 = 20000 ** (1-pred)
pred_label = pred_ic50 <= 500
mse = sklearn.metrics.mean_squared_error(Y_test, pred)
auc = sklearn.metrics.roc_auc_score(test_label, pred)
if pred_label.all() or not pred_label.any():
f1 = 0
else:
f1 = sklearn.metrics.f1_score(test_label, pred_label)
print("MSE=%0.4f, AUC=%0.4f, F1=%0.4f" % (mse, auc, f1))
n_originals.append(n_original)
aucs.append(auc)
f1s.append(f1)
return aucs, f1s, n_originals
"--complete-similarities-output-path",
help="CSV file which contains allele similarities after matrix completion")
parser.add_argument(
"--raw-heatmap-output-path",
help="PNG file to save heatmap of incomplete similarities matrix")
parser.add_argument(
"--complete-heatmap-output-path",
help="PNG file to save heatmap of complete similarities matrix")
def print_dataset_sizes(allele_groups):
print("---\nDataset Sizes\n---")
for (allele_name, g) in sorted(allele_groups.items()):
print("%s: total=%d, 8mer=%d, 9mer=%d, 10mer=%d, 11mer=%d" % (
allele_name,
len(g),
sum(len(k) == 8 for k in g.keys()),
sum(len(k) == 9 for k in g.keys()),
sum(len(k) == 10 for k in g.keys()),
sum(len(k) == 11 for k in g.keys()),
))
print("---")
if __name__ == "__main__":
args = parser.parse_args()
datasets = mhcflurry.data_helpers.load_data(
print(args)
df, peptide_column_name = mhcflurry.data.load_dataframe(
args.binding_data_csv,
binary_encoding=True,
max_ic50=args.max_ic50)
sims = {}
overlaps = {}
weights = {}
for allele_name_a, da in train_datasets.items():
if len(allele_name_a) != 5:
continue
y_dict_a = {peptide: y for (peptide,y) in zip(da.peptides, da.Y)}
seta = set(da.peptides)
for allele_name_b, db in train_datasets.items():
if len(allele_name_b) != 5:
continue
y_dict_b = {peptide: y for (peptide,y) in zip(db.peptides, db.Y)}
setb = set(db.peptides)
intersection = seta.intersection(setb)
overlaps[a,b] = len(intersection)
total = 0.0
weight = 0.0
for peptide in intersection:
ya = y_dict_a[peptide]
yb = y_dict_b[peptide]
minval = min(ya, yb)
maxval = max(ya, yb)
total += minval
weight += maxval
weights[a,b] = weight
if weight > min_weight:
sims[allele_name_a, allele_name_b] = total / weight
else:
sims[allele_name_a, allele_name_b] = np.nan
if args.fill_missing_values:
sims_array = np.zeros((n_alleles, n_alleles), dtype=float)
for i, a in enumerate(allele_names):
for j, b in enumerate(allele_names):
sims_array[i,j] = sims[(a,b)]
max_ic50=args.max_ic50,
only_human=args.only_human)
allele_groups = {
allele_name: {
row[peptide_column_name]: row["regression_output"]
for (_, row) in group.iterrows()
}
for (allele_name, group) in df.groupby("mhc")
}
print_dataset_sizes(allele_groups)
raw_sims_dict, overlap_counts, overlap_weights = \
compute_partial_similarities_from_peptide_overlap(
allele_groups,
min_weight=args.min_overlap_weight)
if args.raw_similarities_output_path:
save_csv(
args.raw_similarities_output_path,
raw_sims_dict,
overlap_counts,
overlap_weights)
complete_sims_dict = fill_in_similarities(
curried_raw_sims_dict=raw_sims_dict,
allele_to_peptide_to_affinity=allele_groups,
raw_sims_heatmap_path=args.raw_heatmap_output_path,
complete_sims_heatmap_path=args.complete_heatmap_output_path)
n_total_entries = sum(len(row) for row in complete_sims_dict.values())
n_original_entries = sum(len(row) for row in raw_sims_dict.values())
print("-- Added %d/%d allele similarities" % (
n_total_entries - n_original_entries,
n_total_entries
))
if args.complete_similarities_output_path:
save_csv(
args.complete_similarities_output_path,
complete_sims_dict,
overlap_counts,
overlap_weights)
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from sklearn.cross_validation import KFold
def generate_cross_validation_datasets(
allele_to_peptide_to_affinity,
n_folds=4):
for allele, dataset in sorted(allele_to_peptide_to_affinity.items()):
peptides = list(dataset.keys())
affinities = list(dataset.values())
n_samples = len(peptides)
print("Generating similarities for folds of %s data (n=%d)" % (
allele,
n_samples))
if n_samples < n_folds:
print("Too few samples (%d) for %d-fold cross-validation" % (
n_samples,
n_folds))
continue
kfold_iterator = enumerate(
KFold(n_samples, n_folds=n_folds, shuffle=True))
for fold_number, (train_indices, test_indices) in kfold_iterator:
train_peptides = [peptides[i] for i in train_indices]
train_affinities = [affinities[i] for i in train_indices]
test_peptide_set = set([peptides[i] for i in test_indices])
# copy the affinity data for all alleles other than this one
fold_affinity_dict = {
allele_key: affinity_dict
for (allele_key, affinity_dict)
in allele_to_peptide_to_affinity.items()
if allele_key != allele
}
# include an affinity dictionary for this allele which
# only uses training data
fold_affinity_dict[allele] = {
train_peptide: train_affinity
for (train_peptide, train_affinity)
in zip(train_peptides, train_affinities)
}
allele_similarities, overlap_counts, overlap_weights = \
compute_allele_similarities(
allele_to_peptide_to_affinity=fold_affinity_dict,
min_weight=0.1)
this_allele_similarities = allele_similarities[allele]
yield (
allele,
dataset,
fold_number,
this_allele_similarities,
test_peptide_set
)
\ No newline at end of file
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from fancyimpute import (
SoftImpute,
IterativeSVD,
SimilarityWeightedAveraging,
KNN,
MICE,
BiScaler,
SimpleFill,
)
import numpy as np
import pandas as pd
from dataset_paths import PETERS2009_CSV_PATH
from score_set import ScoreSet
from matrix_completion_helpers import (
load_data,
evaluate_predictions,
stratified_cross_validation
)
parser = argparse.ArgumentParser()
parser.add_argument(
"--binding-data-csv",
default=PETERS2009_CSV_PATH)
parser.add_argument(
"--max-ic50",
default=50000.0,
type=float)
parser.add_argument(
"--save-incomplete-affinity-matrix",
default=None,
help="Path to CSV which will contains the incomplete affinity matrix")
parser.add_argument(
"--only-human",
default=False,
action="store_true")
parser.add_argument(
"--output-file",
default="matrix-completion-accuracy-results.csv")
parser.add_argument(
"--normalize-columns",
default=False,
action="store_true",
help="Center and rescale columns of affinity matrix")
parser.add_argument(
"--normalize-rows",
default=False,
action="store_true",
help="Center and rescale rows of affinity matrix")
parser.add_argument(
"--min-observations-per-peptide",
type=int,
default=2,
help="Drop peptide entries with fewer than this number of measured affinities")
parser.add_argument(
"--n-folds",
default=10,
type=int,
help="Number of cross-validation folds")
parser.add_argument(
"--verbose",
default=False,
action="store_true")
def create_imputation_methods(
verbose=False,
clip_imputed_values=False,
knn_print_interval=20,
knn_params=[1, 3, 5],
softimpute_params=[1, 5, 10],
svd_params=[5, 10, 20]):
min_value = 0 if clip_imputed_values else None
max_value = 1 if clip_imputed_values else None
result_dict = {
"meanFill": SimpleFill("mean"),
"zeroFill": SimpleFill("zero"),
"mice": MICE(
n_burn_in=5,
n_imputations=25,
min_value=min_value,
max_value=max_value,
verbose=verbose),
"similarityWeightedAveraging": SimilarityWeightedAveraging(
orientation="columns",
verbose=verbose),
}
for threshold in softimpute_params:
result_dict["softImpute-%d" % threshold] = SoftImpute(
threshold,
verbose=verbose,
min_value=min_value,
max_value=max_value)
for rank in svd_params:
result_dict["svdImpute-%d" % rank] = IterativeSVD(
rank,
verbose=verbose,
min_value=min_value,
max_value=max_value)
for k in knn_params:
result_dict["knnImpute-%d" % k] = KNN(
k,
orientation="columns",
verbose=verbose,
print_interval=knn_print_interval)
return result_dict
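# Hypothetical smoke test (not in the commit) of the interface this script
# assumes: each imputation method exposes .complete() on a matrix whose missing
# entries are NaN (per the fancyimpute version this repo was written against).
toy_matrix = np.array([
    [0.2, 0.8, np.nan],
    [0.3, np.nan, 0.5],
    [np.nan, 0.7, 0.6],
])
mean_filled = create_imputation_methods(verbose=False)["meanFill"].complete(toy_matrix)
assert np.isfinite(mean_filled).all()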
if __name__ == "__main__":
args = parser.parse_args()
print(args)
imputation_methods = create_imputation_methods(
verbose=args.verbose,
clip_imputed_values=not (args.normalize_rows or args.normalize_columns),
)
print("Imputation methods: %s" % imputation_methods)
X, peptide_list, allele_list = load_data(
binding_data_csv=args.binding_data_csv,
max_ic50=args.max_ic50,
only_human=args.only_human,
min_observations_per_allele=args.n_folds,
min_observations_per_peptide=args.min_observations_per_peptide)
observed_mask = np.isfinite(X)
print("Loaded binding data, shape: %s, n_observed=%d/%d (%0.2f%%)" % (
X.shape,
observed_mask.sum(),
X.size,
100.0 * observed_mask.sum() / X.size))
if args.save_incomplete_affinity_matrix:
print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix)
df = pd.DataFrame(X, columns=allele_list, index=peptide_list)
df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide")
scores = ScoreSet()
kfold = stratified_cross_validation(
X=X,
observed_mask=observed_mask,
n_folds=args.n_folds)
for fold_idx, (X_fold, ok_mesh, test_coords, X_test_vector) in enumerate(kfold):
X_fold_reduced = X_fold[ok_mesh]
biscaler = BiScaler(
scale_rows=args.normalize_rows,
center_rows=args.normalize_rows,
scale_columns=args.normalize_columns,
center_columns=args.normalize_columns)
X_fold_reduced_scaled = biscaler.fit_transform(X=X_fold_reduced)
for (method_name, solver) in sorted(imputation_methods.items()):
print("CV fold %d/%d, running %s" % (
fold_idx + 1,
args.n_folds,
method_name))
X_completed_reduced_scaled = solver.complete(X_fold_reduced)
X_completed_reduced = biscaler.inverse_transform(
X_completed_reduced_scaled)
X_completed = np.zeros_like(X)
X_completed[ok_mesh] = X_completed_reduced
y_pred = X_completed[test_coords]
mae, tau, auc, f1_score = evaluate_predictions(
y_true=X_test_vector, y_pred=y_pred, max_ic50=args.max_ic50)
scores.add_many(
method_name,
mae=mae,
tau=tau,
f1_score=f1_score,
auc=auc)
scores.to_csv(args.output_file)
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from fancyimpute.dictionary_helpers import (
dense_matrix_from_nested_dictionary,
transpose_nested_dictionary,
)
from mhcflurry.data import load_allele_dicts
import sklearn.metrics
from sklearn.cross_validation import StratifiedKFold
from scipy import stats
import numpy as np
def evaluate_predictions(
y_true,
y_pred,
max_ic50):
"""
Return mean absolute error, Kendall tau, AUC and F1 score
"""
if len(y_pred) != len(y_true):
raise ValueError("y_pred must have same number of elements as y_true")
mae = np.mean(np.abs(y_true - y_pred))
# if all predictions are the same
if (y_pred[0] == y_pred).all():
logging.warn("All predicted values were the same: %f" % y_pred[0])
return (mae, 0, 0.5, 0)
tau, _ = stats.kendalltau(
y_pred,
y_true)
if np.isnan(tau):
logging.warn("Value for Kendall's tau was NaN (n=%d)" % (len(y_true),))
tau = 0.0
true_ic50s = max_ic50 ** (1.0 - np.array(y_true))
predicted_ic50s = max_ic50 ** (1.0 - np.array(y_pred))
true_binding_label = true_ic50s <= 500
if true_binding_label.all():
logging.warn("All true labels are binders, can't compute AUC")
return (mae, tau, 0.5, 0)
elif not true_binding_label.any():
logging.warn("No true labels are binders, can't compute AUC")
# can't compute AUC or F1 without both negative and positive cases
return (mae, tau, 0.5, 0)
auc = sklearn.metrics.roc_auc_score(true_binding_label, y_pred)
predicted_binding_label = predicted_ic50s <= 500
if predicted_binding_label.all():
logging.warn("All predicted labels are binders, can't compute F1")
return (mae, tau, auc, 0)
elif not predicted_binding_label.any():
logging.warn("No predicted labels are binders, can't compute F1")
# can't compute F1 without both positive and negative predictions
return (mae, tau, auc, 0)
f1_score = sklearn.metrics.f1_score(
true_binding_label,
predicted_binding_label)
return mae, tau, auc, f1_score
def prune_data(
X,
peptide_list,
allele_list,
min_observations_per_peptide=1,
min_observations_per_allele=1):
observed_mask = np.isfinite(X)
n_observed_per_peptide = observed_mask.sum(axis=1)
too_few_peptide_observations = (
n_observed_per_peptide < min_observations_per_peptide)
if too_few_peptide_observations.any():
drop_peptide_indices = np.where(too_few_peptide_observations)[0]
keep_peptide_indices = np.where(~too_few_peptide_observations)[0]
print("Dropping %d peptides with <%d observations" % (
len(drop_peptide_indices),
min_observations_per_peptide))
X = X[keep_peptide_indices]
observed_mask = observed_mask[keep_peptide_indices]
peptide_list = [peptide_list[i] for i in keep_peptide_indices]
n_observed_per_allele = observed_mask.sum(axis=0)
too_few_allele_observations = (
n_observed_per_allele < min_observations_per_allele)
if too_few_allele_observations.any():
drop_allele_indices = np.where(too_few_allele_observations)[0]
keep_allele_indices = np.where(~too_few_allele_observations)[0]
print("Dropping %d alleles with <%d observations: %s" % (
len(drop_allele_indices),
min_observations_per_allele,
[allele_list[i] for i in drop_allele_indices]))
X = X[:, keep_allele_indices]
observed_mask = observed_mask[:, keep_allele_indices]
allele_list = [allele_list[i] for i in keep_allele_indices]
return X, peptide_list, allele_list
def load_data(
binding_data_csv,
max_ic50,
only_human=False,
min_observations_per_allele=1,
min_observations_per_peptide=1):
allele_to_peptide_to_affinity = load_allele_dicts(
binding_data_csv,
max_ic50=max_ic50,
only_human=only_human,
regression_output=True)
peptide_to_allele_to_affinity = transpose_nested_dictionary(
allele_to_peptide_to_affinity)
n_binding_values = sum(
len(allele_dict)
for allele_dict in
allele_to_peptide_to_affinity.values()
)
print("Loaded %d binding values for %d alleles" % (
n_binding_values,
len(allele_to_peptide_to_affinity)))
X, peptide_list, allele_list = \
dense_matrix_from_nested_dictionary(peptide_to_allele_to_affinity)
return prune_data(
X,
peptide_list,
allele_list,
min_observations_per_peptide=min_observations_per_peptide,
min_observations_per_allele=min_observations_per_allele)
def index_counts(indices):
max_index = indices.max()
counts = np.zeros(max_index + 1, dtype=int)
for index in indices:
counts[index] += 1
return counts
def stratified_cross_validation(X, observed_mask, n_folds=10):
n_observed = observed_mask.sum()
(observed_peptide_index, observed_allele_index) = np.where(observed_mask)
observed_indices = np.ravel_multi_index(
(observed_peptide_index, observed_allele_index),
dims=observed_mask.shape)
assert len(observed_indices) == n_observed
observed_allele_counts = observed_mask.sum(axis=0)
print("# observed per allele: %s" % (observed_allele_counts,))
assert (index_counts(observed_allele_index) == observed_allele_counts).all()
kfold = StratifiedKFold(
observed_allele_index,
n_folds=n_folds,
shuffle=True)
for (_, indirect_test_indices) in kfold:
test_linear_indices = observed_indices[indirect_test_indices]
test_coords = np.unravel_index(
test_linear_indices,
dims=observed_mask.shape)
test_allele_counts = index_counts(test_coords[1])
allele_fractions = test_allele_counts / observed_allele_counts.astype(float)
print("Fraction of each allele in this CV fold: %s" % (allele_fractions,))
X_test_vector = X[test_coords]
X_fold = X.copy()
X_fold[test_coords] = np.nan
empty_row_mask = np.isfinite(X_fold).sum(axis=1) == 0
ok_row_mask = ~empty_row_mask
ok_row_indices = np.where(ok_row_mask)[0]
empty_col_mask = np.isfinite(X_fold).sum(axis=0) == 0
ok_col_mask = ~empty_col_mask
ok_col_indices = np.where(ok_col_mask)[0]
ok_mesh = np.ix_(ok_row_indices, ok_col_indices)
print("Dropping %d empty rows, %d empty columns" % (
empty_row_mask.sum(),
empty_col_mask.sum()))
yield (X_fold, ok_mesh, test_coords, X_test_vector)
@@ -143,6 +143,8 @@ parser.add_argument(
help="Comma separated list of optimization methods")
if __name__ == "__main__":
args = parser.parse_args()
configs = generate_all_model_configs(
......
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
import argparse
import pandas as pd
from dataset_paths import PETERS2009_CSV_PATH
from allele_similarities import compute_allele_similarities
from synthetic_data import (
create_reverse_lookup_from_simple_dicts,
synthesize_affinities_for_all_alleles,
load_sims_dict
)
from mhcflurry.data import load_allele_dicts
parser = argparse.ArgumentParser()
parser.add_argument(
"--binding-data-csv",
default=PETERS2009_CSV_PATH)
parser.add_argument(
"--output-csv",
help="Path for CSV containing synthesized dataset of affinities",
required=True)
parser.add_argument(
"--max-ic50",
default=50000.0,
type=float)
parser.add_argument(
"--only-human",
default=False,
action="store_true")
parser.add_argument(
"--allele-similarity-csv",
required=False)
parser.add_argument(
"--smoothing-coef",
default=0.001,
type=float,
help="Smoothing value used for peptides with low weight across alleles")
parser.add_argument(
"--similarity-exponent",
default=2.0,
type=float,
help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent")
if __name__ == "__main__":
args = parser.parse_args()
print(args)
allele_to_peptide_to_affinity = load_allele_dicts(
args.binding_data_csv,
max_ic50=args.max_ic50,
only_human=args.only_human,
regression_output=True)
print("Loaded binding data for %d alleles" % (
len(allele_to_peptide_to_affinity),))
reverse_lookup = create_reverse_lookup_from_simple_dicts(
allele_to_peptide_to_affinity)
print("Created reverse lookup dictionary for %d peptides" % len(reverse_lookup))
if args.allele_similarity_csv:
sims_dict = load_sims_dict(
args.allele_similarity_csv,
allele_pair_keys=False)
allele_names = list(sims_dict.keys())
print("Loaded similarities between %d alleles from %s" % (
len(allele_names),
args.allele_similarity_csv))
else:
sims_dict, _, _ = \
compute_allele_similarities(
allele_to_peptide_to_affinity,
min_weight=0.1)
synthetic_data = synthesize_affinities_for_all_alleles(
peptide_to_affinities=reverse_lookup,
pairwise_allele_similarities=sims_dict,
smoothing=args.smoothing_coef,
exponent=args.similarity_exponent,
allele_pair_keys=False)
synthetic_data_dict = OrderedDict([
("mhc", []),
("sequence", []),
("ic50", []),
])
for allele, allele_entries in sorted(synthetic_data.items()):
print("%s (n=%d)" % (allele, len(allele_entries)))
for (peptide, regression_value) in allele_entries.items():
synthetic_data_dict["mhc"].append(allele)
synthetic_data_dict["sequence"].append(peptide)
ic50 = args.max_ic50 ** (1.0 - regression_value)
synthetic_data_dict["ic50"].append(ic50)
synthetic_data_df = pd.DataFrame(synthetic_data_dict)
print("Created dataset with %d synthesized pMHC affinities" % (
len(synthetic_data_df),))
synthetic_data_df.to_csv(args.output_csv, index=False)
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Helper functions for creating synthetic pMHC affinities from allele similarities
and true binding data
"""
from collections import defaultdict
import pandas as pd
from common import curry_dictionary
def load_sims_dict(csv_path, allele_pair_keys=True):
"""
Read in allele-to-allele similarities as DataFrame
"""
sims_df = pd.read_csv(csv_path)
# extract dictionary mapping pairs of alleles to similarity
sims_dict = {
(row["allele_A"], row["allele_B"]): row["similarity"]
for (_, row)
in sims_df.iterrows()
}
if not allele_pair_keys:
return curry_dictionary(sims_dict)
else:
return sims_dict
def synthesize_affinities_for_single_allele(
similarities,
peptide_to_affinities,
smoothing=0.005,
exponent=2.0,
exclude_alleles=[]):
"""
Parameters
----------
similarities : dict
Dictionary mapping allele names to their similarity to the
allele of interest.
peptide_to_affinities : dict
Dictionary mapping peptide sequence to list of triplets containing
the following fields:
- allele
- log-scaled affinity normalized to 0.0 = no binding, 1.0 = strong
- sample weight
smoothing : float
exponent : float
exclude_alleles : collection of allele names
Don't use these alleles for synthesizing new affinities
Returns dictionary mapping peptide sequences to affinities.
"""
assert isinstance(similarities, dict), \
"Wrong type for similarities: %s" % type(similarities)
results = {}
for peptide, affinities in peptide_to_affinities.items():
total = 0.0
denom = 0.0
for entry in affinities:
if len(entry) == 2:
(allele, y) = entry
sample_weight = 1.0
else:
assert len(entry) == 3
(allele, y, sample_weight) = entry
if allele in exclude_alleles:
continue
sim = similarities.get(allele, 0)
if sim == 0:
continue
combined_weight = sim ** exponent * sample_weight
total += combined_weight * y
denom += combined_weight
if denom > 0.0:
results[peptide] = total / (smoothing + denom)
return results
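# Worked toy example (not part of the commit) of the weighting above: two
# neighbor alleles report affinities 0.9 and 0.5 for the same peptide.
toy_sims = {"HLA-A0202": 0.8, "HLA-A0301": 0.4}
toy_lookup = {"SIINFEKL": [("HLA-A0202", 0.9, 1.0), ("HLA-A0301", 0.5, 1.0)]}
toy_out = synthesize_affinities_for_single_allele(
    toy_sims, toy_lookup, smoothing=0.005, exponent=2.0)
# weights: 0.8 ** 2 = 0.64 and 0.4 ** 2 = 0.16
# numerator   = 0.64 * 0.9 + 0.16 * 0.5 = 0.656
# denominator = 0.005 + 0.64 + 0.16    = 0.805
# so the synthesized affinity is roughly 0.656 / 0.805 = 0.815
print(toy_out["SIINFEKL"])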
def create_reverse_lookup_from_allele_data_objects(allele_datasets):
"""
Given a dictionary mapping each allele name to an AlleleData object,
create reverse-lookup dictionary mapping each peptide to a list of triplets:
[(allele, regression_output, weight), ...]
"""
peptide_affinities_dict = defaultdict(list)
for allele, dataset in allele_datasets.items():
for peptide, y, weight in zip(
dataset.peptides, dataset.Y, dataset.weights):
entry = (allele, y, weight)
peptide_affinities_dict[peptide].append(entry)
return peptide_affinities_dict
def create_reverse_lookup_from_simple_dicts(affinities):
"""
Create a lookup table from peptides to lists of (allele, affinity, weight)
"""
reverse_lookup = defaultdict(list)
for allele, affinity_dict in affinities.items():
for (peptide, affinity) in affinity_dict.items():
reverse_lookup[peptide].append((allele, affinity, 1.0))
return reverse_lookup
def synthesize_affinities_for_all_alleles(
peptide_to_affinities,
pairwise_allele_similarities,
allele_pair_keys=True,
smoothing=0.005,
exponent=2.0):
"""
peptide_to_affinities : dict
Maps each peptide to list of either
(allele, affinity) or (allele, affinity, weight)
pairwise_allele_similarities : dict
Dictionary from allele -> allele -> value between 0..1
smoothing : float
exponent : float
"""
if allele_pair_keys:
pairwise_allele_similarities = curry_dictionary(
pairwise_allele_similarities)
all_predictions = {}
allele_names = set(pairwise_allele_similarities.keys())
for allele in allele_names:
all_predictions[allele] = synthesize_affinities_for_single_allele(
similarities=pairwise_allele_similarities[allele],
peptide_to_affinities=peptide_to_affinities,
smoothing=smoothing,
exponent=exponent)
return all_predictions
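# Hypothetical end-to-end sketch (not in the commit): curried similarities plus
# a peptide reverse-lookup yield one synthetic affinity dictionary per allele.
example_sims = {"HLA-A0201": {"HLA-A0202": 0.9}, "HLA-A0202": {"HLA-A0201": 0.9}}
example_lookup = {"SIINFEKL": [("HLA-A0201", 0.8, 1.0), ("HLA-A0202", 0.6, 1.0)]}
synthetic = synthesize_affinities_for_all_alleles(
    peptide_to_affinities=example_lookup,
    pairwise_allele_similarities=example_sims,
    allele_pair_keys=False)
# HLA-A0201's synthetic value comes only from HLA-A0202's measurement, since an
# allele's own entry has no self-similarity in example_sims and is skipped.
print(synthetic["HLA-A0201"]["SIINFEKL"])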
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
import mhcflurry
from mhcflurry.data import (
load_allele_datasets,
encode_peptide_to_affinity_dict,
)
from arg_parsing import parse_int_list, parse_float_list
from dataset_paths import PETERS2009_CSV_PATH
from common import load_csv_binding_data_as_dict
from training_helpers import (
kfold_cross_validation_of_model_params_with_synthetic_data,
average_prediction_scores_list
)
parser = argparse.ArgumentParser()
parser.add_argument(
"--binding-data-csv",
default=PETERS2009_CSV_PATH)
parser.add_argument(
"--synthetic-data-csv",
required=True,
help="CSV with {mhc, sequence, ic50} columns of synthetic data")
parser.add_argument(
"--max-ic50",
default=50000.0,
type=float)
parser.add_argument(
"--embedding-dim-sizes",
default=[4, 8, 16, 32, 64],
type=parse_int_list)
parser.add_argument(
"--hidden-layer-sizes",
default=[4, 8, 16, 32, 64, 128, 256],
type=parse_int_list)
parser.add_argument(
"--dropouts",
default=[0.0, 0.25, 0.5],
type=parse_float_list)
parser.add_argument(
"--activation-functions",
default=["tanh", "relu"],
type=lambda s: [si.strip() for si in s.split(",")])
parser.add_argument(
"--training-epochs",
type=int,
default=250)
parser.add_argument("--log-file")
def get_extra_data(allele, train_datasets, expanded_predictions):
original_dataset = train_datasets[allele]
original_peptides = set(original_dataset.peptides)
expanded_allele_affinities = expanded_predictions[allele]
extra_affinities = {
k: v
for (k, v) in expanded_allele_affinities.items()
if k not in original_peptides
}
extra_peptides = list(extra_affinities.keys())
extra_values = list(extra_affinities.values())
extra_X = mhcflurry.data_helpers.index_encoding(extra_peptides, peptide_length=9)
extra_Y = np.array(extra_values)
return extra_X, extra_Y
def rescale_ic50(ic50, max_ic50):
log_ic50 = np.log(ic50) / np.log(max_ic50)
return max(0.0, min(1.0, 1.0 - log_ic50))
def load_synthetic_data(csv_path, max_ic50):
synthetic_allele_to_peptide_to_ic50_dict = load_csv_binding_data_as_dict(
csv_path)
return {
allele: {
peptide: rescale_ic50(ic50, max_ic50=max_ic50)
for (peptide, ic50)
in allele_dict.items()
}
for (allele, allele_dict)
in synthetic_allele_to_peptide_to_ic50_dict.items()
}
if __name__ == "__main__":
args = parser.parse_args()
print(args)
print("Loading binding data from %s" % args.binding_data_csv)
allele_datasets = load_allele_datasets(
args.binding_data_csv,
max_ic50=args.max_ic50,
only_human=False)
print("Loading synthetic data from %s" % args.synthetic_data_csv)
synthetic_affinities = load_synthetic_data(
csv_path=args.synthetic_data_csv,
max_ic50=args.max_ic50)
combined_allele_set = set(allele_datasets.keys()).union(
synthetic_affinities.keys())
combined_allele_list = list(sorted(combined_allele_set))
if args.log_file:
logfile = open(args.log_file, "w")
logfile.write("allele,n_samples,n_unique,n_synth,")
else:
logfile = None
logfile_needs_header = True
for allele in combined_allele_list:
synthetic_allele_dict = synthetic_affinities[allele]
(_, _, Counts_synth, X_synth, _, Y_synth) = \
encode_peptide_to_affinity_dict(synthetic_allele_dict)
synthetic_sample_weights = 1.0 / Counts_synth
scores = {}
source_peptides = allele_datasets[allele].original_peptides
X_original = allele_datasets[allele].X_index
Y_original = allele_datasets[allele].Y
n_samples = len(X_original)
n_unique_samples = len(set(source_peptides))
n_synth_samples = len(synthetic_sample_weights)
for dropout in args.dropouts:
for embedding_dim_size in args.embedding_dim_sizes:
for hidden_layer_size in args.hidden_layer_sizes:
for activation in args.activation_functions:
params = (
("dropout_probability", dropout),
("embedding_dim_size", embedding_dim_size),
("hidden_layer_size", hidden_layer_size),
("activation", activation),
)
print(
"Evaluating allele %s (n=%d, unique=%d): %s" % (
allele,
n_samples,
n_unique_samples,
params))
fold_scores = \
kfold_cross_validation_of_model_params_with_synthetic_data(
X_original=X_original,
Y_original=Y_original,
source_peptides_original=source_peptides,
X_synth=X_synth,
Y_synth=Y_synth,
original_sample_weights=allele_datasets[allele].weights,
synthetic_sample_weights=synthetic_sample_weights,
n_training_epochs=args.training_epochs,
max_ic50=args.max_ic50,
**dict(params))
average_scores = average_prediction_scores_list(fold_scores)
if logfile:
if logfile_needs_header:
for param_name, _ in params:
logfile.write("%s," % param_name)
for score_name in average_scores._fields:
logfile.write("%s," % score_name)
logfile.write("param_id\n")
logfile_needs_header = False
logfile.write("%s,%d,%d,%d," % (
allele,
n_samples,
n_unique_samples,
n_synth_samples))
for _, param_value in params:
logfile.write("%s," % param_value)
for score_name in average_scores._fields:
score_value = average_scores.__dict__[score_name]
logfile.write("%0.4f," % score_value)
logfile.write("%d\n" % len(scores))
logfile.flush()
scores[params] = average_scores
print("%s => %s" % (params, average_scores))
if logfile:
logfile.close()
@@ -12,13 +12,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
from collections import OrderedDict, namedtuple
import logging
from itertools import groupby
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.cross_validation import LabelKFold
from scipy import stats
import mhcflurry
from mhcflurry.common import normalize_allele_name
from mhcflurry.data_helpers import indices_to_hotshot_encoding
from mhcflurry.data import indices_to_hotshot_encoding
PredictionScores = namedtuple("PredictionScores", "tau auc f1 accuracy mae")
def encode_allele_dataset(
@@ -81,27 +89,54 @@ def encode_allele_datasets(
return (X_dict, Y_log_ic50_dict, ic50_dict)
def f1_score(true_label, label_pred):
tp = (true_label & label_pred).sum()
fp = ((~true_label) & label_pred).sum()
fn = (true_label & (~label_pred)).sum()
recall = (tp / float(tp + fn)) if (tp + fn) > 0 else 0.0
precision = (tp / float(tp + fp)) if (tp + fp) > 0 else 0.0
if (precision + recall) > 0:
return (2 * precision * recall) / (precision + recall)
else:
return 0.0
def score_predictions(predicted_log_ic50, true_label, max_ic50):
"""Computes accuracy, AUC, and F1 score of predictions"""
auc = roc_auc_score(true_label, predicted_log_ic50)
ic50_pred = max_ic50 ** (1.0 - predicted_log_ic50)
label_pred = (ic50_pred <= 500)
same_mask = true_label == label_pred
def score_predictions(
predicted_log_ic50,
true_log_ic50,
max_ic50):
"""Computes Kendall-tau, AUC, F1 score, and accuracy of predictions
Parameters
----------
predicted_log_ic50 : np.array
true_log_ic50 : np.array
max_ic50 : float
Returns PredictionScores object with fields (tau, auc, f1, accuracy, mae)
"""
mae = np.abs(predicted_log_ic50 - true_log_ic50).mean()
tau, _ = stats.kendalltau(predicted_log_ic50, true_log_ic50)
if np.isnan(tau):
logging.warn("Kendall tau was NaN!")
tau = 0.0
true_ic50s = max_ic50 ** (1.0 - np.array(true_log_ic50))
predicted_ic50s = max_ic50 ** (1.0 - np.array(predicted_log_ic50))
true_binding_label = true_ic50s <= 500
if true_binding_label.all() or not true_binding_label.any():
logging.warn(
("Can't compute AUC, F1, accuracy without both"
" negative and positive ground truth labels"))
return PredictionScores(
tau=tau,
auc=0.5,
f1=0.0,
accuracy=0.0,
mae=mae)
auc = metrics.roc_auc_score(true_binding_label, predicted_log_ic50)
predicted_binding_label = predicted_ic50s <= 500
f1_score = metrics.f1_score(true_binding_label, predicted_binding_label)
same_mask = true_binding_label == predicted_binding_label
accuracy = np.mean(same_mask)
f1 = f1_score(true_label, label_pred)
return accuracy, auc, f1
return PredictionScores(
tau=tau,
auc=auc,
f1=f1_score,
accuracy=accuracy,
mae=mae)
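A minimal usage sketch (not part of the original module; array values are arbitrary), assuming score_predictions is in scope:

import numpy as np

predicted = np.array([0.9, 0.2, 0.7, 0.1])   # hypothetical log-scaled predictions
true = np.array([0.95, 0.1, 0.8, 0.3])       # hypothetical log-scaled ground truth
scores = score_predictions(
    predicted_log_ic50=predicted,
    true_log_ic50=true,
    max_ic50=50000.0)
print(scores.tau, scores.auc, scores.f1, scores.accuracy, scores.mae)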
def train_model_and_return_scores(
......@@ -109,7 +144,7 @@ def train_model_and_return_scores(
X_train,
log_ic50_train,
X_test,
binder_label_test,
log_ic50_test,
n_training_epochs,
minibatch_size,
max_ic50):
......@@ -120,8 +155,339 @@ def train_model_and_return_scores(
verbose=0,
batch_size=minibatch_size)
pred = model.predict(X_test).flatten()
accuracy, auc, f1_score = score_predictions(
return score_predictions(
predicted_log_ic50=pred,
true_label=binder_label_test,
true_log_ic50=log_ic50_test,
max_ic50=max_ic50)
return (accuracy, auc, f1_score)
def train_model_with_synthetic_data(
model,
n_training_epochs,
X_original,
Y_original,
X_synth,
Y_synth,
original_sample_weights=None,
synthetic_sample_weights=None):
n_actual_samples, n_actual_dims = X_original.shape
n_synth_samples, n_synth_dims = X_synth.shape
if original_sample_weights is None:
original_sample_weights = np.ones_like(Y_original)
if synthetic_sample_weights is None:
synthetic_sample_weights = np.ones_like(Y_synth)
total_synth_weights = synthetic_sample_weights.sum()
total_original_weights = original_sample_weights.sum()
print("Mean Y=%f, Y_synth=%f, weight=%f, weight_synth=%f" % (
np.mean(Y_original),
np.mean(Y_synth),
np.mean(original_sample_weights),
np.mean(synthetic_sample_weights)))
combined_weights = np.concatenate([
original_sample_weights,
synthetic_sample_weights
])
assert n_actual_dims == n_synth_dims, \
"Mismatch between # of actual dims %d and synthetic dims %d" % (
n_actual_dims, n_synth_dims)
X_combined = np.vstack([X_original, X_synth])
n_combined_samples = n_actual_samples + n_synth_samples
assert X_combined.shape[0] == n_combined_samples, \
"Expected %d samples but got data array with shape %s" % (
n_actual_samples + n_synth_samples, X_combined.shape)
assert len(combined_weights) == n_combined_samples, \
"Expected combined_weights to have length %d but got shape = %s" % (
n_combined_samples,
combined_weights.shape)
Y_combined = np.concatenate([Y_original, Y_synth])
assert Y_combined.min() >= 0, \
"Y should not contain negative numbers! Y.min() = %f" % (
Y_combined.min(),)
assert Y_combined.max() <= 1, \
"Y should have max value 1.0, got Y.max() = %f" % (
Y_combined.max(),)
for epoch in range(n_training_epochs):
# weights for synthetic points can be shrunk as:
# ~ 1 / (1+epoch)**2
# or
# e ** -epoch
decay_factor = np.exp(-epoch)
# if the contribution of synthetic samples is less than a
# thousandth of the actual data, then stop using it
synth_contribution = total_synth_weights * decay_factor
synth_fraction_contribution = synth_contribution / (
synth_contribution + total_original_weights)
# only use synthetic data if it contributes at least 1/1000th of
# sample weight
use_synth_data = synth_fraction_contribution > 0.001
if use_synth_data:
combined_weights[n_actual_samples:] = (
synthetic_sample_weights * decay_factor)
model.fit(
X_combined,
Y_combined,
sample_weight=combined_weights,
nb_epoch=1,
verbose=0)
else:
model.fit(
X_original,
Y_original,
sample_weight=original_sample_weights,
nb_epoch=1,
verbose=0)
Y_pred = model.predict(X_original)
training_mse = ((Y_original - Y_pred) ** 2).mean()
real_data_percent = (
((1.0 - synth_fraction_contribution) * 100)
if use_synth_data
else 100
)
synth_data_percent = (
(synth_fraction_contribution * 100)
if use_synth_data else 0
)
print(
("-- Epoch %d/%d real data weight %0.2f%%,"
" synth data weight %0.2f%%,"
" Training MSE %0.4f") % (
epoch + 1,
n_training_epochs,
real_data_percent,
synth_data_percent,
training_mse))
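To make the exponential decay schedule above concrete, here is a small standalone illustration; the weight totals are assumed example values, not taken from any dataset:

import numpy as np

total_synth_weights = 10000.0     # assumed synthetic weight total
total_original_weights = 1000.0   # assumed real-data weight total
for epoch in range(12):
    decay_factor = np.exp(-epoch)
    synth_contribution = total_synth_weights * decay_factor
    fraction = synth_contribution / (synth_contribution + total_original_weights)
    # fraction falls from ~0.91 at epoch 0 to ~0.009 at epoch 7 and drops below
    # the 0.001 cutoff around epoch 10, after which only real data is used
    print(epoch, round(fraction, 5))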
def peptide_group_indices(original_peptide_sequences):
"""
Given a list of peptide sequences, some of which may be identical,
generate an index list for each run of consecutive identical entries
(inputs are expected to be grouped by source peptide).
"""
indices_and_sequences = enumerate(original_peptide_sequences)
for _, subiter in groupby(indices_and_sequences, lambda pair: pair[1]):
yield [idx for (idx, _) in subiter]
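A small illustration of the grouping (itertools.groupby only merges consecutive duplicates, which is why inputs should be ordered by source peptide; the sequences below are arbitrary):

sequences = ["SIINFEKL", "SIINFEKL", "GILGFVFTL", "SIINFEKL"]
print(list(peptide_group_indices(sequences)))
# [[0, 1], [2], [3]] -- the trailing duplicate is not merged with the first run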
def collapse_peptide_group_affinities(
predictions,
true_values,
original_peptide_sequences,
combine_multiple_predictions_fn=np.median):
"""
Given predictions and true log-transformed IC50 values for 9mers which may
have been either elongated or shortened from peptide sequences of other
lengths, collapse the log-transformed IC50 values to a single value
per original peptide sequence.
"""
assert len(predictions) == len(true_values), \
("Expected predictions (%d elements) to have same length"
" as true values (%d elements)") % (len(predictions), len(true_values))
assert len(predictions) == len(original_peptide_sequences), \
("Expected predictions (%d elements) to have same length"
" as peptide sequences (%d elements)") % (
len(predictions), len(original_peptide_sequences))
collapsed_predictions = []
collapsed_true_values = []
for peptide_indices in peptide_group_indices(original_peptide_sequences):
if len(peptide_indices) == 1:
idx = peptide_indices[0]
collapsed_predictions.append(predictions[idx])
collapsed_true_values.append(true_values[idx])
else:
collapsed_predictions.append(
combine_multiple_predictions_fn(predictions[peptide_indices]))
collapsed_true_values.append(
combine_multiple_predictions_fn(
true_values[peptide_indices]))
return collapsed_predictions, collapsed_true_values
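An illustrative call (arrays and peptides are invented; the first two rows stand for 9mers expanded from the same longer peptide):

import numpy as np

predictions = np.array([0.8, 0.6, 0.2])
true_values = np.array([0.7, 0.7, 0.1])
peptides = ["AAAWYLKTEV", "AAAWYLKTEV", "SIINFEKL"]
collapsed_pred, collapsed_true = collapse_peptide_group_affinities(
    predictions, true_values, peptides)
# collapsed_pred -> [0.7, 0.2]  (median of the first group, then the singleton)
# collapsed_true -> [0.7, 0.1]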
def kfold_cross_validation_of_model_fn_with_synthetic_data(
make_model_fn,
n_training_epochs,
max_ic50,
X_train,
Y_train,
source_peptides_train,
X_synth,
Y_synth,
original_sample_weights,
synthetic_sample_weights,
n_cross_validation_folds,
combine_multiple_predictions_fn=np.median,
random_seed=0):
"""
Given a function which generates fresh copies of a model, use k-fold
cross validation (stratified by the list of source peptide sequences) to
evaluate the predictive performance of this model type.
Returns a list of (trained model, PredictionScores) pairs, where each
PredictionScores object has the fields:
("tau", "auc", "f1", "accuracy", "mae")
"""
assert len(X_train) == len(Y_train)
assert len(source_peptides_train) == len(X_train)
# randomly shuffle the training data first
indices = np.arange(len(X_train))
np.random.seed(random_seed)
np.random.shuffle(indices)
X_train, Y_train, weights_train = \
X_train[indices], Y_train[indices], original_sample_weights[indices]
source_peptides_train = [source_peptides_train[i] for i in indices]
# we need to do cross-validation carefully since some of our training
# samples may be extracted from the same longer peptide.
# To avoid training vs. test contamination we do k-fold cross validation
# stratified by the "source" peptides of each sample.
cv_iterator = enumerate(
LabelKFold(
labels=source_peptides_train,
n_folds=n_cross_validation_folds))
results = []
for fold_number, (fold_train_index, fold_test_index) in cv_iterator:
model = make_model_fn()
X_train_fold, X_test_fold = \
X_train[fold_train_index], X_train[fold_test_index]
weights_train_fold = weights_train[fold_train_index]
Y_train_fold, Y_test_fold = \
Y_train[fold_train_index], Y_train[fold_test_index]
peptides_train_fold = [
source_peptides_train[i]
for i in fold_train_index
]
peptides_test_fold = [
source_peptides_train[i]
for i in fold_test_index
]
train_model_with_synthetic_data(
model=model,
n_training_epochs=n_training_epochs,
X_original=X_train_fold,
Y_original=Y_train_fold,
X_synth=X_synth,
Y_synth=Y_synth,
original_sample_weights=weights_train_fold,
synthetic_sample_weights=synthetic_sample_weights)
Y_pred = model.predict(X_test_fold).flatten()
# since some 'epitopes' are actually substrings of longer peptides
# we need to coalesce those predictions by calling the function
# `combine_multiple_predictions_fn` on the set of predictions
# associated with one longer peptide
collapsed_predictions, collapsed_true_values = \
collapse_peptide_group_affinities(
predictions=Y_pred,
true_values=Y_test_fold,
original_peptide_sequences=peptides_test_fold,
combine_multiple_predictions_fn=combine_multiple_predictions_fn)
scores = score_predictions(
predicted_log_ic50=collapsed_predictions,
true_log_ic50=collapsed_true_values,
max_ic50=max_ic50)
print("::: CV fold %d/%d (n_samples=%d, n_unique=%d): %s\n\n" % (
fold_number + 1,
n_cross_validation_folds,
len(peptides_train_fold),
len(set(peptides_train_fold)),
scores))
results.append((model, scores))
return results
def average_prediction_scores_list(model_scores_list):
"""
Given a list of (model, PredictionScores) pairs,
returns a single PredictionScores object whose fields are
the average across the list.
"""
n = len(model_scores_list)
return PredictionScores(
tau=sum(x.tau for (_, x) in model_scores_list) / n,
auc=sum(x.auc for (_, x) in model_scores_list) / n,
f1=sum(x.f1 for (_, x) in model_scores_list) / n,
accuracy=sum(x.accuracy for (_, x) in model_scores_list) / n,
mae=sum(x.mae for (_, x) in model_scores_list) / n)
def kfold_cross_validation_of_model_params_with_synthetic_data(
X_original,
Y_original,
source_peptides_original,
X_synth=None,
Y_synth=None,
original_sample_weights=None,
synthetic_sample_weights=None,
n_training_epochs=150,
n_cross_validation_folds=5,
embedding_dim_size=16,
hidden_layer_size=50,
dropout_probability=0.0,
activation="tanh",
max_ic50=50000.0):
"""
Returns a list of (model, PredictionScores) pairs, one per
cross-validation fold; per-fold scores can be averaged with
average_prediction_scores_list.
"""
n_unique_samples = len(set(source_peptides_original))
if n_cross_validation_folds > n_unique_samples:
n_cross_validation_folds = n_unique_samples
if original_sample_weights is None:
original_sample_weights = np.ones(len(X_original), dtype=float)
# if synthetic data is missing then make an empty array so all the
# downstream code still works
if X_synth is None:
X_synth = np.empty((0, X_original.shape[1]), dtype=float)
# if only the target values for synthetic data are missing then
# treat all synthetic entries as negative examples
if Y_synth is None:
Y_synth = np.zeros(len(X_synth), dtype=float)
if synthetic_sample_weights is None:
synthetic_sample_weights = np.ones(len(X_synth), dtype=float)
def make_model():
return mhcflurry.feedforward.make_embedding_network(
peptide_length=9,
embedding_input_dim=20,
embedding_output_dim=embedding_dim_size,
layer_sizes=[hidden_layer_size],
activation=activation,
init="lecun_uniform",
loss="mse",
output_activation="sigmoid",
dropout_probability=dropout_probability,
optimizer=None,
learning_rate=0.001)
model_scores_list = kfold_cross_validation_of_model_fn_with_synthetic_data(
make_model_fn=make_model,
n_training_epochs=n_training_epochs,
max_ic50=max_ic50,
X_train=X_original,
Y_train=Y_original,
source_peptides_train=source_peptides_original,
X_synth=X_synth,
Y_synth=Y_synth,
original_sample_weights=original_sample_weights,
synthetic_sample_weights=synthetic_sample_weights,
n_cross_validation_folds=n_cross_validation_folds)
return model_scores_list
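A hedged sketch of how this cross-validation entry point might be called; allele_data is a hypothetical AlleleData record as produced by load_allele_datasets for a single allele:

model_scores = kfold_cross_validation_of_model_params_with_synthetic_data(
    X_original=allele_data.X_index,
    Y_original=allele_data.Y,
    source_peptides_original=allele_data.original_peptides,
    original_sample_weights=allele_data.weights,
    n_training_epochs=100,
    n_cross_validation_folds=3,
    hidden_layer_size=64)
average_scores = average_prediction_scores_list(model_scores)
print(average_scores)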
......@@ -13,15 +13,19 @@
# limitations under the License.
from . import paths
from . import data_helpers
from . import data
from . import feedforward
from . import common
from .mhc1_binding_predictor import Mhc1BindingPredictor
from . import peptide_encoding
from . import amino_acid
from .class1_binding_predictor import Class1BindingPredictor
__all__ = [
"paths",
"data_helpers",
"data",
"feedforward",
"peptide_encoding",
"amino_acid",
"common",
"Mhc1BindingPredictor"
]
\ No newline at end of file
"Class1BindingPredictor"
]
......@@ -12,7 +12,81 @@
# See the License for the specific language governing permissions and
# limitations under the License.
amino_acids = {
import numpy as np
class Alphabet(object):
"""
Used to track the order of amino acids used for peptide encodings
"""
def __init__(self, **kwargs):
self.letters_to_names = {}
for (k, v) in kwargs.items():
self.add(k, v)
def add(self, letter, name):
assert letter not in self.letters_to_names
assert len(letter) == 1
self.letters_to_names[letter] = name
def letters(self):
return list(sorted(self.letters_to_names.keys()))
def names(self):
return [self.letters_to_names[k] for k in self.letters()]
def index_dict(self):
return {c: i for (i, c) in enumerate(self.letters())}
def copy(self):
return Alphabet(**self.letters_to_names)
def __getitem__(self, k):
return self.letters_to_names[k]
def __setitem__(self, k, v):
self.add(k, v)
def index_encoding_list(self, peptides):
index_dict = self.index_dict()
return [
[index_dict[amino_acid] for amino_acid in peptide]
for peptide in peptides
]
def index_encoding(self, peptides, peptide_length):
"""
Encode a set of equal length peptides as a matrix of their
amino acid indices.
"""
X = np.zeros((len(peptides), peptide_length), dtype=int)
index_dict = self.index_dict()
for i, peptide in enumerate(peptides):
for j, amino_acid in enumerate(peptide):
X[i, j] = index_dict[amino_acid]
return X
def hotshot_encoding(
self,
peptides,
peptide_length):
"""
Encode a set of equal length peptides as a binary matrix,
where each letter is transformed into a length 20 vector with a single
element that is 1 (and the others are 0).
"""
shape = (len(peptides), peptide_length, 20)
index_dict = self.index_dict()
X = np.zeros(shape, dtype=bool)
for i, peptide in enumerate(peptides):
for j, amino_acid in enumerate(peptide):
k = index_dict[amino_acid]
X[i, j, k] = 1
return X
common_amino_acids = Alphabet(**{
"A": "Alanine",
"R": "Arginine",
"N": "Asparagine",
......@@ -33,8 +107,9 @@ amino_acids = {
"W": "Tryptophan",
"Y": "Tyrosine",
"V": "Valine",
}
})
common_amino_acid_letters = common_amino_acids.letters()
amino_acid_letters = list(sorted(amino_acids.keys()))
amino_acid_names = [amino_acids[k] for k in amino_acid_letters]
amino_acid_letter_indices = {c: i for (i, c) in enumerate(amino_acid_letters)}
amino_acids_with_unknown = common_amino_acids.copy()
amino_acids_with_unknown.add("X", "Unknown")
amino_acids_with_unknown_letters = amino_acids_with_unknown.letters()
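A minimal sketch of the Alphabet encodings in use (the peptide strings are arbitrary 9mers):

peptides = ["SIINFEKLA", "AAAAAAAAA"]
X_index = common_amino_acids.index_encoding(peptides, peptide_length=9)
X_hotshot = common_amino_acids.hotshot_encoding(peptides, peptide_length=9)
print(X_index.shape)    # (2, 9), integer indices into the sorted letters
print(X_hotshot.shape)  # (2, 9, 20), one-hot along the last axis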
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Allele specific MHC Class I binding affinity predictor
"""
from __future__ import (
print_function,
division,
absolute_import,
)
import logging
from os import listdir, remove
from os.path import exists, join
import json
import numpy as np
from keras.models import model_from_config
from .common import normalize_allele_name
from .paths import CLASS1_MODEL_DIRECTORY
from .feedforward import make_embedding_network
from .predictor_base import PredictorBase
from .class1_allele_specific_hyperparameters import MAX_IC50
_allele_predictor_cache = {}
class Class1BindingPredictor(PredictorBase):
def __init__(
self,
model,
name=None,
max_ic50=MAX_IC50,
allow_unknown_amino_acids=False,
verbose=False):
PredictorBase.__init__(
self,
name=name,
max_ic50=max_ic50,
allow_unknown_amino_acids=allow_unknown_amino_acids,
verbose=verbose)
self.name = name
self.model = model
@classmethod
def from_disk(
cls,
model_json_path,
weights_hdf_path=None,
name=None,
max_ic50=MAX_IC50):
"""
Load model from stored JSON representation of network and
(optionally) load weights from HDF5 file.
"""
if not exists(model_json_path):
raise ValueError("Model file %s (name = %s) not found" % (
model_json_path, name,))
with open(model_json_path, "r") as f:
config_dict = json.load(f)
model = model_from_config(config_dict)
if weights_hdf_path:
if not exists(weights_hdf_path):
raise ValueError(
"Missing model weights file %s (name = %s)" % (
weights_hdf_path, name))
model.load_weights(weights_hdf_path)
return cls(
model=model,
max_ic50=max_ic50,
name=name)
@classmethod
def from_hyperparameters(
cls,
name=None,
max_ic50=MAX_IC50,
peptide_length=9,
embedding_input_dim=20,
embedding_output_dim=20,
layer_sizes=[50],
activation="tanh",
init="lecun_uniform",
loss="mse",
output_activation="sigmoid",
dropout_probability=0,
learning_rate=0.001,
**kwargs):
"""
Create untrained predictor with the given hyperparameters.
"""
model = make_embedding_network(
peptide_length=peptide_length,
embedding_input_dim=embedding_input_dim,
embedding_output_dim=embedding_output_dim,
layer_sizes=layer_sizes,
activation=activation,
init=init,
loss=loss,
output_activation=output_activation,
dropout_probability=dropout_probability,
learning_rate=learning_rate)
return cls(
name=name,
max_ic50=max_ic50,
model=model,
**kwargs)
def _combine_training_data(
self,
X,
Y,
sample_weights,
X_pretrain,
Y_pretrain,
pretrain_sample_weights,
verbose=False):
"""
Make sure the shapes of given training and pre-training data
conform with each other. Then concatenate the pre-training and the
training data.
Returns (X_combined, Y_combined, weights_combined, n_pretrain_samples)
"""
X = np.asarray(X)
Y = np.asarray(Y)
if verbose:
print("X.shape = %s" % (X.shape,))
n_samples, n_dims = X.shape
if len(Y) != n_samples:
raise ValueError("Mismatch between len(X) = %d and len(Y) = %d" % (
n_samples, len(Y)))
if Y.min() < 0:
raise ValueError("Minimum value of Y can't be negative, got %f" % (
Y.min()))
if Y.max() > 1:
raise ValueError("Maximum value of Y can't be greater than 1, got %f" % (
Y.max()))
if sample_weights is None:
sample_weights = np.ones_like(Y)
else:
sample_weights = np.asarray(sample_weights)
if len(sample_weights) != n_samples:
raise ValueError(
"Length of sample_weights (%d) doesn't match number of samples (%d)" % (
len(sample_weights),
n_samples))
if X_pretrain is None or Y_pretrain is None:
X_pretrain = np.empty((0, n_dims), dtype=X.dtype)
Y_pretrain = np.empty((0,), dtype=Y.dtype)
else:
X_pretrain = np.asarray(X_pretrain)
Y_pretrain = np.asarray(Y_pretrain)
if verbose:
print("X_pretrain.shape = %s" % (X_pretrain.shape,))
n_pretrain_samples, n_pretrain_dims = X_pretrain.shape
if n_pretrain_dims != n_dims:
raise ValueError(
"# of dims for pretraining data (%d) doesn't match X.shape[1] = %d" % (
n_pretrain_dims, n_dims))
if len(Y_pretrain) != n_pretrain_samples:
raise ValueError(
"length of Y_pretrain (%d) != length of X_pretrain (%d)" % (
len(Y_pretrain),
len(X_pretrain)))
if len(Y_pretrain) > 0 and Y_pretrain.min() < 0:
raise ValueError("Minimum value of Y_pretrain can't be negative, got %f" % (
Y_pretrain.min()))
if len(Y_pretrain) > 0 and Y_pretrain.max() > 1:
raise ValueError("Maximum value of Y_pretrain can't be greater than 1, got %f" % (
Y_pretrain.max()))
if pretrain_sample_weights is None:
pretrain_sample_weights = np.ones_like(Y_pretrain)
else:
pretrain_sample_weights = np.asarray(pretrain_sample_weights)
if verbose:
print("sample weights mean = %f, pretrain weights mean = %f" % (
sample_weights.mean(),
pretrain_sample_weights.mean()))
X_combined = np.vstack([X_pretrain, X])
Y_combined = np.concatenate([Y_pretrain, Y])
combined_weights = np.concatenate([
pretrain_sample_weights,
sample_weights,
])
return X_combined, Y_combined, combined_weights, n_pretrain_samples
def fit(
self,
X,
Y,
sample_weights=None,
X_pretrain=None,
Y_pretrain=None,
pretrain_sample_weights=None,
n_training_epochs=200,
verbose=False,
batch_size=128):
"""
Train predictive model from index encoding of fixed length 9mer peptides.
Parameters
----------
X : array
Training data with shape (n_samples, n_dims)
Y : array
Training labels with shape (n_samples,)
sample_weights : array
Weight of each training sample with shape (n_samples,)
X_pretrain : array
Extra samples used for soft pretraining of the predictor,
should have same number of dimensions as X. During training the weights
of these samples will decay after each epoch.
Y_pretrain : array
Labels for extra samples, with shape (n_pretrain_samples,)
pretrain_sample_weights : array
Initial weights for the rows of X_pretrain. If not specified then
initialized to ones.
n_training_epochs : int
verbose : bool
batch_size : int
"""
X_combined, Y_combined, combined_weights, n_pretrain = \
self._combine_training_data(
X, Y, sample_weights, X_pretrain, Y_pretrain, pretrain_sample_weights,
verbose=verbose)
total_pretrain_sample_weight = combined_weights[:n_pretrain].sum()
total_train_sample_weight = combined_weights[n_pretrain:].sum()
total_combined_sample_weight = (
total_pretrain_sample_weight + total_train_sample_weight)
if self.verbose:
print("-- Total pretrain weight = %f (%f%%), sample weight = %f (%f%%)" % (
total_pretrain_sample_weight,
100 * total_pretrain_sample_weight / total_combined_sample_weight,
total_train_sample_weight,
100 * total_train_sample_weight / total_combined_sample_weight))
for epoch in range(n_training_epochs):
# weights for synthetic points can be shrunk as:
# ~ 1 / (1+epoch)**2
# or
# e ** -epoch
decay_factor = np.exp(-epoch)
# if the contribution of synthetic samples is less than a
# thousandth of the actual data, then stop using it
pretrain_contribution = total_pretrain_sample_weight * decay_factor
pretrain_fraction_contribution = (
pretrain_contribution / total_combined_sample_weight)
use_pretrain_data = pretrain_fraction_contribution > 0.001
# only use synthetic data if it contributes at least 1/1000th of
# sample weight
if verbose:
real_data_percent = (
((1.0 - pretrain_fraction_contribution) * 100)
if use_pretrain_data
else 100
)
pretrain_data_percent = (
(pretrain_fraction_contribution * 100)
if use_pretrain_data else 0
)
print(
("-- Epoch %d/%d decay=%f, real data weight=%0.2f%%,"
" synth data weight=%0.2f%% (use_pretrain=%s)") % (
epoch + 1,
n_training_epochs,
decay_factor,
real_data_percent,
pretrain_data_percent,
use_pretrain_data))
if use_pretrain_data:
combined_weights[:n_pretrain] *= decay_factor
self.model.fit(
X_combined,
Y_combined,
sample_weight=combined_weights,
nb_epoch=1,
verbose=0,
batch_size=batch_size)
else:
self.model.fit(
X_combined[n_pretrain:],
Y_combined[n_pretrain:],
sample_weight=combined_weights[n_pretrain:],
nb_epoch=1,
verbose=0,
batch_size=batch_size)
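A hedged usage sketch of soft pretraining with this class; X, Y, X_pretrain, and Y_pretrain are assumed to be index-encoded 9mers with log-scaled affinities in [0, 1], produced elsewhere in the library (the allele name and hyperparameters are arbitrary):

predictor = Class1BindingPredictor.from_hyperparameters(
    name="HLA-A0201",
    embedding_output_dim=16,
    layer_sizes=[64])
predictor.fit(
    X, Y,
    X_pretrain=X_pretrain,   # e.g. imputed/synthetic samples
    Y_pretrain=Y_pretrain,
    n_training_epochs=100,
    verbose=True)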
def to_disk(self, model_json_path, weights_hdf_path, overwrite=False):
if exists(model_json_path) and overwrite:
logging.info(
"Removing existing model JSON file '%s'" % (
model_json_path,))
remove(model_json_path)
if exists(model_json_path):
logging.warn(
"Model JSON file '%s' already exists" % (model_json_path,))
else:
logging.info(
"Saving model file %s (name=%s)" % (model_json_path, self.name))
with open(model_json_path, "w") as f:
f.write(self.model.to_json())
if exists(weights_hdf_path) and overwrite:
logging.info(
"Removing existing model weights HDF5 file '%s'" % (
weights_hdf_path,))
remove(weights_hdf_path)
if exists(weights_hdf_path):
logging.warn(
"Model weights HDF5 file '%s' already exists" % (
weights_hdf_path,))
else:
logging.info(
"Saving model weights HDF5 file %s (name=%s)" % (
weights_hdf_path, self.name))
self.model.save_weights(weights_hdf_path)
@classmethod
def from_allele_name(
cls,
allele_name,
model_directory=CLASS1_MODEL_DIRECTORY,
max_ic50=MAX_IC50):
if not exists(model_directory) or len(listdir(model_directory)) == 0:
raise ValueError(
"No MHC prediction models found in %s" % (model_directory,))
allele_name = normalize_allele_name(allele_name)
key = (allele_name, model_directory, max_ic50)
if key in _allele_predictor_cache:
return _allele_predictor_cache[key]
if not exists(model_directory) or len(listdir(model_directory)) == 0:
raise ValueError(
"No MHC prediction models found in %s" % (model_directory,))
model_json_filename = allele_name + ".json"
model_json_path = join(model_directory, model_json_filename)
weights_hdf_filename = allele_name + ".hdf"
weights_hdf_path = join(model_directory, weights_hdf_filename)
predictor = cls.from_disk(
model_json_path=model_json_path,
weights_hdf_path=weights_hdf_path,
name=allele_name,
max_ic50=max_ic50)
_allele_predictor_cache[key] = predictor
return predictor
@classmethod
def supported_alleles(cls, model_directory=CLASS1_MODEL_DIRECTORY):
alleles_with_weights = set([])
alleles_with_models = set([])
for filename in listdir(model_directory):
if filename.endswith(".hdf"):
alleles_with_weights.add(filename.replace(".hdf", ""))
elif filename.endswith(".json"):
alleles_with_models.add(filename.replace(".json", ""))
alleles = alleles_with_models.intersection(alleles_with_weights)
return list(sorted(alleles))
def __repr__(self):
return "Class1BindingPredictor(name=%s, model=%s, max_ic50=%f)" % (
self.name,
self.model,
self.max_ic50)
def __str__(self):
return repr(self)
def predict_encoded(self, X):
max_expected_index = 20 if self.allow_unknown_amino_acids else 19
assert X.max() <= max_expected_index, \
"Got index %d in peptide encoding" % (X.max(),)
return self.model.predict(X, verbose=False).flatten()
......@@ -17,7 +17,6 @@ from __future__ import (
division,
absolute_import,
)
from .amino_acid import amino_acid_letters
def parse_int_list(s):
......@@ -27,35 +26,37 @@ def parse_int_list(s):
def split_uppercase_sequences(s):
return [part.strip().upper() for part in s.split(",")]
MHC_PREFIXES = [
"HLA",
"H-2",
"Mamu",
"Patr",
"Gogo",
"ELA",
]
def normalize_allele_name(allele_name):
def normalize_allele_name(allele_name, default_prefix="HLA"):
"""
Only works for mouse, human, and rhesus monkey alleles.
Only works for a small number of species.
TODO: use the same logic as mhctools for MHC name parsing.
Possibly even worth its own small repo called something like "mhcnames"
"""
allele_name = allele_name.upper()
if allele_name.startswith("MAMU"):
prefix = "Mamu-"
elif allele_name.startswith("H-2") or allele_name.startswith("H2"):
prefix = "H-2-"
else:
prefix = ""
# old school HLA-C serotypes look like "Cw"
allele_name = allele_name.replace("CW", "C")
patterns = [
"HLA-",
"H-2",
"H2",
"MAMU",
"-",
"*",
":"
]
for pattern in patterns:
prefix = default_prefix
for candidate in MHC_PREFIXES:
if (allele_name.startswith(candidate.upper()) or
allele_name.startswith(candidate.replace("-", "").upper())):
prefix = candidate
allele_name = allele_name[len(prefix):]
break
for pattern in MHC_PREFIXES + ["-", "*", ":"]:
allele_name = allele_name.replace(pattern, "")
return "%s%s" % (prefix, allele_name)
return "%s%s" % (prefix + "-" if prefix else "", allele_name)
def split_allele_names(s):
......@@ -64,37 +65,3 @@ def split_allele_names(s):
for part
in s.split(",")
]
def expand_9mer_peptides(peptides, length):
"""
Expand non-9mer peptides using methods from
Accurate approximation method for prediction of class I MHC
affinities for peptides of length 8, 10 and 11 using prediction
tools trained on 9mers.
by Lundegaard et. al.
http://bioinformatics.oxfordjournals.org/content/24/11/1397
"""
assert len(peptides) > 0
if length < 8:
raise ValueError("Invalid peptide length: %d (%s)" % (
length, peptides[0]))
elif length == 9:
return peptides
elif length == 8:
# extend each peptide by inserting every possible amino acid
# between base-1 positions 4-8
return [
peptide[:i] + extra_amino_acid + peptide[i:]
for peptide in peptides
for i in range(3, 8)
for extra_amino_acid in amino_acid_letters
]
else:
# drop interior residues between base-1 positions 4 to last
n_skip = length - 9
return [
peptide[:i] + peptide[i + n_skip:]
for peptide in peptides
for i in range(3, 9)
]
......@@ -17,59 +17,34 @@ from __future__ import (
division,
absolute_import,
)
from collections import namedtuple
from collections import namedtuple, defaultdict
import pandas as pd
import numpy as np
from .common import normalize_allele_name
from .amino_acid import amino_acid_letter_indices
AlleleData = namedtuple("AlleleData", "X Y peptides ic50")
def hotshot_encoding(peptides, peptide_length):
"""
Encode a set of equal length peptides as a binary matrix,
where each letter is transformed into a length 20 vector with a single
element that is 1 (and the others are 0).
"""
shape = (len(peptides), peptide_length, 20)
X = np.zeros(shape, dtype=bool)
for i, peptide in enumerate(peptides):
for j, amino_acid in enumerate(peptide):
k = amino_acid_letter_indices[amino_acid]
X[i, j, k] = 1
return X
def index_encoding(peptides, peptide_length):
"""
Encode a set of equal length peptides as a vector of their
amino acid indices.
"""
X = np.zeros((len(peptides), peptide_length), dtype=int)
for i, peptide in enumerate(peptides):
for j, amino_acid in enumerate(peptide):
X[i, j] = amino_acid_letter_indices[amino_acid]
return X
from .amino_acid import common_amino_acids
from .peptide_encoding import (
indices_to_hotshot_encoding,
fixed_length_from_many_peptides
)
def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0):
"""
Given an (n_samples, peptide_length) integer matrix
convert it to a binary encoding of shape:
(n_samples, peptide_length * n_indices)
"""
(n_samples, peptide_length) = X.shape
if not n_indices:
n_indices = X.max() - first_index_value + 1
index_encoding = common_amino_acids.index_encoding
X_binary = np.zeros((n_samples, peptide_length * n_indices), dtype=bool)
for i, row in enumerate(X):
for j, xij in enumerate(row):
X_binary[i, n_indices * j + xij - first_index_value] = 1
return X_binary.astype(float)
AlleleData = namedtuple(
"AlleleData",
[
"X_index", # index-based featue encoding of fixed length peptides
"X_binary", # binary encoding of fixed length peptides
"Y", # regression encoding of IC50 (log scaled between 0..1)
"peptides", # list of fixed length peptide string
"ic50", # IC50 value associated with each entry
"original_peptides", # original peptides may be of different lengths
"original_lengths", # len(original_peptide)
"substring_counts", # how many substrings were extracted from
# each original peptide string
"weights", # 1.0 / count
])
def _infer_csv_separator(filename):
......@@ -168,14 +143,115 @@ def load_dataframe(
regression_output = np.maximum(regression_output, 0.0)
regression_output = np.minimum(regression_output, 1.0)
df["regression_output"] = regression_output
return df
return df, peptide_column_name
def load_allele_dicts(
filename,
max_ic50=50000.0,
regression_output=False,
sep=None,
species_column_name="species",
allele_column_name="mhc",
peptide_column_name=None,
peptide_length_column_name="peptide_length",
ic50_column_name="meas",
only_human=True,
min_allele_size=1):
"""
Parse a CSV of binding data into a dictionary of dictionaries.
The outer key is an allele name, the inner key is a peptide sequence,
and the inner value is either a raw IC50 or a log-transformed value in [0, 1].
"""
binding_df, peptide_column_name = load_dataframe(
filename=filename,
max_ic50=max_ic50,
sep=sep,
species_column_name=species_column_name,
allele_column_name=allele_column_name,
peptide_column_name=peptide_column_name,
peptide_length_column_name=peptide_length_column_name,
ic50_column_name=ic50_column_name,
only_human=only_human)
# map peptides to either the raw IC50 or rescaled log IC50 depending
# on the `regression_output` parameter
output_column_name = (
"regression_output"
if regression_output
else ic50_column_name
)
return {
allele_name: {
peptide: output
for (peptide, output)
in zip(group[peptide_column_name], group[output_column_name])
}
for (allele_name, group)
in binding_df.groupby(allele_column_name)
if len(group) >= min_allele_size
}
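A hedged sketch of calling load_allele_dicts; the CSV path and allele name are hypothetical, and the column layout is assumed to match the defaults above:

allele_dicts = load_allele_dicts(
    "bindingdata.csv",        # hypothetical input file
    max_ic50=50000.0,
    regression_output=True,   # values rescaled to [0, 1] instead of raw IC50
    min_allele_size=25)
affinities_for_a0201 = allele_dicts.get("HLA-A0201", {})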
def encode_peptide_to_affinity_dict(
peptide_to_affinity_dict,
peptide_length=9,
flatten_binary_encoding=True):
"""
Given a dictionary mapping from peptide sequences to affinity values,
returns tuple with the following fields:
- kmer_peptides: fixed length peptide strings
- original_peptides: variable length peptide strings
- counts: how many fixed length peptides were made from this original
- X_index: index encoding of fixed length peptides
- X_binary: binary encoding of fixed length peptides
- Y: affinity values associated with original peptides
"""
raw_peptides = list(sorted(peptide_to_affinity_dict.keys()))
kmer_peptides, original_peptides, counts = \
fixed_length_from_many_peptides(
peptides=raw_peptides,
desired_length=peptide_length,
start_offset_shorten=0,
end_offset_shorten=0,
start_offset_extend=0,
end_offset_extend=0)
n_samples = len(kmer_peptides)
assert n_samples == len(original_peptides), \
"Mismatch between # of samples (%d) and # of peptides (%d)" % (
n_samples, len(original_peptides))
assert n_samples == len(counts), \
"Mismatch between # of samples (%d) and # of counts (%d)" % (
n_samples, len(counts))
X_index = index_encoding(kmer_peptides, peptide_length)
X_binary = indices_to_hotshot_encoding(X_index, n_indices=20)
assert X_binary.shape[0] == X_index.shape[0], \
("Mismatch between number of samples for index encoding (%d)"
" vs. binary encoding (%d)") % (
X_binary.shape[0],
X_index.shape[0])
if flatten_binary_encoding:
# collapse 3D input into 2D matrix
n_binary_features = peptide_length * 20
X_binary = X_binary.reshape((n_samples, n_binary_features))
# easier to work with counts when they're an array instead of list
counts = np.array(counts)
Y = np.array([peptide_to_affinity_dict[p] for p in original_peptides])
assert n_samples == len(Y), \
"Mismatch between # peptides %d and # regression outputs %d" % (
n_samples, len(Y))
return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y)
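An illustrative call on a tiny peptide-to-affinity dictionary (affinity values are arbitrary); longer peptides are cut down to several 9mer substrings, each inheriting the original peptide's affinity:

kmers, originals, counts, X_index, X_binary, Y = \
    encode_peptide_to_affinity_dict(
        {"SIINFEKLA": 0.9, "AAAWYLKTEVA": 0.2},
        peptide_length=9)
# each 9mer derived from the 11mer shares its affinity (0.2) in Y and a
# substring count > 1, which is later used to down-weight those samples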
def load_allele_datasets(
filename,
peptide_length=9,
max_ic50=5000.0,
binary_encoding=True,
use_multiple_peptide_lengths=True,
max_ic50=50000.0,
flatten_binary_encoding=True,
sep=None,
species_column_name="species",
......@@ -202,14 +278,18 @@ def load_allele_datasets(
peptide_length : int
Which length peptides to use (default=9)
use_multiple_peptide_lengths : bool
If a peptide is shorter than `peptide_length`, expand it into many
peptides of the appropriate length by inserting all combinations of
amino acids. Similarly, if a peptide is longer than `peptide_length`,
shorten it by deleting stretches of contiguous amino acids at all
peptide positions.
max_ic50 : float
Treat IC50 scores above this value as all equally bad
(transform them to 0.0 in the rescaled output)
binary_encoding : bool
Encode amino acids of each peptide as indices or binary vectors
flatten_features : bool
flatten_binary_encoding : bool
If False, returns a (n_samples, peptide_length, 20) matrix, otherwise
returns the 2D flattened version of the same data.
......@@ -222,7 +302,7 @@ def load_allele_datasets(
only_human : bool
Only load entries from human MHC alleles
"""
df = load_dataframe(
df, peptide_column_name = load_dataframe(
filename=filename,
max_ic50=max_ic50,
sep=sep,
......@@ -236,21 +316,83 @@ def load_allele_datasets(
allele_groups = {}
for allele, group in df.groupby(allele_column_name):
ic50 = np.array(group[ic50_column_name])
Y = np.array(group["regression_output"])
peptides = list(group[peptide_column_name])
if binary_encoding:
X = hotshot_encoding(peptides, peptide_length=peptide_length)
if flatten_binary_encoding:
# collapse 3D input into 2D matrix
X = X.reshape((X.shape[0], peptide_length * 20))
else:
X = index_encoding(peptides, peptide_length=peptide_length)
assert allele not in allele_groups, \
"Duplicate datasets for %s" % allele
raw_peptides = group[peptide_column_name]
# filter lengths in case user wants to drop peptides that are longer
# or shorter than the desired fixed length
if not use_multiple_peptide_lengths:
drop_mask = raw_peptides.str.len() != peptide_length
group = group[~drop_mask]
raw_peptides = raw_peptides[~drop_mask]
# convert from a Pandas column to a list, since that's easier to
# interact with later
raw_peptides = list(raw_peptides)
# create dictionaries of outputs from which we can look up values
# after peptides have been expanded
ic50_dict = {
peptide: ic50
for (peptide, ic50)
in zip(raw_peptides, group[ic50_column_name])
}
Y_dict = {
peptide: y
for (peptide, y)
in zip(raw_peptides, group["regression_output"])
}
(kmer_peptides, original_peptides, counts, X_index, X_binary, Y) = \
encode_peptide_to_affinity_dict(
Y_dict,
peptide_length=peptide_length,
flatten_binary_encoding=flatten_binary_encoding)
ic50 = np.array([ic50_dict[p] for p in original_peptides])
assert len(kmer_peptides) == len(ic50), \
"Mismatch between # of peptides %d and # IC50 outputs %d" % (
len(kmer_peptides), len(ic50))
allele_groups[allele] = AlleleData(
X=X,
X_index=X_index,
X_binary=X_binary,
Y=Y,
ic50=ic50,
peptides=peptides)
peptides=kmer_peptides,
original_peptides=original_peptides,
original_lengths=[len(peptide) for peptide in original_peptides],
substring_counts=counts,
weights=1.0 / counts)
return allele_groups
def collapse_multiple_peptide_entries(allele_datasets):
"""
If an (allele, peptide) pair occurs multiple times then reduce it
to a single entry by taking the weighted average of affinity values.
Returns a dictionary of alleles, each of which maps to a dictionary of
peptides that map to affinity values.
"""
allele_to_peptide_to_affinity = {}
for (allele, dataset) in allele_datasets.items():
multiple_affinities = defaultdict(list)
for (peptide, normalized_affinity, weight) in zip(
dataset.peptides, dataset.Y, dataset.weights):
multiple_affinities[peptide].append((normalized_affinity, weight))
weighted_averages = {}
for peptide, affinity_weight_tuples in multiple_affinities.items():
denom = 0.0
sum_weighted_affinities = 0.0
for affinity, weight in affinity_weight_tuples:
sum_weighted_affinities += affinity * weight
denom += weight
if denom > 0:
weighted_averages[peptide] = sum_weighted_affinities / denom
allele_to_peptide_to_affinity[allele] = weighted_averages
return allele_to_peptide_to_affinity
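For example, if the same (allele, peptide) pair appears twice with normalized affinities 0.8 (weight 1.0) and 0.2 (weight 0.5), the collapsed value is (0.8 * 1.0 + 0.2 * 0.5) / (1.0 + 0.5) = 0.6.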
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
class Ensemble(object):
def __init__(self, models, name=None):
self.name = name
self.models = models
@classmethod
def from_directory(cls, directory_path):
files = os.listdir(directory_path)
......@@ -128,9 +128,10 @@ def make_hotshot_network(
output_activation="sigmoid",
dropout_probability=0.0,
optimizer=None,
learning_rate=0.001):
learning_rate=0.001,
n_amino_acids=20):
return make_network(
input_size=peptide_length * 20,
input_size=peptide_length * n_amino_acids,
layer_sizes=layer_sizes,
activation=activation,
init=init,
......