From 8c83236e33d17455b1d2a7f5d7479c9d2bb5b33b Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Fri, 20 Nov 2015 18:43:31 -0500
Subject: [PATCH] moving more flexible fixed length peptide helpers into main
 repo from experiments

---
 experiments/allele-similarities.py          | 244 ++++++++++++++++++++
 experiments/compute-allele-similarities.py  | 216 -----------------
 experiments/synthesize-augmented-dataset.py | 173 ++++++++++++++
 mhcflurry/__init__.py                       |   4 +-
 mhcflurry/common.py                         |  35 ---
 mhcflurry/data_helpers.py                   |  39 ++++
 mhcflurry/fixed_length_peptides.py          | 182 +++++++++++++++
 mhcflurry/mhc1_binding_predictor.py         |   7 +-
 mhcflurry/peptide_encoding.py               |  80 +++++++
 9 files changed, 725 insertions(+), 255 deletions(-)
 create mode 100644 experiments/allele-similarities.py
 delete mode 100644 experiments/compute-allele-similarities.py
 create mode 100644 experiments/synthesize-augmented-dataset.py
 create mode 100644 mhcflurry/fixed_length_peptides.py
 create mode 100644 mhcflurry/peptide_encoding.py

diff --git a/experiments/allele-similarities.py b/experiments/allele-similarities.py
new file mode 100644
index 00000000..b75de847
--- /dev/null
+++ b/experiments/allele-similarities.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+from fancyimpute import ConvexSolver
+import numpy as np
+import mhcflurry
+import seaborn
+
+from dataset_paths import PETERS2009_CSV_PATH
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    "--binding-data-csv",
+    default=PETERS2009_CSV_PATH)
+
+parser.add_argument(
+    "--min-overlap-weight",
+    default=1.0,
+    help="Minimum overlap weight between pair of alleles",
+    type=float)
+
+parser.add_argument(
+    "--max-ic50",
+    default=50000.0,
+    type=float)
+
+parser.add_argument(
+    "--only-human",
+    default=False,
+    action="store_true")
+
+parser.add_argument(
+    "--raw-similarities-output-path",
+    help="CSV file which contains incomplete allele similarities")
+
+parser.add_argument(
+    "--complete-similarities-output-path",
+    help="CSV file which contains allele similarities after matrix completion")
+
+parser.add_argument(
+    "--raw-heatmap-output-path",
+    help="PNG file to save heatmap of incomplete similarities matrix")
+
+parser.add_argument(
+    "--complete-heatmap-output-path",
+    help="PNG file to save heatmap of complete similarities matrix")
+
+
+def compute_similarities(
+        allele_groups,
+        min_weight=1.0,
+        allele_name_length=None):
+    sims = {}
+    overlaps = {}
+    weights = {}
+    for a, da in allele_groups.items():
+        if allele_name_length and len(a) != allele_name_length:
+            continue
+        peptide_set_a = set(da.keys())
+        for b, db in allele_groups.items():
+            if allele_name_length and len(b) != allele_name_length:
+                continue
+            peptide_set_b = set(db.keys())
+            intersection = peptide_set_a.intersection(peptide_set_b)
+            overlaps[(a, b)] = len(intersection)
+            total = 0.0
+            weight = 0.0
+            for peptide in intersection:
+                ya = da[peptide]
+                yb = db[peptide]
+                minval = min(ya, yb)
+                maxval = max(ya, yb)
+                total += minval
+                weight += maxval
+            weights[(a, b)] = weight
+            if weight > min_weight:
+                sims[(a, b)] = total / weight
+    return sims, overlaps, weights
+
+
+def save_heatmap(matrix, allele_names, filename):
+    seaborn.set_context("paper")
+
+    with seaborn.plotting_context(font_scale=1):
+        figure = seaborn.plt.figure(figsize=(20, 18))
+        ax = figure.add_subplot(1, 1, 1)
+        seaborn.heatmap(
+            data=matrix,
+            xticklabels=allele_names,
+            yticklabels=allele_names,
+            linewidths=0,
+            annot=False,
+            ax=ax,
+            fmt=".2g")
+        figure.savefig(filename)
+
+
+def save_csv(filename, sims, overlap_counts, overlap_weights):
+    with open(filename, "w") as f:
+        f.write("allele_A,allele_B,similarity,count,weight\n")
+        for (a, b), s in sorted(sims.items()):
+            count = overlap_counts.get((a, b), 0)
+            weight = overlap_weights.get((a, b), 0.0)
+            f.write("%s,%s,%0.4f,%d,%0.4f\n" % (a, b, s, count, weight))
+
+
+def print_dataset_sizes(allele_groups):
+    print("---\nDataset Sizes\n---")
+    for (allele_name, g) in sorted(allele_groups.items()):
+        print("%s: total=%d, 8mer=%d, 9mer=%d, 10mer=%d, 11mer=%d" % (
+            allele_name,
+            len(g),
+            sum(len(k) == 8 for k in g.keys()),
+            sum(len(k) == 9 for k in g.keys()),
+            sum(len(k) == 10 for k in g.keys()),
+            sum(len(k) == 11 for k in g.keys()),
+        ))
+    print("---")
+
+
+def build_incomplete_similarity_matrix(allele_groups, sims):
+    allele_list = list(sorted(allele_groups.keys()))
+    allele_order = {
+        allele_name: i
+        for (i, allele_name) in enumerate(sorted(allele_groups.keys()))
+    }
+    n_alleles = len(allele_list)
+    sims_incomplete_matrix = np.zeros((n_alleles, n_alleles), dtype=float)
+
+    for a, ai in allele_order.items():
+        for b, bi in allele_order.items():
+            # if allele pair is missing from similarities dict then
+            # fill its slot with NaN, indicating missing data
+            sims_incomplete_matrix[ai, bi] = sims.get((a, b), np.nan)
+    return allele_list, allele_order, sims_incomplete_matrix
+
+
+def matrix_to_dictionary(sims, allele_list):
+    sims_dict = {}
+    for i in range(sims.shape[0]):
+        a = allele_list[i]
+        for j in range(sims.shape[1]):
+            b = allele_list[j]
+            sims_dict[a, b] = sims[i, j]
+    return sims_dict
+
+
+def fill_in_similarities(
+        raw_sims_dict,
+        allele_datasets,
+        raw_sims_heatmap_path=None,
+        complete_sims_heatmap_path=None):
+    allele_list, allele_order, sims_matrix = build_incomplete_similarity_matrix(
+        allele_datasets,
+        sims=raw_sims_dict)
+
+    if raw_sims_heatmap_path:
+        save_heatmap(
+            sims_matrix,
+            allele_list,
+            raw_sims_heatmap_path)
+
+    print("Completing %s similarities matrix with %d missing entries" % (
+        sims_matrix.shape,
+        np.isnan(sims_matrix).sum()))
+
+    solver = ConvexSolver(
+        require_symmetric_solution=True,
+        min_value=0.0,
+        max_value=1.0,
+        error_tolerance=0.0001)
+
+    sims_complete_matrix = solver.complete(sims_matrix)
+
+    if complete_sims_heatmap_path:
+        save_heatmap(
+            sims_complete_matrix,
+            allele_list,
+            complete_sims_heatmap_path)
+
+    complete_sims_dict = matrix_to_dictionary(
+        sims=sims_complete_matrix,
+        allele_list=allele_list)
+    return complete_sims_dict
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    print(args)
+    df = mhcflurry.data_helpers.load_dataframe(
+        args.binding_data_csv,
+        max_ic50=args.max_ic50,
+        only_human=args.only_human)
+    allele_groups = {
+        allele_name: {
+            row["sequence"]: row["regression_output"]
+            for (_, row) in group.iterrows()
+        }
+        for (allele_name, group) in df.groupby("mhc")
+    }
+    print_dataset_sizes(allele_groups)
+
+    raw_sims_dict, overlap_counts, overlap_weights = compute_similarities(
+        allele_groups,
+        min_weight=args.min_overlap_weight)
+
+    if args.raw_similarities_output_path:
+        save_csv(
+            args.raw_similarities_output_path,
+            raw_sims_dict,
+            overlap_counts,
+            overlap_weights)
+
+    complete_sims_dict = fill_in_similarities(
+        raw_sims_dict=raw_sims_dict,
+        allele_datasets=allele_groups,
+        raw_sims_heatmap_path=args.raw_heatmap_output_path,
+        complete_sims_heatmap_path=args.complete_heatmap_output_path)
+
+    print("-- Added %d/%d allele similarities" % (
+        len(complete_sims_dict) - len(raw_sims_dict),
+        len(complete_sims_dict)))
+
+    if args.complete_similarities_output_path:
+        save_csv(
+            args.complete_similarities_output_path,
+            complete_sims_dict,
+            overlap_counts,
+            overlap_weights)
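
compute_similarities above scores a pair of alleles by the ratio of summed
pairwise minima to summed pairwise maxima of their affinities on shared
peptides. A minimal sketch of that metric on made-up toy values (the peptides
and numbers below are illustrative, not taken from the patch):

    da = {"SIINFEKLL": 0.8, "AAAWYLKAA": 0.2}
    db = {"SIINFEKLL": 0.6, "AAAWYLKAA": 0.5, "QQQQQQQQQ": 0.9}
    shared = set(da) & set(db)
    numerator = sum(min(da[p], db[p]) for p in shared)    # 0.6 + 0.2 = 0.8
    denominator = sum(max(da[p], db[p]) for p in shared)  # 0.8 + 0.5 = 1.3
    similarity = numerator / denominator                  # ~0.62
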
diff --git a/experiments/compute-allele-similarities.py b/experiments/compute-allele-similarities.py
deleted file mode 100644
index 290e4a7a..00000000
--- a/experiments/compute-allele-similarities.py
+++ /dev/null
@@ -1,216 +0,0 @@
-import argparse
-
-import fancyimpute
-import numpy as np
-import mhcflurry
-
-from dataset_paths import PETERS2009_CSV_PATH
-
-parser = argparse.ArgumentParser()
-
-parser.add_argument(
-    "--binding-data-csv",
-    default=PETERS2009_CSV_PATH)
-
-parser.add_argument(
-    "--fill-missing-values",
-    action="store_true",
-    default=False)
-
-
-parser.add_argument(
-    "--min-overlap-weight",
-    default=2.0,
-    help="Minimum overlap weight between pair of alleles")
-
-parser.add_argument(
-    "--max-ic50",
-    default=50000.0)
-
-parser.add_argument(
-    "--output-csv",
-    required=True)
-
-def binary_encode(X, n_indices=20):
-    n_cols = X.shape[1]
-    X_encode = np.zeros( (len(X), n_indices * n_cols), dtype=float)
-    for i in range(len(X)):
-        for col_idx in range(n_cols):
-            X_encode[i, col_idx*n_indices + X[i, col_idx]] = True
-    return X_encode
-
-def prepare_incomplete_matrix(datasets_dict, prefix=None, allele_name_length=5):
-    if allele_name_length:
-        datasets_dict = {k:d for (k,d) in datasets_dict.items() if len(k) == allele_name_length}
-    if prefix:
-        datasets_dict = {k:d for (k,d) in datasets_dict.items() if k.startswith(prefix)}
-    if len(datasets_dict) == 0:
-        raise ValueError("No alleles matched criteria")
-    all_peptides = set([])
-    allele_list = []
-    for k,d in sorted(datasets_dict.items()):
-        print(k, len(d.peptides))
-        allele_list.append(k)
-        for p in d.peptides:
-            all_peptides.add(p)
-    peptide_list = list(sorted(all_peptides))
-    allele_order = {a:i for (i,a) in enumerate(allele_list)}
-    peptide_order = {p:i for (i,p) in enumerate(peptide_list)}
-    n_alleles = len(allele_order)
-    n_peptides = len(peptide_order)
-    incomplete_affinity_matrix = np.ones((n_alleles, n_peptides), dtype=float) * np.nan
-    for k,d in datasets_dict.items():
-        if k not in allele_order:
-            continue
-        i = allele_order[k]
-        for p,y in zip(d.peptides, d.Y):
-            j = peptide_order[p]
-            incomplete_affinity_matrix[i,j] = y
-    print("Total # alleles = %d, peptides = %d" % (n_alleles, n_peptides))
-    return incomplete_affinity_matrix, allele_order, peptide_order
-
-def get_extra_data(allele, train_datasets, expanded_predictions):
-    original_dataset = train_datasets[allele]
-    original_peptides = set(original_dataset.peptides)
-    expanded_allele_affinities = expanded_predictions[allele]
-    extra_affinities = {k: v for (k, v) in expanded_allele_affinities.items() if k not in original_peptides}
-    extra_peptides = list(extra_affinities.keys())
-    extra_values = list(extra_affinities.values())
-    extra_X = mhcflurry.data_helpers.index_encoding(extra_peptides, peptide_length=9)
-    extra_Y = np.array(extra_values)
-    return extra_X, extra_Y
-
-def WHO_KNOWS_WHAT():
-
-
-    smoothing = 0.005
-    exponent = 2
-    all_predictions = defaultdict(dict)
-    for allele, allele_idx in allele_order.items():
-        for peptide in peptide_order.keys():
-            predictions = reverse_lookups[peptide]
-            total_value = 0.0
-            total_denom = 0.0
-            for (other_allele, y) in predictions:
-                key = (allele,other_allele)
-                if key not in completed_sims_dict:
-                    continue
-                sim = completed_sims_dict[key]
-                weight = sim ** exponent
-                total_value += weight * y
-                total_denom += weight
-            if total_denom > 0.0:
-                all_predictions[allele][peptide] = total_value / (smoothing + total_denom)
-
-
-def data_augmentation(X, Y, extra_X, extra_Y,
-                      fraction=0.5,
-                      niters=10,
-                      extra_sample_weight=0.1,
-                      nb_epoch=50,
-                      nn=True,
-                      hidden_layer_size=5):
-    n = len(Y)
-    aucs = []
-    f1s = []
-    n_originals = []
-    for _ in range(niters):
-        mask = np.random.rand(n) <= fraction
-        X_train = X[mask]
-        X_test = X[~mask]
-        Y_train = Y[mask]
-        Y_test = Y[~mask]
-        test_ic50 = 20000 ** (1-Y_test)
-        test_label = test_ic50 <= 500
-        if test_label.all() or not test_label.any():
-            continue
-        n_original = mask.sum()
-        print("Keeping %d original training samples" % n_original)
-        X_train_combined = np.vstack([X_train, extra_X])
-        Y_train_combined = np.concatenate([Y_train, extra_Y])
-        print("Combined shape: %s" % (X_train_combined.shape,))
-        assert len(X_train_combined) == len(Y_train_combined)
-        # initialize weights to count synthesized and actual data equally
-        # but weight on synthesized points will decay across training epochs
-        weight = np.ones(len(Y_train_combined))
-        if nn:
-            model = mhcflurry.feedforward.make_embedding_network(
-                layer_sizes=[hidden_layer_size],
-                embedding_output_dim=10,
-                activation="tanh")
-            for i in range(nb_epoch):
-                # decay weight as training progresses
-                weight[n_original:] = extra_sample_weight if extra_sample_weight is not None else 1.0 / (i+1)**2
-                model.fit(
-                    X_train_combined,
-                    Y_train_combined,
-                    sample_weight=weight,
-                    shuffle=True,
-                    nb_epoch=1,
-                    verbose=0)
-            pred = model.predict(X_test)
-        else:
-            model = sklearn.linear_model.Ridge(alpha=5)
-            X_train_combined_binary = binary_encode(X_train_combined)
-            X_test_binary = binary_encode(X_test)
-            model.fit(X_train_combined_binary, Y_train_combined, sample_weight=weight)
-            pred = model.predict(X_test_binary)
-        pred_ic50 = 20000 ** (1-pred)
-        pred_label =  pred_ic50 <= 500
-        mse = sklearn.metrics.mean_squared_error(Y_test, pred)
-        auc = sklearn.metrics.roc_auc_score(test_label, pred)
-
-        if pred_label.all() or not pred_label.any():
-            f1 = 0
-        else:
-            f1 = sklearn.metrics.f1_score(test_label, pred_label)
-        print("MSE=%0.4f, AUC=%0.4f, F1=%0.4f" % (mse, auc, f1))
-        n_originals.append(n_original)
-        aucs.append(auc)
-        f1s.append(f1)
-    return aucs, f1s, n_originals
-
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-
-    datasets = mhcflurry.data_helpers.load_data(
-        args.binding_data_csv,
-        binary_encoding=True,
-        max_ic50=args.max_ic50)
-
-    sims = {}
-    overlaps = {}
-    weights = {}
-    for allele_name_a, da in train_datasets.items():
-        if len(allele_name_a) != 5:
-            continue
-        y_dict_a = {peptide: y for (peptide,y) in zip(da.peptides, da.Y)}
-        seta = set(da.peptides)
-        for allele_name_b, db in train_datasets.items():
-            if len(allele_name_b) != 5:
-                continue
-            y_dict_b = {peptide: y for (peptide,y) in zip(db.peptides, db.Y)}
-            setb = set(db.peptides)
-            intersection = seta.intersection(setb)
-            overlaps[a,b] = len(intersection)
-            total = 0.0
-            weight = 0.0
-            for peptide in intersection:
-                ya = y_dict_a[peptide]
-                yb = y_dict_b[peptide]
-                minval = min(ya, yb)
-                maxval = max(ya, yb)
-                total += minval
-                weight += maxval
-            weights[a,b] = weight
-            if weight > min_weight:
-                sims[allele_name_a, allele_name_b] = total / weight
-            else:
-                sims[allele_name_a, allele_name_b] = np.nan
-
-    if args.fill_missing_values:
-        sims_array = np.zeros((n_alleles, n_alleles), dtype=float)
-        for i, a in enumerate(allele_names):
-            for j, b in enumerate(allele_names):
-                sims_array[i,j] = sims[(a,b)]
diff --git a/experiments/synthesize-augmented-dataset.py b/experiments/synthesize-augmented-dataset.py
new file mode 100644
index 00000000..12c4e9be
--- /dev/null
+++ b/experiments/synthesize-augmented-dataset.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+import argparse
+
+import numpy as np
+import mhcflurry
+import sklearn.metrics
+import pandas as pd
+
+from dataset_paths import PETERS2009_CSV_PATH
+
+EMBEDDING_DIM_SIZES = [5, 10, 20, 40]
+HIDDEN_LAYER_SIZES = [4, 8, 16, 32, 64, 128, 256]
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    "--binding-data-csv",
+    default=PETERS2009_CSV_PATH)
+
+parser.add_argument(
+    "--max-ic50",
+    default=50000.0,
+    type=float)
+
+parser.add_argument(
+    "--only-human",
+    default=False,
+    action="store_true")
+
+parser.add_argument(
+    "--allele-similarity-csv",
+    required=True)
+
+def get_extra_data(allele, train_datasets, expanded_predictions):
+    original_dataset = train_datasets[allele]
+    original_peptides = set(original_dataset.peptides)
+    expanded_allele_affinities = expanded_predictions[allele]
+    extra_affinities = {
+        k: v
+        for (k, v) in expanded_allele_affinities.items()
+        if k not in original_peptides
+    }
+    extra_peptides = list(extra_affinities.keys())
+    extra_values = list(extra_affinities.values())
+    extra_X = mhcflurry.data_helpers.index_encoding(extra_peptides, peptide_length=9)
+    extra_Y = np.array(extra_values)
+    return extra_X, extra_Y
+
+
+def augment_affinity_predictions(
+        sims_dict,
+        peptide_to_affinities,
+        smoothing=0.005,
+        exponent=2.0):
+    all_predictions = defaultdict(dict)
+    for allele in {allele for (allele, _) in sims_dict.keys()}:
+        for peptide, affinities in peptide_to_affinities.items():
+            total = 0.0
+            denom = 0.0
+            for (other_allele, y) in affinities:
+                key = (allele, other_allele)
+                sim = sims_dict.get(key, 0)
+                if sim == 0:
+                    continue
+                weight = sim ** exponent
+                total += weight * y
+                denom += weight
+            if denom > 0.0:
+                all_predictions[allele][peptide] = total / (smoothing + denom)
+    return all_predictions
+
+
+def data_augmentation(
+        X,
+        Y,
+        extra_X,
+        extra_Y,
+        fraction=0.5,
+        niters=10,
+        extra_sample_weight=0.1,
+        nb_epoch=50,
+        nn=True,
+        hidden_layer_size=5,
+        max_ic50=50000.0):
+    n = len(Y)
+    aucs = []
+    f1s = []
+    n_originals = []
+    for _ in range(niters):
+        mask = np.random.rand(n) <= fraction
+        X_train = X[mask]
+        X_test = X[~mask]
+        Y_train = Y[mask]
+        Y_test = Y[~mask]
+        test_ic50 = max_ic50 ** (1 - Y_test)
+        test_label = test_ic50 <= 500
+        if test_label.all() or not test_label.any():
+            continue
+        n_original = mask.sum()
+        print("Keeping %d original training samples" % n_original)
+        X_train_combined = np.vstack([X_train, extra_X])
+        Y_train_combined = np.concatenate([Y_train, extra_Y])
+        print("Combined shape: %s" % (X_train_combined.shape,))
+        assert len(X_train_combined) == len(Y_train_combined)
+        # synthesized points either get a fixed smaller weight
+        # (extra_sample_weight) or, if that is None, one that decays per epoch
+        weight = np.ones(len(Y_train_combined))
+        model = mhcflurry.feedforward.make_embedding_network(
+            layer_sizes=[hidden_layer_size],
+            embedding_output_dim=10,
+            activation="tanh")
+        for i in range(nb_epoch):
+            # decay weight as training progresses
+            weight[n_original:] = (
+                extra_sample_weight
+                if extra_sample_weight is not None
+                else 1.0 / (i + 1) ** 2
+            )
+            model.fit(
+                X_train_combined,
+                Y_train_combined,
+                sample_weight=weight,
+                shuffle=True,
+                nb_epoch=1,
+                verbose=0)
+        pred = model.predict(X_test)
+
+        pred_ic50 = max_ic50 ** (1 - pred)
+        pred_label = pred_ic50 <= 500
+        mse = sklearn.metrics.mean_squared_error(Y_test, pred)
+        auc = sklearn.metrics.roc_auc_score(test_label, pred)
+
+        if pred_label.all() or not pred_label.any():
+            f1 = 0
+        else:
+            f1 = sklearn.metrics.f1_score(test_label, pred_label)
+        print("MSE=%0.4f, AUC=%0.4f, F1=%0.4f" % (mse, auc, f1))
+        n_originals.append(n_original)
+        aucs.append(auc)
+        f1s.append(f1)
+    return aucs, f1s, n_originals
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    print(args)
+    sims_df = pd.read_csv(args.allele_similarity_csv)
+    df = mhcflurry.data_helpers.load_dataframe(
+        args.binding_data_csv,
+        max_ic50=args.max_ic50,
+        only_human=args.only_human)
+    allele_groups = {
+        allele_name: {
+            row["sequence"]: row["regression_output"]
+            for (_, row) in group.iterrows()
+        }
+        for (allele_name, group) in df.groupby("mhc")
+    }
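
The __main__ block above stops after loading the similarity CSV and the
per-allele affinity dicts. One way the helpers defined in this file might be
wired together is sketched below; the column names follow save_csv in
allele-similarities.py, everything else is an assumed continuation rather than
part of the patch:

    # hypothetical continuation of the script's __main__ block
    sims_dict = {
        (row["allele_A"], row["allele_B"]): row["similarity"]
        for (_, row) in sims_df.iterrows()
    }
    # invert allele -> peptide -> affinity into peptide -> [(allele, affinity)]
    peptide_to_affinities = defaultdict(list)
    for allele, peptide_to_y in allele_groups.items():
        for peptide, y in peptide_to_y.items():
            peptide_to_affinities[peptide].append((allele, y))
    expanded_predictions = augment_affinity_predictions(
        sims_dict, peptide_to_affinities)
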
diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py
index c1a90808..5f7b272d 100644
--- a/mhcflurry/__init__.py
+++ b/mhcflurry/__init__.py
@@ -16,12 +16,14 @@ from . import paths
 from . import data_helpers
 from . import feedforward
 from . import common
+from . import fixed_length_peptides
 from .mhc1_binding_predictor import Mhc1BindingPredictor
 
 __all__ = [
     "paths",
     "data_helpers",
     "feedforward",
+    "fixed_length_peptides",
     "common",
     "Mhc1BindingPredictor"
-]
\ No newline at end of file
+]
diff --git a/mhcflurry/common.py b/mhcflurry/common.py
index 93337864..8623aa97 100644
--- a/mhcflurry/common.py
+++ b/mhcflurry/common.py
@@ -17,7 +17,6 @@ from __future__ import (
     division,
     absolute_import,
 )
-from .amino_acid import amino_acid_letters
 
 
 def parse_int_list(s):
@@ -64,37 +63,3 @@ def split_allele_names(s):
         for part
         in s.split(",")
     ]
-
-
-def expand_9mer_peptides(peptides, length):
-    """
-    Expand non-9mer peptides using methods from
-       Accurate approximation method for prediction of class I MHC
-       affinities for peptides of length 8, 10 and 11 using prediction
-       tools trained on 9mers.
-    by Lundegaard et. al.
-    http://bioinformatics.oxfordjournals.org/content/24/11/1397
-    """
-    assert len(peptides) > 0
-    if length < 8:
-        raise ValueError("Invalid peptide length: %d (%s)" % (
-            length, peptides[0]))
-    elif length == 9:
-        return peptides
-    elif length == 8:
-        # extend each peptide by inserting every possible amino acid
-        # between base-1 positions 4-8
-        return [
-            peptide[:i] + extra_amino_acid + peptide[i:]
-            for peptide in peptides
-            for i in range(3, 8)
-            for extra_amino_acid in amino_acid_letters
-        ]
-    else:
-        # drop interior residues between base-1 positions 4 to last
-        n_skip = length - 9
-        return [
-            peptide[:i] + peptide[i + n_skip:]
-            for peptide in peptides
-            for i in range(3, 9)
-        ]
diff --git a/mhcflurry/data_helpers.py b/mhcflurry/data_helpers.py
index 9fd8834d..be40e8c6 100644
--- a/mhcflurry/data_helpers.py
+++ b/mhcflurry/data_helpers.py
@@ -171,6 +171,45 @@ def load_dataframe(
     return df
 
 
+def load_allele_dicts(
+        filename,
+        max_ic50=50000.0,
+        regression_output=False,
+        sep=None,
+        species_column_name="species",
+        allele_column_name="mhc",
+        peptide_column_name="sequence",
+        peptide_length_column_name="peptide_length",
+        ic50_column_name="meas",
+        only_human=True):
+    """
+    Parse a CSV of binding data into a dictionary of dictionaries.
+    The outer key is an allele name, the inner key is a peptide sequence, and
+    the inner value is an IC50 or a log-transformed affinity in [0, 1].
+    """
+    binding_df = load_dataframe(
+        filename=filename,
+        max_ic50=max_ic50,
+        sep=sep,
+        species_column_name=species_column_name,
+        allele_column_name=allele_column_name,
+        peptide_column_name=peptide_column_name,
+        peptide_length_column_name=peptide_length_column_name,
+        ic50_column_name=ic50_column_name,
+        only_human=only_human)
+    return {
+        allele_name: {
+            row[peptide_column_name]: (
+                row["regression_output"]
+                if regression_output
+                else row[ic50_column_name]
+            )
+            for (_, row) in group.iterrows()
+        }
+        for (allele_name, group) in binding_df.groupby(allele_column_name)
+    }
+
+
 def load_allele_datasets(
         filename,
         peptide_length=9,
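
load_allele_dicts (added above) returns a nested dictionary keyed first by
allele and then by peptide. A small usage sketch, assuming a binding-data CSV
at a placeholder path "bdata.csv":

    # hypothetical usage; "bdata.csv" is a placeholder path
    from mhcflurry.data_helpers import load_allele_dicts

    allele_to_peptide_to_affinity = load_allele_dicts(
        "bdata.csv",
        max_ic50=50000.0,
        regression_output=True)
    for allele, peptide_dict in allele_to_peptide_to_affinity.items():
        print("%s: %d peptides" % (allele, len(peptide_dict)))
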
diff --git a/mhcflurry/fixed_length_peptides.py b/mhcflurry/fixed_length_peptides.py
new file mode 100644
index 00000000..d5ef0a69
--- /dev/null
+++ b/mhcflurry/fixed_length_peptides.py
@@ -0,0 +1,182 @@
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import itertools
+
+from .amino_acid import amino_acid_letters
+
+
+def all_kmers(k, alphabet=amino_acid_letters):
+    """
+    Generates all k-mer peptide sequences
+    """
+    return [
+        "".join(combination)
+        for combination
+        in itertools.product(alphabet, repeat=k)
+    ]
+
+
+def extend_peptide(
+        peptide,
+        desired_length,
+        start_offset,
+        end_offset,
+        alphabet=amino_acid_letters):
+    """Extend peptide by inserting every possible amino acid combination
+    if we're trying to e.g. turn an 8mer into 9mers
+    """
+    n = len(peptide)
+    assert n < desired_length
+    n_missing = desired_length - n
+    if n_missing > 3:
+        raise ValueError(
+            "Cannot transform %s of length %d into a %d-mer peptide" % (
+                peptide, n, desired_length))
+    return [
+        peptide[:i] + extra + peptide[i:]
+        for i in range(start_offset, n - end_offset + 1)
+        for extra in all_kmers(n_missing, alphabet=alphabet)
+    ]
+
+
+def shorten_peptide(
+        peptide,
+        desired_length,
+        start_offset,
+        end_offset,
+        alphabet=amino_acid_letters):
+    """Shorten peptide if trying to e.g. turn 10mer into 9mers"""
+    n = len(peptide)
+    assert n > desired_length
+    n_skip = n - desired_length
+    assert n_skip > 0, \
+        "Expected length of peptide %s (%d) to be greater than %d" % (
+            peptide, n, desired_length)
+    end_range = n - end_offset - n_skip + 1
+    return [
+        peptide[:i] + peptide[i + n_skip:]
+        for i in range(start_offset, end_range)
+    ]
+
+
+def fixed_length_from_single_peptide(
+        peptide,
+        desired_length,
+        start_offset_extend=2,
+        end_offset_extend=1,
+        start_offset_shorten=2,
+        end_offset_shorten=0,
+        alphabet=amino_acid_letters):
+    """
+    Create a set of fixed-length k-mer peptides from a single peptide of any
+    length. Shorter peptides are extended by inserting every possible amino acid
+    combination at positions [start_offset_extend, length - end_offset_extend].
+
+    We can recreate the methods from:
+       Accurate approximation method for prediction of class I MHC
+       affinities for peptides of length 8, 10 and 11 using prediction
+       tools trained on 9mers.
+    by Lundegaard et al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397)
+    with the following settings:
+        - desired_length = 9
+        - start_offset_extend = 2
+        - end_offset_extend = 1
+        - start_offset_shorten = 2
+        - end_offset_shorten = 0
+    """
+    n = len(peptide)
+    if n == desired_length:
+        return [peptide]
+    elif n < desired_length:
+        return extend_peptide(
+            peptide=peptide,
+            desired_length=desired_length,
+            start_offset=start_offset_extend,
+            end_offset=end_offset_extend,
+            alphabet=alphabet)
+    else:
+        return shorten_peptide(
+            peptide=peptide,
+            desired_length=desired_length,
+            start_offset=start_offset_shorten,
+            end_offset=end_offset_shorten,
+            alphabet=alphabet)
+
+
+def fixed_length_from_many_peptides(
+        peptides,
+        desired_length,
+        start_offset_extend=2,
+        end_offset_extend=1,
+        start_offset_shorten=2,
+        end_offset_shorten=0,
+        alphabet=amino_acid_letters):
+    """
+    Create a set of fixed-length k-mer peptides from a collection of varying
+    length peptides. Shorter peptides are filled in using all possible amino
+    acids at any insertion site between
+        [start_offset_extend, length - end_offset_extend].
+    Longer peptides are made smaller by deleting contiguous residues between
+        [start_offset_shorten, length - end_offset_shorten)
+
+    We can recreate the methods from:
+       Accurate approximation method for prediction of class I MHC
+       affinities for peptides of length 8, 10 and 11 using prediction
+       tools trained on 9mers.
+    by Lundegaard et al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397)
+    with the following settings:
+        - desired_length = 9
+        - start_offset_extend = 2
+        - end_offset_extend = 1
+        - start_offset_shorten = 2
+        - end_offset_shorten = 0
+
+    Returns three lists:
+        - a list of fixed length peptides (all of length `desired_length`)
+        - a list of the original peptides from which each fixed-length peptide
+          was derived
+        - a list of integers indicating the number of fixed-length peptides
+          generated from each variable-length peptide.
+
+    Example:
+        kmers, original, counts = fixed_length_from_many_peptides(
+            peptides=["ABC", "A"]
+            desired_length=2,
+            start_offset_extend=0,
+            end_offset_extend=0,
+            start_offset_shorten=0,
+            end_offset_shorten=0,
+            alphabet="ABC")
+        kmers == ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"]
+        original == ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"]
+        counts == [3, 3, 3, 6, 6, 6, 6, 6, 6]
+    """
+    fixed_length_peptides = []
+    original_peptide_sequences = []
+    number_expanded = []
+    for peptide in peptides:
+        fixed_length_for_peptide = fixed_length_from_single_peptide(
+            peptide,
+            desired_length=desired_length,
+            start_offset_extend=start_offset_extend,
+            end_offset_extend=end_offset_extend,
+            start_offset_shorten=start_offset_shorten,
+            end_offset_shorten=end_offset_shorten,
+            alphabet=alphabet)
+        n_fixed_length = len(fixed_length_for_peptide)
+        fixed_length_peptides.extend(fixed_length_for_peptide)
+        original_peptide_sequences.extend([peptide] * n_fixed_length)
+        number_expanded.extend([n_fixed_length] * n_fixed_length)
+    return fixed_length_peptides, original_peptide_sequences, number_expanded
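
A short usage sketch of the single-peptide helper added above, with arbitrary
example peptides (the exact number of results depends on the offset settings):

    # illustrative usage; the peptides are arbitrary examples
    from mhcflurry.fixed_length_peptides import fixed_length_from_single_peptide

    # 10-mer contracted to 9-mers by deleting one interior residue at a time
    nine_mers_from_ten = fixed_length_from_single_peptide(
        "AVFDRKSDAK", desired_length=9)
    # 8-mer expanded to 9-mers by inserting each amino acid at interior positions
    nine_mers_from_eight = fixed_length_from_single_peptide(
        "SIINFEKL", desired_length=9)
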
diff --git a/mhcflurry/mhc1_binding_predictor.py b/mhcflurry/mhc1_binding_predictor.py
index ffde0cbe..b037b2d6 100644
--- a/mhcflurry/mhc1_binding_predictor.py
+++ b/mhcflurry/mhc1_binding_predictor.py
@@ -32,7 +32,7 @@ from keras.models import model_from_config
 from .class1_allele_specific_hyperparameters import MAX_IC50
 from .data_helpers import index_encoding, normalize_allele_name
 from .paths import CLASS1_MODEL_DIRECTORY
-from .common import expand_9mer_peptides
+from .fixed_length_peptides import fixed_length_from_many_peptides
 
 _allele_model_cache = {}
 
@@ -111,8 +111,9 @@ class Mhc1BindingPredictor(object):
             results[column_name] = []
 
         for length, group_peptides in groupby(peptides, lambda x: len(x)):
-            group_peptides = list(group_peptides)
-            expanded_peptides = expand_9mer_peptides(group_peptides, length)
+            group_peptides = list(group_peptides)
+            expanded_peptides, _, _ = fixed_length_from_many_peptides(
+                peptides=group_peptides, desired_length=length)
             n_group = len(group_peptides)
             n_expanded = len(expanded_peptides)
             expansion_factor = int(n_expanded / n_group)
diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py
new file mode 100644
index 00000000..db20c8ea
--- /dev/null
+++ b/mhcflurry/peptide_encoding.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from .amino_acid import amino_acid_letter_indices
+
+
+def hotshot_encoding(peptides, peptide_length):
+    """
+    Encode a set of equal length peptides as a binary matrix,
+    where each letter is transformed into a length 20 vector with a single
+    element that is 1 (and the others are 0).
+    """
+    shape = (len(peptides), peptide_length, 20)
+    X = np.zeros(shape, dtype=bool)
+    for i, peptide in enumerate(peptides):
+        for j, amino_acid in enumerate(peptide):
+            k = amino_acid_letter_indices[amino_acid]
+            X[i, j, k] = 1
+    return X
+
+
+def index_encoding(peptides, peptide_length):
+    """
+    Encode a set of equal length peptides as a vector of their
+    amino acid indices.
+    """
+    X = np.zeros((len(peptides), peptide_length), dtype=int)
+    for i, peptide in enumerate(peptides):
+        for j, amino_acid in enumerate(peptide):
+            X[i, j] = amino_acid_letter_indices[amino_acid]
+    return X
+
+
+def index_encoding_of_substrings(
+        peptides,
+        substring_length,
+        delete_exclude_start=0,
+        delete_exclude_end=0):
+    """
+    Take peptides of varying lengths, chop them into substrings of fixed
+    length and apply index encoding to these substrings.
+
+    If a string is longer than the substring length, then it's reduced to
+    the desired length by deleting characters at all possible positions.
+    If positions at the start or end of a string should be exempt from deletion
+    then the number of exempt characters can be controlled via
+    `delete_exclude_start` and `delete_exclude_end`.
+
+    Returns feature matrix X and a vector of substring counts.
+    """
+    raise NotImplementedError("index_encoding_of_substrings is not implemented yet")
+
+
+def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0):
+    """
+    Given an (n_samples, peptide_length) integer matrix
+    convert it to a binary encoding of shape:
+        (n_samples, peptide_length * n_indices)
+    """
+    (n_samples, peptide_length) = X.shape
+    if not n_indices:
+        n_indices = X.max() - first_index_value + 1
+
+    X_binary = np.zeros((n_samples, peptide_length * n_indices), dtype=bool)
+    for i, row in enumerate(X):
+        for j, xij in enumerate(row):
+            X_binary[i, n_indices * j + xij - first_index_value] = 1
+    return X_binary.astype(float)
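
The encoders added above are related: index_encoding produces an
(n_samples, peptide_length) matrix of letter indices, and
indices_to_hotshot_encoding expands it into an
(n_samples, peptide_length * n_indices) binary matrix, which matches a
flattened hotshot_encoding. A small sketch with two arbitrary 9-mers:

    # illustrative usage of the encoders defined in peptide_encoding.py
    import numpy as np
    from mhcflurry.peptide_encoding import (
        hotshot_encoding, index_encoding, indices_to_hotshot_encoding)

    peptides = ["SIINFEKLL", "AAAAAAAAA"]
    X_index = index_encoding(peptides, peptide_length=9)           # (2, 9)
    X_binary = indices_to_hotshot_encoding(X_index, n_indices=20)  # (2, 180)
    X_hot = hotshot_encoding(peptides, peptide_length=9)           # (2, 9, 20)
    assert np.array_equal(X_binary, X_hot.reshape(2, -1).astype(float))
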
-- 
GitLab