From 6d95f5018ebc7e985013e3b8aa4f900fb6d3105b Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Mon, 30 Nov 2015 14:16:06 -0600
Subject: [PATCH] Fix multi-length peptide code and evaluation of synthetic
 data

---
 ....py => best-synthetic-data-hyperparams.py} | 119 ++++++++++++------
 experiments/synthetic_data.py                 |   2 +
 mhcflurry/data.py                             |  61 +++++++--
 mhcflurry/fixed_length_peptides.py            |   6 +
 4 files changed, 139 insertions(+), 49 deletions(-)
 rename experiments/{best-synthetic-data-smoothing-coef.py => best-synthetic-data-hyperparams.py} (59%)

diff --git a/experiments/best-synthetic-data-smoothing-coef.py b/experiments/best-synthetic-data-hyperparams.py
similarity index 59%
rename from experiments/best-synthetic-data-smoothing-coef.py
rename to experiments/best-synthetic-data-hyperparams.py
index 9d235305..cd19b22a 100644
--- a/experiments/best-synthetic-data-smoothing-coef.py
+++ b/experiments/best-synthetic-data-hyperparams.py
@@ -29,6 +29,7 @@ import argparse
 import mhcflurry
 from scipy import stats
 import numpy as np
+import sklearn.metrics
 
 from common import curry_dictionary
 from dataset_paths import PETERS2009_CSV_PATH
@@ -64,74 +65,79 @@ parser.add_argument(
 
 parser.add_argument(
     "--smoothing-coefs",
-    default=[10.0 ** -power for power in np.arange(0, 5.0, 0.25)],
+    default=[0.1, 0.05, 0.025, 0.01, 0.005, 0.0025, 0.001, 0.0005, 0.0001],
     type=lambda s: [float(si.strip()) for si in s.split(",")],
     help="Smoothing value used for peptides with low weight across alleles")
 
 parser.add_argument(
-    "--similarity-exponent",
-    default=2.0,
-    type=float,
+    "--similarity-exponents",
+    default=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+    type=lambda s: [float(si.strip()) for si in s.split(",")],
     help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent")
 
 
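+# Rough sketch of the synthesis step this script evaluates (the actual
+# implementation lives in experiments/synthetic_data.py): for a target allele
+# a and peptide p, the synthetic affinity is approximately
+#
+#   y_synth(a, p) ~= sum_j(w_jp * y_jp * sim(a, j) ** exponent)
+#                    / (smoothing_coef + sum_j(w_jp * sim(a, j) ** exponent))
+#
+# where j ranges over the other alleles with measurements for p (the target
+# allele itself is excluded via exclude_alleles).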
-def evaluate_smoothing_coef(
+def evaluate_synthetic_data(
         true_data,
         curried_allele_similarities,
-        smoothing_coef):
+        smoothing_coef,
+        exponent,
+        max_ic50):
     taus = []
+    f1_scores = []
+    aucs = []
     peptide_to_affinities = create_reverse_peptide_affinity_lookup_dict(
         true_data)
     for allele, dataset in true_data.items():
-        allele_similarities = curried_allele_similarities[allele]
-        true_data_peptide_set = set(dataset.peptides)
-        true_data_peptide_list = list(dataset.peptides)
+        this_allele_similarities = curried_allele_similarities[allele]
+        this_allele_peptides = set(dataset.peptides)
         # create a peptide -> (allele, affinity, weight) dictionary restricted
         # only to the peptides for which we have data for this allele
         restricted_reverse_lookup = {
             peptide: triplet
             for (peptide, triplet) in peptide_to_affinities.items()
-            if peptide in true_data_peptide_set
+            if peptide in this_allele_peptides
         }
         synthetic_values = synthesize_affinities_for_single_allele(
-            similarities=allele_similarities,
+            similarities=this_allele_similarities,
             peptide_to_affinities=restricted_reverse_lookup,
             smoothing=smoothing_coef,
-            exponent=2.0,
+            exponent=exponent,
             exclude_alleles=[allele])
+
         synthetic_peptide_set = set(synthetic_values.keys())
-        # ordered list of peptides for which we have both true and synthetic
-        # affinity values
-        combined_peptide_list = [
+        # set of peptides for which we have both true and synthetic
+        # affinity values and for which the "true" values were not derived
+        # from elongating or shortening the sequence of another sample
+        combined_peptide_set = {
             peptide
-            for peptide in true_data_peptide_list
-            if peptide in synthetic_peptide_set
-        ]
-
+            for (peptide, original_peptide)
+            in zip(dataset.peptides, dataset.original_peptides)
+            if peptide in synthetic_peptide_set and peptide == original_peptide
+        }
+        combined_peptide_list = sorted(combined_peptide_set)
         if len(combined_peptide_list) < 2:
-            print(
-                "-- Skipping evaluation of %s due to insufficient data" % (
-                    allele,))
             continue
 
         synthetic_affinity_list = [
             synthetic_values[peptide]
             for peptide in combined_peptide_list
         ]
+
+        assert len(dataset.peptides) == len(dataset.Y), \
+            "Mismatch between # of peptides %d and # of outputs %d" % (
+                len(dataset.peptides), len(dataset.Y))
         true_affinity_dict = {
             peptide: yi
             for (peptide, yi)
             in zip(dataset.peptides, dataset.Y)
         }
+
         true_affinity_list = [
             true_affinity_dict[peptide]
             for peptide in combined_peptide_list
         ]
         assert len(true_affinity_list) == len(synthetic_affinity_list)
         if all(x == true_affinity_list[0] for x in true_affinity_list):
-            print(
-                "-- can't compute Kendall's tau for %s, all affinities same" % (
-                    allele,))
             continue
 
         tau, _ = stats.kendalltau(
@@ -139,7 +145,27 @@ def evaluate_smoothing_coef(
             true_affinity_list)
         assert not np.isnan(tau)
         taus.append(tau)
-    return np.median(taus)
+
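+        # Recover IC50 values from the regression outputs. This assumes the
+        # usual mhcflurry transform Y = 1 - log(ic50) / log(max_ic50); 500 nM
+        # is the conventional binder threshold.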
+        true_ic50s = max_ic50 ** (1.0 - np.array(true_affinity_list))
+        predicted_ic50s = max_ic50 ** (1.0 - np.array(synthetic_affinity_list))
+        true_binding_label = true_ic50s <= 500
+        if true_binding_label.all() or not true_binding_label.any():
+            # can't compute AUC or F1 without both negative and positive cases
+            continue
+        auc = sklearn.metrics.roc_auc_score(
+            true_binding_label,
+            synthetic_affinity_list)
+        aucs.append(auc)
+
+        predicted_binding_label = predicted_ic50s <= 500
+        if predicted_binding_label.all() or not predicted_binding_label.any():
+            # can't compute F1 without both positive and negative predictions
+            continue
+        f1_score = sklearn.metrics.f1_score(
+            true_binding_label,
+            predicted_binding_label)
+        f1_scores.append(f1_score)
+    return taus, aucs, f1_scores
 
 
 def create_curried_similarity_matrix(allele_to_peptide_to_affinity, min_weight=2.0):
@@ -177,21 +203,40 @@ if __name__ == "__main__":
         for (allele, dataset)
         in allele_datasets.items()
     }
-    curried_sims_dict = create_curried_similarity_matrix(allele_to_peptide_to_affinity)
+    curried_sims_dict = create_curried_similarity_matrix(
+        allele_to_peptide_to_affinity)
 
     print("Generated similarities between %d alleles" % len(curried_sims_dict))
 
     results = {}
     for smoothing_coef in args.smoothing_coefs:
-        median_tau = evaluate_smoothing_coef(
-            true_data=allele_datasets,
-            curried_allele_similarities=curried_sims_dict,
-            smoothing_coef=smoothing_coef)
-        print("Coef=%f, median Kendall tau=%f" % (
-            smoothing_coef,
-            median_tau))
-        results[smoothing_coef] = median_tau
+        for exponent in args.similarity_exponents:
+            taus, aucs, f1_scores = evaluate_synthetic_data(
+                true_data=allele_datasets,
+                curried_allele_similarities=curried_sims_dict,
+                smoothing_coef=smoothing_coef,
+                exponent=exponent,
+                max_ic50=args.max_ic50)
+            median_tau = np.median(taus)
+            median_f1 = np.median(f1_scores)
+            median_auc = np.median(aucs)
+            print(
+                "Exp=%f, Coef=%f, tau=%0.4f, AUC=%0.4f, F1=%0.4f" % (
+                    exponent,
+                    smoothing_coef,
+                    median_tau,
+                    median_auc,
+                    median_f1))
+            scores = (median_tau, median_auc, median_f1)
+            results[(exponent, smoothing_coef)] = scores
 
     print("===")
-    (best_coef, best_tau) = max(results.items(), key=lambda x: x[1])
-    print("Best coef = %f (tau = %f)" % (best_coef, best_tau))
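+    # pick the hyperparameter setting with the largest product of the three
+    # median metrics (Kendall tau, AUC, F1)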
+    ((best_exponent, best_coef), (median_tau, median_auc, median_f1)) = max(
+        results.items(),
+        key=lambda x: x[1][0] * x[1][1] * x[1][2])
+    print("Best exponent = %f, coef = %f (tau=%0.4f, AUC=%0.4f, F1=%0.4f)" % (
+        best_exponent,
+        best_coef,
+        median_tau,
+        median_auc,
+        median_f1))
diff --git a/experiments/synthetic_data.py b/experiments/synthetic_data.py
index 54d020fc..dd2b5335 100644
--- a/experiments/synthetic_data.py
+++ b/experiments/synthetic_data.py
@@ -93,6 +93,8 @@ def synthesize_affinities_for_single_allele(
         total = 0.0
         denom = 0.0
         for (allele, y, sample_weight) in affinities:
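+            # skip the target allele itself so its own measurements do not
+            # leak into the affinities synthesized for it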
+            if allele in exclude_alleles:
+                continue
             sim = similarities.get(allele, 0)
             if sim == 0:
                 continue
diff --git a/mhcflurry/data.py b/mhcflurry/data.py
index 66ccbe3c..998fadd0 100644
--- a/mhcflurry/data.py
+++ b/mhcflurry/data.py
@@ -24,9 +24,12 @@ import numpy as np
 
 from .common import normalize_allele_name
 from .peptide_encoding import (
-    fixed_length_index_encoding,
+    index_encoding,
     indices_to_hotshot_encoding,
 )
+from .fixed_length_peptides import (
+    fixed_length_from_many_peptides
+)
 
 AlleleData = namedtuple(
     "AlleleData",
@@ -266,21 +269,44 @@ def load_allele_datasets(
         # convert from a Pandas column to a list, since that's easier to
         # interact with later
         raw_peptides = list(raw_peptides)
-        # convert numberical values from a Pandas column to arrays
-        ic50 = np.array(group[ic50_column_name])
-        Y = np.array(group["regression_output"])
+        # create dictionaries of outputs from which we can look up values
+        # after peptides have been expanded
+        ic50_dict = {
+            peptide: ic50
+            for (peptide, ic50)
+            in zip(raw_peptides, group[ic50_column_name])
+        }
+        Y_dict = {
+            peptide: y
+            for (peptide, y)
+            in zip(raw_peptides, group["regression_output"])
+        }
 
-        X_index, original_peptides, counts = fixed_length_index_encoding(
-            peptides=raw_peptides,
-            desired_length=peptide_length)
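+        # expand every raw peptide into fixed-length subsequences (shortening
+        # longer peptides, extending shorter ones), keeping track of which
+        # original peptide each expansion came from and how many expansions
+        # that original produced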
+        fixed_length_peptides, original_peptides, subsequence_counts = \
+            fixed_length_from_many_peptides(
+                peptides=raw_peptides,
+                desired_length=peptide_length,
+                start_offset_shorten=0,
+                end_offset_shorten=0,
+                start_offset_extend=0,
+                end_offset_extend=0)
+        n_samples = len(fixed_length_peptides)
+        assert n_samples == len(original_peptides), \
+            "Mismatch between # of samples (%d) and # of peptides (%d)" % (
+                n_samples, len(original_peptides))
+        assert n_samples == len(subsequence_counts), \
+            "Mismatch between # of samples (%d) and # of counts (%d)" % (
+                n_samples, len(subsequence_counts))
+
+        X_index = index_encoding(fixed_length_peptides, peptide_length)
 
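+        # the index encoding maps each residue to an integer in [0, 20); the
+        # hotshot (one-hot) encoding below expands those indices into binary
+        # features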
         X_binary = indices_to_hotshot_encoding(X_index, n_indices=20)
+
         assert X_binary.shape[0] == X_index.shape[0], \
             ("Mismatch between number of samples for index encoding (%d)"
              " vs. binary encoding (%d)") % (
                 X_binary.shape[0],
                 X_index.shape[0])
-        n_samples = X_binary.shape[0]
 
         if flatten_binary_encoding:
             # collapse 3D input into 2D matrix
@@ -288,16 +314,27 @@ def load_allele_datasets(
             X_binary = X_binary.reshape((n_samples, n_binary_features))
 
         # easier to work with counts when they're an array instead of list
-        counts = np.array(counts)
+        subsequence_counts = np.array(subsequence_counts)
+
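+        # look up each expanded peptide's regression output and IC50 via the
+        # original (un-expanded) peptide it was derived from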
+        Y = np.array([Y_dict[p] for p in original_peptides])
+        ic50 = np.array([ic50_dict[p] for p in original_peptides])
+
+        assert n_samples == len(Y), \
+            "Mismatch between # peptides %d and # regression outputs %d" % (
+                n_samples, len(Y))
+
+        assert n_samples == len(ic50), \
+            "Mismatch between # of peptides %d and # IC50 outputs %d" % (
+                n_samples, len(ic50))
 
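+        # originals that expanded into several fixed-length peptides are
+        # down-weighted so each original contributes unit total weight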
         allele_groups[allele] = AlleleData(
             X_index=X_index,
             X_binary=X_binary,
             Y=Y,
             ic50=ic50,
-            peptides=raw_peptides,
+            peptides=fixed_length_peptides,
             original_peptides=original_peptides,
             original_lengths=[len(peptide) for peptide in original_peptides],
-            substring_counts=counts,
-            weights=1.0 / counts)
+            substring_counts=subsequence_counts,
+            weights=1.0 / subsequence_counts)
     return allele_groups
diff --git a/mhcflurry/fixed_length_peptides.py b/mhcflurry/fixed_length_peptides.py
index edaa4548..1ab37bdf 100644
--- a/mhcflurry/fixed_length_peptides.py
+++ b/mhcflurry/fixed_length_peptides.py
@@ -12,6 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import (
+    print_function,
+    division,
+    absolute_import,
+)
 import itertools
 import logging
 
@@ -217,6 +222,7 @@ def fixed_length_from_many_peptides(
                 start_offset=start_offset_shorten,
                 end_offset=end_offset_shorten,
                 alphabet=alphabet)
+
         n_fixed_length = len(fixed_length_peptides)
         all_fixed_length_peptides.extend(fixed_length_peptides)
         original_peptide_sequences.extend([peptide] * n_fixed_length)
-- 
GitLab