Commit 6d95f501 authored by Alex Rubinsteyn

lots of fixes to multi-length peptide code and evaluation of synthetic data

parent 07cba541
@@ -29,6 +29,7 @@ import argparse
 import mhcflurry
 from scipy import stats
 import numpy as np
+import sklearn.metrics

 from common import curry_dictionary
 from dataset_paths import PETERS2009_CSV_PATH
@@ -64,74 +65,79 @@ parser.add_argument(
 parser.add_argument(
     "--smoothing-coefs",
-    default=[10.0 ** -power for power in np.arange(0, 5.0, 0.25)],
+    default=[0.1, 0.025, 0.05, 0.01, 0.0025, 0.005, 0.001, 0.0005, 0.0001],
     type=lambda s: [float(si.strip()) for si in s.split(",")],
     help="Smoothing value used for peptides with low weight across alleles")

 parser.add_argument(
-    "--similarity-exponent",
-    default=2.0,
-    type=float,
+    "--similarity-exponents",
+    default=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+    type=lambda s: [float(si.strip()) for si in s.split(",")],
     help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent")


-def evaluate_smoothing_coef(
+def evaluate_synthetic_data(
         true_data,
         curried_allele_similarities,
-        smoothing_coef):
+        smoothing_coef,
+        exponent,
+        max_ic50):
     taus = []
+    f1_scores = []
+    aucs = []
     peptide_to_affinities = create_reverse_peptide_affinity_lookup_dict(
         true_data)
     for allele, dataset in true_data.items():
-        allele_similarities = curried_allele_similarities[allele]
-        true_data_peptide_set = set(dataset.peptides)
-        true_data_peptide_list = list(dataset.peptides)
+        this_allele_similarities = curried_allele_similarities[allele]
+        this_allele_peptides = set(dataset.peptides)
         # create a peptide -> (allele, affinity, weight) dictionary restricted
         # only to the peptides for which we have data for this allele
         restricted_reverse_lookup = {
             peptide: triplet
             for (peptide, triplet) in peptide_to_affinities.items()
-            if peptide in true_data_peptide_set
+            if peptide in this_allele_peptides
         }
         synthetic_values = synthesize_affinities_for_single_allele(
-            similarities=allele_similarities,
+            similarities=this_allele_similarities,
             peptide_to_affinities=restricted_reverse_lookup,
             smoothing=smoothing_coef,
-            exponent=2.0,
+            exponent=exponent,
             exclude_alleles=[allele])
         synthetic_peptide_set = set(synthetic_values.keys())
-        # ordered list of peptides for which we have both true and synthetic
-        # affinity values
-        combined_peptide_list = [
+        # set of peptides for which we have both true and synthetic
+        # affinity values and for which the "true" values were not derived
+        # from elongating or shortening the sequence of another sample
+        combined_peptide_set = {
             peptide
-            for peptide in true_data_peptide_list
-            if peptide in synthetic_peptide_set
-        ]
+            for (peptide, original_peptide)
+            in zip(dataset.peptides, dataset.original_peptides)
+            if peptide in synthetic_peptide_set and peptide == original_peptide
+        }
+        combined_peptide_list = list(sorted(combined_peptide_set))
         if len(combined_peptide_list) < 2:
+            print(
+                "-- Skipping evaluation of %s due to insufficient data" % (
+                    allele,))
             continue
         synthetic_affinity_list = [
             synthetic_values[peptide]
             for peptide in combined_peptide_list
         ]
+        assert len(dataset.peptides) == len(dataset.Y), \
+            "Mismatch between # of peptides %d and # of outputs %d" % (
+                len(dataset.peptides), len(dataset.Y))
         true_affinity_dict = {
             peptide: yi
             for (peptide, yi)
             in zip(dataset.peptides, dataset.Y)
         }
         true_affinity_list = [
             true_affinity_dict[peptide]
             for peptide in combined_peptide_list
         ]
         assert len(true_affinity_list) == len(synthetic_affinity_list)
         if all(x == true_affinity_list[0] for x in true_affinity_list):
+            print(
+                "-- can't compute Kendall's tau for %s, all affinities same" % (
+                    allele,))
             continue

         tau, _ = stats.kendalltau(
@@ -139,7 +145,27 @@ def evaluate_smoothing_coef(
             true_affinity_list)
         assert not np.isnan(tau)
         taus.append(tau)
-    return np.median(taus)
+
+        true_ic50s = max_ic50 ** np.array(true_affinity_list)
+        predicted_ic50s = max_ic50 ** np.array(synthetic_affinity_list)
+        true_binding_label = true_ic50s <= 500
+        if true_binding_label.all() or not true_binding_label.any():
+            # can't compute AUC or F1 without both negative and positive cases
+            continue
+        auc = sklearn.metrics.roc_auc_score(
+            true_binding_label,
+            synthetic_affinity_list)
+        aucs.append(auc)
+        predicted_binding_label = predicted_ic50s <= 500
+        if predicted_binding_label.all() or not predicted_binding_label.any():
+            # can't compute F1 without both positive and negative predictions
+            continue
+        f1_score = sklearn.metrics.f1_score(
+            true_binding_label,
+            predicted_binding_label)
+        f1_scores.append(f1_score)
+    return taus, aucs, f1_scores


 def create_curried_similarity_matrix(allele_to_peptide_to_affinity, min_weight=2.0):
@@ -177,21 +203,40 @@ if __name__ == "__main__":
         for (allele, dataset)
         in allele_datasets.items()
     }
-    curried_sims_dict = create_curried_similarity_matrix(allele_to_peptide_to_affinity)
+    curried_sims_dict = create_curried_similarity_matrix(
+        allele_to_peptide_to_affinity)
     print("Generated similarities between %d alleles" % len(curried_sims_dict))
     results = {}
     for smoothing_coef in args.smoothing_coefs:
-        median_tau = evaluate_smoothing_coef(
-            true_data=allele_datasets,
-            curried_allele_similarities=curried_sims_dict,
-            smoothing_coef=smoothing_coef)
-        print("Coef=%f, median Kendall tau=%f" % (
-            smoothing_coef,
-            median_tau))
-        results[smoothing_coef] = median_tau
+        for exponent in args.similarity_exponents:
+            taus, aucs, f1_scores = evaluate_synthetic_data(
+                true_data=allele_datasets,
+                curried_allele_similarities=curried_sims_dict,
+                smoothing_coef=smoothing_coef,
+                exponent=exponent,
+                max_ic50=args.max_ic50)
+            median_tau = np.median(taus)
+            median_f1 = np.median(f1_scores)
+            median_auc = np.median(aucs)
+            print(
+                "Exp=%f, Coef=%f, tau=%0.4f, AUC = %0.4f, F1 = %0.4f" % (
+                    exponent,
+                    smoothing_coef,
+                    median_tau,
+                    median_auc,
+                    median_f1))
+            scores = (median_tau, median_auc, median_f1)
+            results[(exponent, smoothing_coef)] = scores
     print("===")
-    (best_coef, best_tau) = max(results.items(), key=lambda x: x[1])
-    print("Best coef = %f (tau = %f)" % (best_coef, best_tau))
+    ((best_exponent, best_coef), (median_tau, median_auc, median_f1)) = max(
+        results.items(),
+        key=lambda x: x[1][0] * x[1][1] * x[1][2])
+    print("Best exponent = %f, coef = %f (tau=%0.4f, AUC=%0.4f, F1=%0.4f)" % (
+        best_exponent,
+        best_coef,
+        median_tau,
+        median_auc,
+        median_f1))
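Note on the evaluation above: the driver now grid-searches every (similarity exponent, smoothing coefficient) pair and keeps the pair maximizing the product of the median Kendall tau, ROC AUC, and F1 across alleles. As a rough, self-contained sketch of those three metrics on the IC50 scale (score_predictions is a hypothetical helper, not part of the commit; it assumes nM units and the 500 nM binder cutoff used above, and orients the AUC score so that stronger binders score higher, whereas the commit passes the regression outputs directly):

import numpy as np
import sklearn.metrics
from scipy import stats

def score_predictions(true_ic50, predicted_ic50, threshold=500.0):
    true_ic50 = np.asarray(true_ic50, dtype=float)
    predicted_ic50 = np.asarray(predicted_ic50, dtype=float)
    # rank agreement between predicted and measured affinities
    tau, _ = stats.kendalltau(predicted_ic50, true_ic50)
    true_label = true_ic50 <= threshold          # binder iff IC50 <= 500 nM
    predicted_label = predicted_ic50 <= threshold
    # negate IC50 so that stronger binders receive higher scores
    auc = sklearn.metrics.roc_auc_score(true_label, -predicted_ic50)
    f1 = sklearn.metrics.f1_score(true_label, predicted_label)
    return tau, auc, f1

# perfect ranking, one borderline peptide misclassified
print(score_predictions(
    true_ic50=[50.0, 400.0, 20000.0],
    predicted_ic50=[80.0, 600.0, 15000.0]))
# -> (1.0, 1.0, 0.666...)

As in the diff, both binder classes must be present for AUC and F1 to be defined, which is why the commit skips the degenerate alleles with a continue.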
@@ -93,6 +93,8 @@ def synthesize_affinities_for_single_allele(
         total = 0.0
         denom = 0.0
         for (allele, y, sample_weight) in affinities:
+            if allele in exclude_alleles:
+                continue
             sim = similarities.get(allele, 0)
             if sim == 0:
                 continue
...
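The guard added above makes exclude_alleles effective inside the accumulation loop, so during leave-one-allele-out evaluation the target allele's own measurements cannot leak into its synthetic affinities. A minimal sketch of the surrounding computation as the --similarity-exponents help text describes it (y_ip * sim(i,j) ** exponent, with a smoothing term for poorly supported peptides); the function name and the placement of the smoothing term in the denominator are assumptions, not the repo's exact code:

def synthesize_affinity(
        affinities,       # [(allele, y, sample_weight), ...] for one peptide
        similarities,     # allele -> similarity to the target allele
        exclude_alleles=(),
        exponent=2.0,
        smoothing=0.005):
    total = 0.0
    denom = 0.0
    for (allele, y, sample_weight) in affinities:
        if allele in exclude_alleles:
            continue  # the guard above: skip the held-out allele's own data
        sim = similarities.get(allele, 0)
        if sim == 0:
            continue
        combined_weight = sim ** exponent * sample_weight
        total += combined_weight * y
        denom += combined_weight
    # smoothing shrinks estimates built on little total weight toward 0
    return total / (denom + smoothing)

# leave-one-allele-out usage: the target allele's measurement is skipped
print(synthesize_affinity(
    affinities=[
        ("HLA-A0201", 0.9, 1.0),   # held-out target allele
        ("HLA-A0206", 0.6, 1.0),
        ("HLA-A6802", 0.5, 0.5),
    ],
    similarities={"HLA-A0201": 1.0, "HLA-A0206": 0.8, "HLA-A6802": 0.4},
    exclude_alleles=["HLA-A0201"]))

Raising the exponent makes distant alleles contribute vanishingly little, which is exactly the knob the new --similarity-exponents grid explores.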
@@ -24,9 +24,12 @@ import numpy as np
 from .common import normalize_allele_name
 from .peptide_encoding import (
-    fixed_length_index_encoding,
+    index_encoding,
     indices_to_hotshot_encoding,
 )
+from .fixed_length_peptides import (
+    fixed_length_from_many_peptides
+)

 AlleleData = namedtuple(
     "AlleleData",
@@ -266,21 +269,44 @@ def load_allele_datasets(
         # convert from a Pandas column to a list, since that's easier to
         # interact with later
         raw_peptides = list(raw_peptides)
-        # convert numberical values from a Pandas column to arrays
-        ic50 = np.array(group[ic50_column_name])
-        Y = np.array(group["regression_output"])
+        # create dictionaries of outputs from which we can look up values
+        # after peptides have been expanded
+        ic50_dict = {
+            peptide: ic50
+            for (peptide, ic50)
+            in zip(raw_peptides, group[ic50_column_name])
+        }
+        Y_dict = {
+            peptide: y
+            for (peptide, y)
+            in zip(raw_peptides, group["regression_output"])
+        }

-        X_index, original_peptides, counts = fixed_length_index_encoding(
-            peptides=raw_peptides,
-            desired_length=peptide_length)
+        fixed_length_peptides, original_peptides, subsequence_counts = \
+            fixed_length_from_many_peptides(
+                peptides=raw_peptides,
+                desired_length=peptide_length,
+                start_offset_shorten=0,
+                end_offset_shorten=0,
+                start_offset_extend=0,
+                end_offset_extend=0)
+        n_samples = len(fixed_length_peptides)
+        assert n_samples == len(original_peptides), \
+            "Mismatch between # of samples (%d) and # of peptides (%d)" % (
+                n_samples, len(original_peptides))
+        assert n_samples == len(subsequence_counts), \
+            "Mismatch between # of samples (%d) and # of counts (%d)" % (
+                n_samples, len(subsequence_counts))
+        X_index = index_encoding(fixed_length_peptides, peptide_length)
         X_binary = indices_to_hotshot_encoding(X_index, n_indices=20)
         assert X_binary.shape[0] == X_index.shape[0], \
             ("Mismatch between number of samples for index encoding (%d)"
              " vs. binary encoding (%d)") % (
                 X_binary.shape[0],
                 X_index.shape[0])
-        n_samples = X_binary.shape[0]

         if flatten_binary_encoding:
             # collapse 3D input into 2D matrix
@@ -288,16 +314,27 @@ def load_allele_datasets(
             X_binary = X_binary.reshape((n_samples, n_binary_features))

         # easier to work with counts when they're an array instead of list
-        counts = np.array(counts)
+        subsequence_counts = np.array(subsequence_counts)
+        Y = np.array([Y_dict[p] for p in original_peptides])
+        ic50 = np.array([ic50_dict[p] for p in original_peptides])
+        assert n_samples == len(Y), \
+            "Mismatch between # peptides %d and # regression outputs %d" % (
+                n_samples, len(Y))
+        assert n_samples == len(ic50), \
+            "Mismatch between # of peptides %d and # IC50 outputs %d" % (
+                n_samples, len(ic50))
         allele_groups[allele] = AlleleData(
             X_index=X_index,
             X_binary=X_binary,
             Y=Y,
             ic50=ic50,
-            peptides=raw_peptides,
+            peptides=fixed_length_peptides,
             original_peptides=original_peptides,
             original_lengths=[len(peptide) for peptide in original_peptides],
-            substring_counts=counts,
-            weights=1.0 / counts)
+            substring_counts=subsequence_counts,
+            weights=1.0 / subsequence_counts)
     return allele_groups
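The substantive change to load_allele_datasets: IC50 and regression outputs used to be read column-wise from the Pandas group, which stops being valid once peptides are expanded to a fixed length, since one original row can yield many samples. Keying the outputs by the original peptide and looking them up through the original_peptides provenance list keeps X, Y, and ic50 aligned sample-for-sample. A toy illustration of the pattern (expand_to_fixed_length is a hypothetical stand-in for fixed_length_from_many_peptides, and per-original expansion counts stand in for subsequence_counts):

import numpy as np

def expand_to_fixed_length(peptide, desired_length):
    # toy: keep exact-length peptides; shorten (desired_length + 1)-mers
    # by deleting each position in turn; ignore other lengths
    if len(peptide) == desired_length:
        return [peptide]
    if len(peptide) == desired_length + 1:
        return [peptide[:i] + peptide[i + 1:] for i in range(len(peptide))]
    return []

raw_peptides = ["ACDEFGHIK", "ACDEFGHIKL"]        # a 9-mer and a 10-mer
Y_dict = {"ACDEFGHIK": 0.9, "ACDEFGHIKL": 0.2}    # keyed by original peptide

fixed_length_peptides, original_peptides = [], []
for p in raw_peptides:
    expanded = expand_to_fixed_length(p, desired_length=9)
    fixed_length_peptides.extend(expanded)
    original_peptides.extend([p] * len(expanded))  # provenance per sample

# outputs re-associated after expansion, as in the hunk above
Y = np.array([Y_dict[p] for p in original_peptides])
# one original expanded into k samples contributes weight 1/k per sample
counts = np.array([original_peptides.count(p) for p in original_peptides],
                  dtype=float)
weights = 1.0 / counts
print(len(fixed_length_peptides), Y[:3], weights[:3])
# -> 11 [0.9 0.2 0.2] [1.  0.1 0.1]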
@@ -12,6 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from __future__ import (
+    print_function,
+    division,
+    absolute_import,
+)
 import itertools
 import logging
@@ -217,6 +222,7 @@ def fixed_length_from_many_peptides(
                 start_offset=start_offset_shorten,
                 end_offset=end_offset_shorten,
                 alphabet=alphabet)
+
         n_fixed_length = len(fixed_length_peptides)
         all_fixed_length_peptides.extend(fixed_length_peptides)
         original_peptide_sequences.extend([peptide] * n_fixed_length)
...
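The extend([peptide] * n_fixed_length) bookkeeping shown above is what keeps peptides and original_peptides aligned index-for-index in the resulting AlleleData. The evaluation change at the top of this commit relies on that alignment to drop "true" affinities that were themselves derived by shortening or elongating another sample; a toy version of that filter:

peptides = ["ACDEFGHIK", "CDEFGHIKL", "ACDEFGHIL"]
original_peptides = ["ACDEFGHIK", "ACDEFGHIKL", "ACDEFGHIKL"]

# keep only peptides observed directly, not derived from a longer sequence
directly_measured = {
    peptide
    for (peptide, original) in zip(peptides, original_peptides)
    if peptide == original
}
print(directly_measured)  # {'ACDEFGHIK'}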