"""
Idea:

- take an allele where MS vs. no-MS trained predictors are very different. One
  possibility is DLA-88*501:01 but human would be better
- generate synthetic multi-allele MS by combining single-allele MS for
  different alleles, including the selected allele
- train presentation predictor based on the no-ms pan-allele models on this
  synthetic dataset
- see if the pan-allele predictor learns the "correct" motif for the selected
  allele, i.e. updates to become more similar to the with-ms pan allele
  predictor.
"""

import logging
logging.getLogger('tensorflow').disabled = True
logging.getLogger('matplotlib').disabled = True

import pandas
import argparse
import sys
import copy
import os
from numpy.testing import assert_, assert_equal, assert_allclose
from nose.tools import assert_greater, assert_less
import numpy
from random import shuffle

from sklearn.metrics import roc_auc_score

from mhcflurry import Class1AffinityPredictor, Class1NeuralNetwork
from mhcflurry.allele_encoding import MultipleAlleleEncoding
from mhcflurry.class1_presentation_neural_network import Class1PresentationNeuralNetwork
from mhcflurry.encodable_sequences import EncodableSequences
from mhcflurry.downloads import get_path
from mhcflurry.regression_target import from_ic50
from mhcflurry.common import random_peptides, positional_frequency_matrix
from mhcflurry.testing_utils import cleanup, startup
from mhcflurry.amino_acid import COMMON_AMINO_ACIDS
from mhcflurry.custom_loss import MultiallelicMassSpecLoss

# Sort for a deterministic amino-acid ordering within this module.
COMMON_AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS)

# Module-level fixtures populated by setup() and cleared by teardown().
PAN_ALLELE_PREDICTOR_NO_MASS_SPEC = None
PAN_ALLELE_MOTIFS_WITH_MASS_SPEC_DF = None
PAN_ALLELE_MOTIFS_NO_MASS_SPEC_DF = None

def data_path(name):
    '''
    Return the absolute path to a file in the test/data directory.
    The name specified should be relative to test/data.
    '''
    here = os.path.dirname(__file__)
    return os.path.join(here, "data", name)


def setup():
    """Load the shared pan-allele predictor and motif matrices used by tests."""
    global PAN_ALLELE_PREDICTOR_NO_MASS_SPEC
    global PAN_ALLELE_MOTIFS_WITH_MASS_SPEC_DF
    global PAN_ALLELE_MOTIFS_NO_MASS_SPEC_DF
    startup()

    def load_frequency_matrices(models_dir):
        # Per-allele positional frequency matrices shipped with the models.
        return pandas.read_csv(
            get_path(
                "models_class1_pan",
                models_dir + "/frequency_matrices.csv.bz2"))

    PAN_ALLELE_PREDICTOR_NO_MASS_SPEC = Class1AffinityPredictor.load(
        get_path("models_class1_pan", "models.no_mass_spec"),
        optimization_level=0,
        max_models=1)
    PAN_ALLELE_MOTIFS_WITH_MASS_SPEC_DF = load_frequency_matrices(
        "models.with_mass_spec")
    PAN_ALLELE_MOTIFS_NO_MASS_SPEC_DF = load_frequency_matrices(
        "models.no_mass_spec")


def teardown():
    """Clear the module-level fixtures and restore global test state."""
    global PAN_ALLELE_PREDICTOR_NO_MASS_SPEC
    global PAN_ALLELE_MOTIFS_WITH_MASS_SPEC_DF
    global PAN_ALLELE_MOTIFS_NO_MASS_SPEC_DF
    PAN_ALLELE_PREDICTOR_NO_MASS_SPEC = \
        PAN_ALLELE_MOTIFS_WITH_MASS_SPEC_DF = \
        PAN_ALLELE_MOTIFS_NO_MASS_SPEC_DF = None
    cleanup()


def scramble_peptide(peptide):
    """Return a random permutation of the residues of `peptide`."""
    residues = list(peptide)
    shuffle(residues)
    return "".join(residues)


def evaluate_loss(loss, y_true, y_pred):
    """
    Evaluate a Keras loss function on concrete arrays and return the result.

    1-d inputs are reshaped to column vectors. Supports the tensorflow and
    theano Keras backends; any other backend raises ValueError.
    """
    import keras.backend as K

    backend = K.backend()
    if backend not in ("tensorflow", "theano"):
        raise ValueError("Unsupported backend: %s" % backend)

    y_true = numpy.array(y_true)
    y_pred = numpy.array(y_pred)
    if y_true.ndim == 1:
        y_true = y_true.reshape((len(y_true), 1))
    if y_pred.ndim == 1:
        y_pred = y_pred.reshape((len(y_pred), 1))

    y_true_var = K.constant(y_true, name="y_true")
    y_pred_var = K.constant(y_pred, name="y_pred")
    result = loss(y_true_var, y_pred_var)
    if backend == "tensorflow":
        return result.eval(session=K.get_session())
    return result.eval()


def test_loss():
    """
    Check MultiallelicMassSpecLoss against two reference implementations.

    For each delta, computes the expected hinge-style loss with (1) an
    explicit double loop over (hit, decoy) pairs and (2) a vectorized numpy
    formulation, then compares both to the Keras implementation. Yields
    nose-style (assert_func, *args) tuples.

    (An unused `smooth_max` helper was removed from this test.)
    """
    for delta in [0.0, 0.3]:
        print("delta", delta)
        # Hit labels: 1.0 = hit, 0.0 = decoy, -1.0 = ignored by the loss.
        y_true = [
            1.0,
            0.0,
            1.0,
            -1.0,  # ignored
            1.0,
            0.0
        ]
        y_true = numpy.array(y_true)
        y_pred = [
            [0.3, 0.7, 0.5],
            [0.2, 0.4, 0.6],
            [0.1, 0.5, 0.3],
            [0.9, 0.1, 0.2],
            [0.1, 0.7, 0.1],
            [0.8, 0.2, 0.4],
        ]
        y_pred = numpy.array(y_pred)

        # Reference implementation 1: explicit loop over (hit, decoy) pairs.
        # Each decoy prediction within `delta` of the hit's best (max)
        # prediction contributes a squared penalty.
        contributions = []
        for i in range(len(y_true)):
            if y_true[i] == 1.0:
                for j in range(len(y_true)):
                    if y_true[j] == 0.0:
                        tightest_i = max(y_pred[i])
                        contribution = sum(
                            max(0, y_pred[j, k] - tightest_i + delta)**2
                            for k in range(y_pred.shape[1])
                        )
                        contributions.append(contribution)
        contributions = numpy.array(contributions)
        expected1 = contributions.sum() / len(contributions)

        # Reference implementation 2: vectorized numpy.
        pos = numpy.array([
            max(y_pred[i])
            for i in range(len(y_pred))
            if y_true[i] == 1.0
        ])
        neg = y_pred[(y_true == 0.0).astype(bool)]
        expected2 = (
                numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum() / (
            len(pos) * len(neg))

        yield numpy.testing.assert_almost_equal, expected1, expected2, 4

        computed = evaluate_loss(
            MultiallelicMassSpecLoss(delta=delta).loss,
            y_true,
            y_pred.reshape(y_pred.shape + (1,)))

        yield numpy.testing.assert_almost_equal, computed, expected1, 4


# Background amino acid frequency distribution used when sampling random
# peptides. The values below sum to 1.0. (A stray debug print of this Series
# at import time was removed.)
AA_DIST = pandas.Series(
    dict((line.split()[0], float(line.split()[1])) for line in """
A    0.071732
E    0.060102
N    0.034679
D    0.039601
T    0.055313
L    0.115337
V    0.070498
S    0.071882
Q    0.040436
F    0.050178
G    0.053176
C    0.005429
H    0.025487
I    0.056312
W    0.013593
K    0.057832
M    0.021079
Y    0.043372
R    0.060330
P    0.053632
""".strip().split("\n")))


def make_random_peptides(num_peptides_per_length=10000, lengths=(9,)):
    """
    Generate random peptides drawn from the AA_DIST background distribution.

    Parameters
    ----------
    num_peptides_per_length : int
        Number of peptides to generate for each requested length.
    lengths : sequence of int
        Peptide lengths to generate. The default is now an immutable tuple
        rather than a mutable list (a Python default-argument pitfall).

    Returns
    -------
    EncodableSequences
    """
    peptides = []
    for length in lengths:
        peptides.extend(
            random_peptides(
                num_peptides_per_length, length=length, distribution=AA_DIST))
    return EncodableSequences.create(peptides)


def make_motif(allele, peptides, frac=0.01):
    """
    Build a positional frequency matrix for `allele` from the strongest
    predicted binders among `peptides`.

    The top `frac` fraction of peptides by predicted affinity (lowest IC50,
    i.e. tightest binding) is used to compute the motif.
    """
    encodable = EncodableSequences.create(peptides)
    predictions_df = pandas.DataFrame({"peptide": encodable.sequences})
    predictions_df["prediction"] = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.predict(
        peptides=encodable,
        allele=allele,
    )
    predictions_df = predictions_df.sort_values("prediction", ascending=True)
    num_top = int(len(predictions_df) * frac)
    strongest_binders = predictions_df.iloc[:num_top]
    return positional_frequency_matrix(strongest_binders.peptide.values)


def test_real_data_multiallelic_refinement(max_epochs=10):
    """
    Test presentation-predictor refinement on real multiallelic mass spec.

    Builds a training set from MS hits of one sample (RA957) plus decoys
    drawn from another sample (CD165), combines it with pan-allele affinity
    measurements for the relevant alleles, and fits a
    Class1PresentationNeuralNetwork. Before fitting, verifies that the
    presentation model's affinity outputs match the wrapped pan-allele model.

    Fixes applied: the pre-fit comparison previously used `alleles`, a
    variable leaked from an earlier for-loop (an accident waiting to become a
    NameError / shape mismatch); it now uses the encoding width explicitly.
    A leftover `ipdb.set_trace()` at the end was removed.
    """
    ms_df = pandas.read_csv(
        get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2"))
    # Keep class I hits that have a protein assignment.
    ms_df = ms_df.loc[
        (ms_df.mhc_class == "I") & (~ms_df.protein_ensembl.isnull())].copy()

    # One row per sample; drop any column that varies within a sample.
    sample_table = ms_df.drop_duplicates(
        "sample_id").set_index("sample_id").loc[ms_df.sample_id.unique()]
    grouped = ms_df.groupby("sample_id").nunique()
    for col in sample_table.columns:
        if (grouped[col] > 1).any():
            del sample_table[col]
    sample_table["alleles"] = sample_table.hla.str.split()

    # Hits: unique peptides observed in sample RA957.
    multi_train_hit_df = ms_df.loc[
        ms_df.sample_id == "RA957"
    ].drop_duplicates("peptide")[["peptide", "sample_id"]].reset_index(drop=True)
    multi_train_hit_df["label"] = 1.0

    # Decoys: peptides from a different sample (CD165) not seen in the hits.
    multi_train_decoy_df = ms_df.loc[
        (ms_df.sample_id == "CD165") &
        (~ms_df.peptide.isin(multi_train_hit_df.peptide.unique()))
    ].drop_duplicates("peptide")[["peptide"]]
    # All hits come from a single sample; assign the decoys to it too.
    (multi_train_decoy_df["sample_id"],) = multi_train_hit_df.sample_id.unique()
    multi_train_decoy_df["label"] = 0.0

    multi_train_df = pandas.concat(
        [multi_train_hit_df, multi_train_decoy_df], ignore_index=True)
    multi_train_df["is_affinity"] = False

    # Union of the allele lists of all samples appearing in the MS data.
    multi_train_alleles = set()
    for sample_alleles in sample_table.loc[
            multi_train_df.sample_id.unique()].alleles:
        multi_train_alleles.update(sample_alleles)
    multi_train_alleles = sorted(multi_train_alleles)

    pan_train_df = pandas.read_csv(
        get_path(
            "models_class1_pan", "models.with_mass_spec/train_data.csv.bz2"))

    pan_sub_train_df = pan_train_df.loc[
        pan_train_df.allele.isin(multi_train_alleles),
        ["peptide", "allele", "measurement_inequality", "measurement_value"]
    ]
    pan_sub_train_df["label"] = pan_sub_train_df["measurement_value"]
    del pan_sub_train_df["measurement_value"]
    pan_sub_train_df["is_affinity"] = True

    pan_predictor = Class1AffinityPredictor.load(
        get_path("models_class1_pan", "models.with_mass_spec"),
        optimization_level=0,
        max_models=1)

    # Width of the allele dimension of the encoding; also used to interpret
    # the flattened per-allele predictions below.
    max_alleles = sample_table.alleles.str.len().max()

    allele_encoding = MultipleAlleleEncoding(
        experiment_names=multi_train_df.sample_id.values,
        experiment_to_allele_list=sample_table.alleles.to_dict(),
        max_alleles_per_experiment=max_alleles,
        allele_to_sequence=pan_predictor.allele_to_sequence,
    )
    allele_encoding.append_alleles(pan_sub_train_df.allele.values)
    allele_encoding = allele_encoding.compact()

    combined_train_df = pandas.concat([multi_train_df, pan_sub_train_df])

    presentation_predictor = Class1PresentationNeuralNetwork(
        pan_predictor,
        auxiliary_input_features=[],
        max_ensemble_size=1,
        max_epochs=max_epochs,
        learning_rate=0.0001,
        patience=5,
        min_delta=0.0,
        random_negative_rate=1.0)

    pre_predictions = from_ic50(
        presentation_predictor.predict(
            output="affinities",
            peptides=combined_train_df.peptide.values,
            alleles=allele_encoding))

    # Before any fitting, the presentation model's affinity head must agree
    # with the wrapped pan-allele model.
    (model,) = pan_predictor.class1_pan_allele_models
    expected_pre_predictions = from_ic50(
        model.predict(
            peptides=numpy.repeat(
                combined_train_df.peptide.values, max_alleles),
            allele_encoding=allele_encoding.allele_encoding,
    )).reshape((-1, max_alleles))[:,0]

    assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)

    motifs_history = []
    random_peptides_encodable = make_random_peptides(10000, [9])

    def update_motifs():
        # Snapshot the current motif for each training allele so refinement
        # can be inspected after the fact.
        for allele in multi_train_alleles:
            motif = make_motif(allele, random_peptides_encodable)
            motifs_history.append((allele, motif))

    print("Pre fitting:")
    update_motifs()
    print("Fitting...")

    presentation_predictor.fit(
        peptides=combined_train_df.peptide.values,
        labels=combined_train_df.label.values,
        allele_encoding=allele_encoding,
        affinities_mask=combined_train_df.is_affinity.values,
        inequalities=combined_train_df.measurement_inequality.values,
        progress_callback=update_motifs,
    )


def test_synthetic_allele_refinement_with_affinity_data(max_epochs=10):
    """
    Test allele refinement on synthetic multiallelic mass spec combined with
    quantitative affinity data.

    Single-allele MS hits for six alleles are pooled into one synthetic
    multiallelic "experiment" (with scrambled-peptide decoys), combined with
    affinity measurements, and used to fit a presentation network wrapping
    the no-mass-spec pan-allele predictor. Motifs and metrics are recorded
    at each progress callback.

    Returns (predictor, predictions, metrics, motifs).
    """
    refine_allele = "HLA-C*01:02"
    alleles = [
        "HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01",
        "HLA-A*03:01", "HLA-B*15:01", refine_allele
    ]
    peptides_per_allele = [
        2000, 1000, 500,
        1500, 1200, 800,
    ]

    allele_to_peptides = dict(zip(alleles, peptides_per_allele))

    length = 9

    train_with_ms = pandas.read_csv(
        get_path("data_curated", "curated_training_data.with_mass_spec.csv.bz2"))
    train_no_ms = pandas.read_csv(get_path("data_curated",
        "curated_training_data.no_mass_spec.csv.bz2"))

    def filter_df(df):
        # Restrict to 9-mers for the six selected alleles.
        df = df.loc[
            (df.allele.isin(alleles)) &
            (df.peptide.str.len() == length)
        ]
        return df

    train_with_ms = filter_df(train_with_ms)
    train_no_ms = filter_df(train_no_ms)

    # Peptides supported only by mass spec (absent from the affinity-only
    # data).
    ms_specific = train_with_ms.loc[
        ~train_with_ms.peptide.isin(train_no_ms.peptide)
    ]

    # Sample per-allele hits to build the synthetic multiallelic experiment.
    train_peptides = []
    train_true_alleles = []
    for allele in alleles:
        peptides = ms_specific.loc[ms_specific.allele == allele].peptide.sample(
            n=allele_to_peptides[allele])
        train_peptides.extend(peptides)
        train_true_alleles.extend([allele] * len(peptides))

    hits_df = pandas.DataFrame({"peptide": train_peptides})
    hits_df["true_allele"] = train_true_alleles
    hits_df["hit"] = 1.0

    # Decoys: scrambled versions of the hit peptides.
    decoys_df = hits_df.copy()
    decoys_df["peptide"] = decoys_df.peptide.map(scramble_peptide)
    decoys_df["true_allele"] = ""
    decoys_df["hit"] = 0.0

    mms_train_df = pandas.concat([hits_df, decoys_df], ignore_index=True)
    mms_train_df["label"] =  mms_train_df.hit
    mms_train_df["is_affinity"] = False

    affinity_train_df = pandas.read_csv(
        get_path(
            "models_class1_pan", "models.with_mass_spec/train_data.csv.bz2"))
    affinity_train_df = affinity_train_df.loc[
        affinity_train_df.allele.isin(alleles),
        ["peptide", "allele",  "measurement_inequality", "measurement_value"]]

    affinity_train_df["label"] = affinity_train_df["measurement_value"]
    del affinity_train_df["measurement_value"]
    affinity_train_df["is_affinity"] = True

    predictor = Class1PresentationNeuralNetwork(
        PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
        auxiliary_input_features=["gene"],
        max_ensemble_size=1,
        max_epochs=max_epochs,
        learning_rate=0.0001,
        patience=5,
        min_delta=0.0,
        random_negative_rate=1.0,
        random_negative_constant=25)

    # All multiallelic rows belong to a single six-allele experiment.
    mms_allele_encoding = MultipleAlleleEncoding(
        experiment_names=["experiment1"] * len(mms_train_df),
        experiment_to_allele_list={
            "experiment1": alleles,
        },
        max_alleles_per_experiment=6,
        allele_to_sequence=PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.allele_to_sequence,
    )
    # Extend a copy of the encoding with the (single-allele) affinity rows.
    allele_encoding = copy.deepcopy(mms_allele_encoding)
    allele_encoding.append_alleles(affinity_train_df.allele.values)
    allele_encoding = allele_encoding.compact()

    train_df = pandas.concat(
        [mms_train_df, affinity_train_df], ignore_index=True, sort=False)

    pre_predictions = from_ic50(
        predictor.predict(
            output="affinities_matrix",
            peptides=mms_train_df.peptide.values,
            alleles=mms_allele_encoding))

    # Sanity check: before fitting, the presentation network's affinity
    # outputs must match the wrapped pan-allele model.
    (model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
    expected_pre_predictions = from_ic50(
        model.predict(
            peptides=numpy.repeat(mms_train_df.peptide.values, len(alleles)),
            allele_encoding=mms_allele_encoding.allele_encoding,
    )).reshape((-1, len(alleles)))

    mms_train_df["pre_max_prediction"] = pre_predictions.max(1)
    pre_auc = roc_auc_score(mms_train_df.hit.values, mms_train_df.pre_max_prediction.values)
    print("PRE_AUC", pre_auc)

    assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)

    motifs_history = []
    random_peptides_encodable = make_random_peptides(10000, [9])


    def update_motifs():
        # Snapshot the current motif for each training allele.
        for allele in alleles:
            motif = make_motif(allele, random_peptides_encodable)
            motifs_history.append((allele, motif))

    metric_rows = []

    def progress():
        # Progress callback: compute metrics for both network outputs.
        (_, presentation_prediction, affinities_predictions) = (
            predictor.predict(
                output="all",
                peptides=mms_train_df.peptide.values,
                alleles=mms_allele_encoding))
        affinities_predictions = from_ic50(affinities_predictions)
        for (kind, predictions) in [
                ("affinities", affinities_predictions),
                ("presentation", presentation_prediction)]:

            mms_train_df["max_prediction"] = predictions.max(1)
            mms_train_df["predicted_allele"] = pandas.Series(alleles).loc[
                predictions.argmax(1).flatten()
            ].values

            print(kind)
            print(predictions)

            mean_predictions_for_hit = mms_train_df.loc[
                mms_train_df.hit == 1.0
            ].max_prediction.mean()
            mean_predictions_for_decoy = mms_train_df.loc[
                mms_train_df.hit == 0.0
            ].max_prediction.mean()
            correct_allele_fraction = (
                    mms_train_df.loc[mms_train_df.hit == 1.0].predicted_allele ==
                    mms_train_df.loc[mms_train_df.hit == 1.0].true_allele
            ).mean()
            auc = roc_auc_score(mms_train_df.hit.values, mms_train_df.max_prediction.values)

            print(kind, "Mean prediction for hit", mean_predictions_for_hit)
            print(kind, "Mean prediction for decoy", mean_predictions_for_decoy)
            print(kind, "Correct predicted allele fraction", correct_allele_fraction)
            print(kind, "AUC", auc)

            metric_rows.append((
                kind,
                mean_predictions_for_hit,
                mean_predictions_for_decoy,
                correct_allele_fraction,
                auc,
            ))

            update_motifs()

        # NOTE: `auc` carries the value from the last loop iteration
        # ("presentation").
        return (presentation_prediction, auc)

    print("Pre fitting:")
    progress()
    update_motifs()
    print("Fitting...")

    predictor.fit(
        peptides=train_df.peptide.values,
        labels=train_df.label.values,
        inequalities=train_df.measurement_inequality.values,
        affinities_mask=train_df.is_affinity.values,
        allele_encoding=allele_encoding,
        progress_callback=progress,
    )

    (predictions, final_auc) = progress()
    print("Final AUC", final_auc)

    update_motifs()

    motifs = pandas.DataFrame(
        motifs_history,
        columns=[
            "allele",
            "motif",
        ]
    )

    metrics = pandas.DataFrame(
        metric_rows,
        columns=[
            "output",
            "mean_predictions_for_hit",
            "mean_predictions_for_decoy",
            "correct_allele_fraction",
            "auc"
        ])

    return (predictor, predictions, metrics, motifs)
def test_synthetic_allele_refinement(max_epochs=10):
    """
    Test refinement of HLA-C*01:02 prediction using synthetic multiallelic
    mass spec (mass spec only, no affinity rows).

    Single-allele MS hits for six alleles are pooled into one synthetic
    multiallelic experiment with scrambled-peptide decoys, and a presentation
    network wrapping the no-mass-spec pan-allele predictor is fit to it.

    Returns (predictor, predictions, metrics, motifs).

    Fix applied: the `motifs` DataFrame construction was truncated (its
    columns list and call were never closed, a syntax error); it is closed
    here exactly as in the sibling test above.
    """
    refine_allele = "HLA-C*01:02"
    alleles = [
        "HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01",
        "HLA-A*03:01", "HLA-B*15:01", refine_allele
    ]
    peptides_per_allele = [
        2000, 1000, 500,
        1500, 1200, 800,
    ]

    allele_to_peptides = dict(zip(alleles, peptides_per_allele))

    length = 9

    train_with_ms = pandas.read_csv(
        get_path("data_curated", "curated_training_data.with_mass_spec.csv.bz2"))
    train_no_ms = pandas.read_csv(get_path("data_curated",
        "curated_training_data.no_mass_spec.csv.bz2"))

    def filter_df(df):
        # Restrict to 9-mers for the six selected alleles.
        df = df.loc[
            (df.allele.isin(alleles)) &
            (df.peptide.str.len() == length)
        ]
        return df

    train_with_ms = filter_df(train_with_ms)
    train_no_ms = filter_df(train_no_ms)

    # Peptides supported only by mass spec.
    ms_specific = train_with_ms.loc[
        ~train_with_ms.peptide.isin(train_no_ms.peptide)
    ]

    # Sample per-allele hits to build the synthetic multiallelic experiment.
    train_peptides = []
    train_true_alleles = []
    for allele in alleles:
        peptides = ms_specific.loc[ms_specific.allele == allele].peptide.sample(
            n=allele_to_peptides[allele])
        train_peptides.extend(peptides)
        train_true_alleles.extend([allele] * len(peptides))

    hits_df = pandas.DataFrame({"peptide": train_peptides})
    hits_df["true_allele"] = train_true_alleles
    hits_df["hit"] = 1.0

    # Decoys: scrambled versions of the hit peptides.
    decoys_df = hits_df.copy()
    decoys_df["peptide"] = decoys_df.peptide.map(scramble_peptide)
    decoys_df["true_allele"] = ""
    decoys_df["hit"] = 0.0

    train_df = pandas.concat([hits_df, decoys_df], ignore_index=True)

    predictor = Class1PresentationNeuralNetwork(
        PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
        max_ensemble_size=1,
        max_epochs=max_epochs,
        learning_rate=0.0001,
        patience=5,
        min_delta=0.0,
        random_negative_rate=0.0,
        random_negative_constant=0)

    # All rows belong to a single six-allele experiment.
    allele_encoding = MultipleAlleleEncoding(
        experiment_names=["experiment1"] * len(train_df),
        experiment_to_allele_list={
            "experiment1": alleles,
        },
        max_alleles_per_experiment=6,
        allele_to_sequence=PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.allele_to_sequence,
    ).compact()

    pre_predictions = from_ic50(
        predictor.predict(
            output="affinities_matrix",
            peptides=train_df.peptide.values,
            alleles=allele_encoding))

    # Sanity check: before fitting, the presentation network's affinity
    # outputs must match the wrapped pan-allele model.
    (model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
    expected_pre_predictions = from_ic50(
        model.predict(
            peptides=numpy.repeat(train_df.peptide.values, len(alleles)),
            allele_encoding=allele_encoding.allele_encoding,
    )).reshape((-1, len(alleles)))

    train_df["pre_max_prediction"] = pre_predictions.max(1)
    pre_auc = roc_auc_score(train_df.hit.values, train_df.pre_max_prediction.values)
    print("PRE_AUC", pre_auc)

    assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)

    motifs_history = []
    random_peptides_encodable = make_random_peptides(10000, [9])

    def update_motifs():
        # Snapshot the current motif for each allele.
        for allele in alleles:
            motif = make_motif(allele, random_peptides_encodable)
            motifs_history.append((allele, motif))

    metric_rows = []

    def progress():
        # Progress callback: compute metrics for both network outputs.
        (_, presentation_prediction, affinities_predictions) = (
            predictor.predict(
                output="all",
                peptides=train_df.peptide.values,
                alleles=allele_encoding))
        affinities_predictions = from_ic50(affinities_predictions)
        for (kind, predictions) in [
                ("affinities", affinities_predictions),
                ("presentation", presentation_prediction)]:

            train_df["max_prediction"] = predictions.max(1)
            train_df["predicted_allele"] = pandas.Series(alleles).loc[
                predictions.argmax(1).flatten()
            ].values

            print(kind)
            print(predictions)

            mean_predictions_for_hit = train_df.loc[
                train_df.hit == 1.0
            ].max_prediction.mean()
            mean_predictions_for_decoy = train_df.loc[
                train_df.hit == 0.0
            ].max_prediction.mean()
            correct_allele_fraction = (
                    train_df.loc[train_df.hit == 1.0].predicted_allele ==
                    train_df.loc[train_df.hit == 1.0].true_allele
            ).mean()
            auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)

            print(kind, "Mean prediction for hit", mean_predictions_for_hit)
            print(kind, "Mean prediction for decoy", mean_predictions_for_decoy)
            print(kind, "Correct predicted allele fraction", correct_allele_fraction)
            print(kind, "AUC", auc)

            metric_rows.append((
                kind,
                mean_predictions_for_hit,
                mean_predictions_for_decoy,
                correct_allele_fraction,
                auc,
            ))

            update_motifs()

        # NOTE: `auc` carries the value from the last loop iteration
        # ("presentation").
        return (presentation_prediction, auc)

    print("Pre fitting:")
    progress()
    update_motifs()
    print("Fitting...")

    predictor.fit(
        peptides=train_df.peptide.values,
        labels=train_df.hit.values,
        allele_encoding=allele_encoding,
        progress_callback=progress,
    )

    (predictions, final_auc) = progress()
    print("Final AUC", final_auc)

    update_motifs()

    motifs = pandas.DataFrame(
        motifs_history,
        columns=[
            "allele",
            "motif",
        ]
    )

    metrics = pandas.DataFrame(
        metric_rows,
        columns=[
            "output",
            "mean_predictions_for_hit",
            "mean_predictions_for_decoy",
            "correct_allele_fraction",
            "auc"
        ])

    return (predictor, predictions, metrics, motifs)
def test_batch_generator(sample_rate=0.1):
    """
    Check that fitting constructs a batch generator with a reasonable number
    of equivalence classes.

    Runs zero training epochs (max_epochs=0); only the batch plan produced
    by fit() is inspected.
    """
    multi_train_df = pandas.read_csv(
        data_path("multiallelic_ms.benchmark1.csv.bz2"))
    multi_train_df["label"] = multi_train_df.hit
    multi_train_df["is_affinity"] = False

    # One row per sample (taken from hit rows); drop any column that varies
    # within a sample.
    sample_table = multi_train_df.loc[
        multi_train_df.label == True
    ].drop_duplicates("sample_id").set_index("sample_id").loc[
        multi_train_df.sample_id.unique()
    ]
    grouped = multi_train_df.groupby("sample_id").nunique()
    for col in sample_table.columns:
        if (grouped[col] > 1).any():
            del sample_table[col]
    sample_table["alleles"] = sample_table.hla.str.split()

    pan_train_df = pandas.read_csv(
        get_path(
            "models_class1_pan", "models.with_mass_spec/train_data.csv.bz2"))
    # Copy so the column edits below do not mutate pan_train_df in place
    # (previously this was a plain alias).
    pan_sub_train_df = pan_train_df.copy()
    pan_sub_train_df["label"] = pan_sub_train_df["measurement_value"]
    del pan_sub_train_df["measurement_value"]
    pan_sub_train_df["is_affinity"] = True

    # Subsample both datasets to keep the test fast.
    pan_sub_train_df = pan_sub_train_df.sample(frac=sample_rate)
    multi_train_df = multi_train_df.sample(frac=sample_rate)

    pan_predictor = Class1AffinityPredictor.load(
        get_path("models_class1_pan", "models.with_mass_spec"),
        optimization_level=0,
        max_models=1)

    allele_encoding = MultipleAlleleEncoding(
        experiment_names=multi_train_df.sample_id.values,
        experiment_to_allele_list=sample_table.alleles.to_dict(),
        max_alleles_per_experiment=sample_table.alleles.str.len().max(),
        allele_to_sequence=pan_predictor.allele_to_sequence,
    )
    allele_encoding.append_alleles(pan_sub_train_df.allele.values)
    allele_encoding = allele_encoding.compact()

    combined_train_df = pandas.concat(
        [multi_train_df, pan_sub_train_df], ignore_index=True, sort=True)

    presentation_predictor = Class1PresentationNeuralNetwork(
        pan_predictor,
        auxiliary_input_features=[],
        max_ensemble_size=1,
        max_epochs=0,
        batch_generator_batch_size=128,
        learning_rate=0.0001,
        patience=5,
        min_delta=0.0,
        random_negative_rate=1.0)

    fit_results = presentation_predictor.fit(
        peptides=combined_train_df.peptide.values,
        labels=combined_train_df.label.values,
        allele_encoding=allele_encoding,
        affinities_mask=combined_train_df.is_affinity.values,
        inequalities=combined_train_df.measurement_inequality.values,
    )

    batch_generator = fit_results['batch_generator']
    train_batch_plan = batch_generator.train_batch_plan

    # The plan should group examples into a moderate number of equivalence
    # classes: more than 100, fewer than 1000.
    assert_greater(len(train_batch_plan.equivalence_class_labels), 100)
    assert_less(len(train_batch_plan.equivalence_class_labels), 1000)


# Command-line interface used when this module is executed directly (see the
# __main__ guard at the bottom of the file).
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
    "--out-metrics-csv",
    default=None,
    help="Metrics output")
parser.add_argument(
    "--out-motifs-pickle",
    default=None,
    # Fixed copy-paste: this argument writes motifs, not metrics.
    help="Motifs output")
parser.add_argument(
    "--max-epochs",
    default=100,
    type=int,
    help="Max epochs")
if __name__ == '__main__':
    # If run directly from python, leave the user in a shell to explore results.
    setup()
    args = parser.parse_args(sys.argv[1:])

    # Run the main synthetic refinement experiment.
    (predictor, predictions, metrics, motifs) = (
        test_synthetic_allele_refinement(max_epochs=args.max_epochs))

    # Optionally persist results for later analysis.
    if args.out_metrics_csv:
        metrics.to_csv(args.out_metrics_csv)
    if args.out_motifs_pickle:
        motifs.to_pickle(args.out_motifs_pickle)

    # Leave in ipython
    import ipdb  # pylint: disable=import-error
    ipdb.set_trace()