# test_batch_generator.py

import logging
# Disable the matplotlib and tensorflow loggers before the remaining imports
# run (presumably to keep their log output out of the test run).
for _logger_name in ('matplotlib', 'tensorflow'):
    logging.getLogger(_logger_name).disabled = True

import os
import collections
import time
import cProfile
import pstats

import pandas
import numpy

from mhcflurry.allele_encoding import MultipleAlleleEncoding
from mhcflurry.downloads import get_path
from mhcflurry.batch_generator import (
    MultiallelicMassSpecBatchGenerator)
from mhcflurry.regression_target import to_ic50
from mhcflurry import Class1AffinityPredictor

from numpy.testing import assert_equal


def data_path(name):
    """
    Build the path to a test fixture file.

    ``name`` is interpreted relative to the ``data`` directory that sits
    next to this module; the joined path is returned as a string.
    """
    module_dir = os.path.dirname(__file__)
    fixtures_dir = os.path.join(module_dir, "data")
    return os.path.join(fixtures_dir, name)


def test_basic():
    """
    Plan batches for a ten-row toy dataset and check batch composition.

    The first four rows are flagged as affinity rows (``affinities_mask``
    is True); the remaining six rows belong to the experiments "exp1" and
    "exp2". Every batch that contains non-affinity rows must draw them from
    exactly one experiment.
    """
    hyperparameters = {
        "batch_generator_validation_split": 0.2,
        "batch_generator_batch_size": 10,
        "batch_generator_affinity_fraction": 0.5,
    }
    planner = MultiallelicMassSpecBatchGenerator(
        hyperparameters=hyperparameters)

    exp1_alleles = ["HLA-A*03:01", "HLA-B*07:02", "HLA-C*02:01"]
    exp2_alleles = ["HLA-A*02:01", "HLA-B*27:01", "HLA-C*02:01"]

    # Affinity rows carry a single allele padded with None to three slots.
    single_allele_rows = [
        [allele, None, None]
        for allele in ["HLA-A*02:01"] * 2 + ["HLA-A*03:01"] * 2
    ]

    columns = {
        "affinities_mask": [True] * 4 + [False] * 6,
        "experiment_names": [None] * 4 + ["exp1"] * 2 + ["exp2"] * 4,
        "alleles_matrix": (
            single_allele_rows + [exp1_alleles] * 2 + [exp2_alleles] * 4),
        "is_binder": [True, True, False, False] + [True, False] * 3,
    }
    df = pandas.DataFrame(columns)

    # Each DataFrame column is passed to plan() as a keyword argument.
    planner.plan(**df.to_dict("list"))
    print(planner.summary())

    train_batches, test_batches = planner.get_train_and_test_generators(
        x_dict={"idx": numpy.arange(len(df))},
        y_list=[])

    # Record, for every row index a batch yields, which split and which
    # batch number it came from.
    labelled = zip(("train", "test"), (train_batches, test_batches))
    for kind, batches in labelled:
        for batch_num, (x_item, _) in enumerate(batches):
            row_ids = x_item["idx"]
            df.loc[row_ids, "kind"] = kind
            df.loc[row_ids, "idx"] = row_ids
            df.loc[row_ids, "batch"] = batch_num
    for column in ("idx", "batch"):
        df[column] = df[column].astype(int)
    print(df)

    for _, batch_df in df.groupby(["kind", "batch"]):
        if batch_df.affinities_mask.all():
            # Pure affinity batch: nothing to check.
            continue
        # A batch with mass-spec rows must take them from one experiment.
        ms_rows = batch_df.loc[~batch_df.affinities_mask]
        assert_equal(ms_rows.experiment_names.nunique(), 1)


def test_large(sample_rate=0.01):
    """
    Plan batches for a random sample of benchmark data and check the batches.

    Combines rows from the multiallelic mass-spec benchmark file in the test
    data directory with rows from the downloaded pan-allele training data,
    plans batches while profiling the planning step, and asserts that every
    batch containing non-affinity rows takes them from exactly one
    ``sample_id``.

    Parameters
    ----------
    sample_rate : float
        Fraction of rows drawn (via ``DataFrame.sample(frac=...)``) from each
        of the two input tables before planning.
    """
    multi_train_df = pandas.read_csv(
        data_path("multiallelic_ms.benchmark1.csv.bz2"))
    multi_train_df["label"] = multi_train_df.hit
    multi_train_df["is_affinity"] = False

    # One row per sample_id, taken from rows whose label is true and ordered
    # by first appearance of each sample_id in the full table.
    sample_table = multi_train_df.loc[
        multi_train_df.label == True
    ].drop_duplicates("sample_id").set_index("sample_id").loc[
        multi_train_df.sample_id.unique()
    ]

    # Drop every column that takes more than one value within some sample, so
    # only per-sample constants remain.
    grouped = multi_train_df.groupby("sample_id").nunique()
    for col in list(sample_table.columns):
        if (grouped[col] > 1).any():
            del sample_table[col]
    sample_table["alleles"] = sample_table.hla.str.split()

    pan_train_df = pandas.read_csv(
        get_path(
            "models_class1_pan", "models.with_mass_spec/train_data.csv.bz2"))
    pan_sub_train_df = pan_train_df
    pan_sub_train_df["label"] = pan_sub_train_df["measurement_value"]
    del pan_sub_train_df["measurement_value"]
    pan_sub_train_df["is_affinity"] = True

    pan_sub_train_df = pan_sub_train_df.sample(frac=sample_rate)
    multi_train_df = multi_train_df.sample(frac=sample_rate)

    pan_predictor = Class1AffinityPredictor.load(
        get_path("models_class1_pan", "models.with_mass_spec"),
        optimization_level=0,
        max_models=1)

    # The encoding is built from the mass-spec rows first and the affinity
    # alleles are appended after, matching the row order of the concat below.
    allele_encoding = MultipleAlleleEncoding(
        experiment_names=multi_train_df.sample_id.values,
        experiment_to_allele_list=sample_table.alleles.to_dict(),
        max_alleles_per_experiment=sample_table.alleles.str.len().max(),
        allele_to_sequence=pan_predictor.allele_to_sequence,
    )
    allele_encoding.append_alleles(pan_sub_train_df.allele.values)
    allele_encoding = allele_encoding.compact()

    combined_train_df = pandas.concat(
        [multi_train_df, pan_sub_train_df], ignore_index=True, sort=True)

    print("Total size", combined_train_df)

    planner = MultiallelicMassSpecBatchGenerator(
        hyperparameters=dict(
            batch_generator_validation_split=0.2,
            batch_generator_batch_size=128,
            batch_generator_affinity_fraction=0.5))

    start = time.time()
    profiler = cProfile.Profile()
    profiler.enable()
    planner.plan(
        affinities_mask=combined_train_df.is_affinity.values,
        experiment_names=combined_train_df.sample_id.values,
        alleles_matrix=allele_encoding.alleles,
        is_binder=numpy.where(
            combined_train_df.is_affinity.values,
            combined_train_df.label.values,
            to_ic50(combined_train_df.label.values)) < 1000.0)
    # Stop profiling and take the timing right away so that neither includes
    # the cost of printing the reports below.
    profiler.disable()
    planning_seconds = time.time() - start

    stats = pstats.Stats(profiler)
    stats.sort_stats("cumtime").reverse_order().print_stats()
    print(planner.summary())
    print("Planning took [sec]: ", planning_seconds)

    (train_iter, test_iter) = planner.get_train_and_test_generators(
        x_dict={
            "idx": numpy.arange(len(combined_train_df)),
        },
        y_list=[])

    # Record, for every row index a batch yields, which split and which batch
    # number it came from.
    for (kind, it) in [("train", train_iter), ("test", test_iter)]:
        for (i, (x_item, _)) in enumerate(it):
            idx = x_item["idx"]
            combined_train_df.loc[idx, "kind"] = kind
            combined_train_df.loc[idx, "idx"] = idx
            combined_train_df.loc[idx, "batch"] = i
    combined_train_df["idx"] = combined_train_df.idx.astype(int)
    combined_train_df["batch"] = combined_train_df.batch.astype(int)

    grouped_batches = combined_train_df.groupby(["kind", "batch"])
    for (_, batch_df) in grouped_batches:
        if not batch_df.is_affinity.all():
            # A batch with mass-spec rows must take them from one sample.
            assert_equal(
                batch_df.loc[
                    ~batch_df.is_affinity
                ].sample_id.nunique(), 1)