Skip to content
Snippets Groups Projects
Commit d6999996 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

working

parent 9c25aa29
No related merge requests found
......@@ -116,13 +116,12 @@ class MultiallelicMassSpecBatchGenerator(object):
def plan_from_dataframe(df, hyperparameters):
affinity_fraction = hyperparameters["batch_generator_affinity_fraction"]
batch_size = hyperparameters["batch_generator_batch_size"]
classes = {}
df["equivalence_class"] = [
classes.setdefault(
tuple(row[["is_affinity", "is_binder", "experiment_name"]]),
len(classes))
for _, row in df.iterrows()
]
equivalence_columns = ["is_affinity", "is_binder", "experiment_name"]
df["equivalence_key"] = df[equivalence_columns].astype(str).sum(1)
equivalence_map = dict(
(v, i)
for (i, v) in zip(*df.equivalence_key.factorize()))
df["equivalence_class"] = df.equivalence_key.map(equivalence_map)
df["first_allele"] = df.alleles.str.get(0)
df["unused"] = True
df["idx"] = df.index
......
......@@ -725,7 +725,8 @@ class Class1LigandomePredictor(object):
axis=0)
if existing_weights_shape != reshaped.shape:
print("Performing network surgery", existing_weights_shape, reshaped.shape)
print(
"Performing network surgery", existing_weights_shape, reshaped.shape)
# Network surgery required. Make a new network with this layer's
# dimensions changed. Kind of a hack.
layer.input_dim = reshaped.shape[0]
......
# Test-module setup: silence noisy third-party loggers, then pull in the
# stdlib, numeric, and mhcflurry names used by the tests below.
import logging
# NOTE(review): these loggers are disabled before matplotlib/tensorflow are
# (transitively) imported, presumably to keep test output quiet — confirm the
# ordering matters before regrouping these imports.
logging.getLogger('matplotlib').disabled = True
logging.getLogger('tensorflow').disabled = True
import os
import collections
import time
import cProfile
import pstats
import pandas
import numpy
from mhcflurry.allele_encoding import MultipleAlleleEncoding
from mhcflurry.downloads import get_path
from mhcflurry.batch_generator import (
    MultiallelicMassSpecBatchGenerator)
from mhcflurry.regression_target import to_ic50
from mhcflurry import Class1AffinityPredictor
from numpy.testing import assert_equal
def data_path(name):
    """Return the absolute path to a file in the test/data directory.

    ``name`` is interpreted relative to test/data.
    """
    here = os.path.dirname(__file__)
    return os.path.join(here, "data", name)
def test_basic():
planner = MultiallelicMassSpecBatchGenerator(
hyperparameters=dict(
......@@ -64,3 +86,95 @@ def test_basic():
#import ipdb;ipdb.set_trace()
def test_large(sample_rate=0.01):
    """End-to-end stress test of MultiallelicMassSpecBatchGenerator.

    Loads a multiallelic mass-spec benchmark and pan-class-I affinity training
    data, sub-samples both by ``sample_rate``, plans batches over the combined
    dataset (profiling the planning step), then verifies the key batch
    invariant: any batch containing mass-spec rows draws them from at most one
    experiment.

    Parameters
    ----------
    sample_rate : float
        Fraction of each dataset to keep (passed to DataFrame.sample).
    """
    multi_train_df = pandas.read_csv(
        data_path("multiallelic_ms.benchmark1.csv.bz2"))
    multi_train_df["label"] = multi_train_df.hit
    multi_train_df["is_affinity"] = False

    # Build a per-sample metadata table: one row per sample_id, keeping only
    # columns that are constant within each sample.
    sample_table = multi_train_df.loc[
        multi_train_df.label == True
    ].drop_duplicates("sample_id").set_index("sample_id").loc[
        multi_train_df.sample_id.unique()
    ]
    grouped = multi_train_df.groupby("sample_id").nunique()
    for col in sample_table.columns:
        if (grouped[col] > 1).any():
            del sample_table[col]
    sample_table["alleles"] = sample_table.hla.str.split()

    pan_train_df = pandas.read_csv(
        get_path(
            "models_class1_pan", "models.with_mass_spec/train_data.csv.bz2"))
    pan_sub_train_df = pan_train_df
    pan_sub_train_df["label"] = pan_sub_train_df["measurement_value"]
    del pan_sub_train_df["measurement_value"]
    pan_sub_train_df["is_affinity"] = True

    # Sub-sample both datasets so the test stays tractable.
    pan_sub_train_df = pan_sub_train_df.sample(frac=sample_rate)
    multi_train_df = multi_train_df.sample(frac=sample_rate)

    pan_predictor = Class1AffinityPredictor.load(
        get_path("models_class1_pan", "models.with_mass_spec"),
        optimization_level=0,
        max_models=1)

    allele_encoding = MultipleAlleleEncoding(
        experiment_names=multi_train_df.sample_id.values,
        experiment_to_allele_list=sample_table.alleles.to_dict(),
        max_alleles_per_experiment=sample_table.alleles.str.len().max(),
        allele_to_sequence=pan_predictor.allele_to_sequence,
    )
    allele_encoding.append_alleles(pan_sub_train_df.allele.values)
    allele_encoding = allele_encoding.compact()

    combined_train_df = pandas.concat(
        [multi_train_df, pan_sub_train_df], ignore_index=True, sort=True)
    # Fix: print the row count, not the entire DataFrame.
    print("Total size", len(combined_train_df))

    planner = MultiallelicMassSpecBatchGenerator(
        hyperparameters=dict(
            batch_generator_validation_split=0.2,
            batch_generator_batch_size=1024,
            batch_generator_affinity_fraction=0.5))

    # Profile the planning step; is_binder is a boolean mask combining
    # affinity measurements (already nM) and mass-spec labels (via to_ic50).
    start = time.time()
    profiler = cProfile.Profile()
    profiler.enable()
    planner.plan(
        affinities_mask=combined_train_df.is_affinity.values,
        experiment_names=combined_train_df.sample_id.values,
        alleles_matrix=allele_encoding.alleles,
        is_binder=numpy.where(
            combined_train_df.is_affinity.values,
            combined_train_df.label.values,
            to_ic50(combined_train_df.label.values)) < 1000.0)
    profiler.disable()  # explicit; pstats.Stats would otherwise do it lazily
    stats = pstats.Stats(profiler)
    stats.sort_stats("cumtime").reverse_order().print_stats()
    print(planner.summary())
    print("Planning took [sec]: ", time.time() - start)

    (train_iter, test_iter) = planner.get_train_and_test_generators(
        x_dict={
            "idx": numpy.arange(len(combined_train_df)),
        },
        y_list=[])

    # Record, for every row, which split and batch the generators assigned it.
    for (kind, it) in [("train", train_iter), ("test", test_iter)]:
        for (i, (x_item, y_item)) in enumerate(it):
            idx = x_item["idx"]
            combined_train_df.loc[idx, "kind"] = kind
            combined_train_df.loc[idx, "idx"] = idx
            combined_train_df.loc[idx, "batch"] = i
    combined_train_df["idx"] = combined_train_df.idx.astype(int)
    combined_train_df["batch"] = combined_train_df.batch.astype(int)

    # Invariant: each batch has at most one multiallelic ms experiment among
    # its non-affinity rows.
    for ((kind, batch_num), batch_df) in combined_train_df.groupby(
            ["kind", "batch"]):
        if not batch_df.is_affinity.all():
            assert_equal(
                batch_df.loc[
                    ~batch_df.is_affinity
                ].sample_id.nunique(), 1)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment