Commit d47ba9ca authored by Tim O'Donnell

fixes

parent 06b6ecc6
@@ -10,7 +10,6 @@ import pandas
 
 from .hyperparameters import HyperparameterDefaults
 from .encodable_sequences import EncodableSequences, EncodingError
-from .amino_acid import available_vector_encodings, vector_encoding_length
 from .regression_target import to_ic50, from_ic50
 from .common import random_peptides, amino_acid_distribution
 from .custom_loss import get_loss
@@ -156,7 +155,6 @@ class Class1NeuralNetwork(object):
             hyperparameters[to_name] = value
         return hyperparameters
 
-
     def __init__(self, **hyperparameters):
         self.hyperparameters = self.hyperparameter_defaults.with_defaults(
             self.apply_hyperparameter_renames(hyperparameters))
@@ -420,7 +418,6 @@ class Class1NeuralNetwork(object):
                 allele_encoding.allele_representations(
                     self.hyperparameters['allele_amino_acid_encoding']))
 
-
     def fit_generator(
             self,
             generator,
@@ -544,7 +541,6 @@ class Class1NeuralNetwork(object):
         fit_info["num_points"] = yielded_values_box[0]
         self.fit_info.append(dict(fit_info))
 
-
     def fit(
             self,
             peptides,
...
@@ -232,19 +232,31 @@ def run(argv=sys.argv[1:]):
             chunksize=1)
 
     models_by_fold = {}
+    summary_dfs = []
     for result in tqdm.tqdm(results, total=len(work_items)):
         pprint(result)
         fold_num = result['fold_num']
+        (all_models_for_fold, _) = folds_to_predictors[fold_num]
         models = [
-            folds_to_predictors[fold_num][0][i]
+            all_models_for_fold[i]
             for i in result['selected_indices']
         ]
+        summary_df = result['summary'].copy()
+        summary_df.index = summary_df.index.map(
+            lambda idx: all_models_for_fold[idx])
+        summary_dfs.append(summary_df)
         print("Selected %d models for fold %d: %s" % (
             len(models), fold_num, result['selected_indices']))
         models_by_fold[fold_num] = models
         for model in models:
             result_predictor.add_pan_allele_model(model)
 
+    summary_df = pandas.concat(summary_dfs, ignore_index=False)
+    summary_df["model_config"] = summary_df.index.map(lambda m: m.get_config())
+    result_predictor.metadata_dataframes["model_selection_summary"] = (
+        summary_df.reset_index(drop=True))
+
     result_predictor.save(args.out_models_dir)
 
     model_selection_time = time.time() - start
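The new summary bookkeeping above re-keys each fold's per-model score frame by the model objects themselves before concatenating, so per-model metadata can be attached at the end. A minimal sketch of that pattern, with a hypothetical Model stand-in (mhcflurry models expose get_config(); the scores and names here are made up):

import pandas

class Model(object):
    def __init__(self, name):
        self.name = name

    def get_config(self):
        return {"name": self.name}

# One hypothetical fold with three candidate models.
all_models_for_fold = [Model("m0"), Model("m1"), Model("m2")]
summary_df = pandas.DataFrame({"mse_score": [0.12, 0.34, 0.29]})

# Re-key the per-fold summary by model object rather than integer index.
summary_df.index = summary_df.index.map(lambda idx: all_models_for_fold[idx])

# After concatenating per-fold summaries, map over the model-object index
# to attach each model's config as a column.
combined = pandas.concat([summary_df], ignore_index=False)
combined["model_config"] = combined.index.map(lambda m: m.get_config())
print(combined)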
@@ -312,11 +324,16 @@ def model_select(fold_num, models, min_models, max_models):
             break
 
     assert selected
 
+    summary_df = pandas.Series(individual_model_scores)[
+        numpy.arange(len(models))
+    ].to_frame()
+    summary_df.columns = ['mse_score']
+
     return {
         'fold_num': fold_num,
         'selected_indices': selected,
-        'individual_model_scores': pandas.Series(
-            individual_model_scores)[numpy.arange(len(models))],
+        'summary': summary_df,  # indexed by model index
     }
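The summary_df construction relies on a pandas idiom worth unpacking: building a Series from the score mapping and label-indexing it with numpy.arange(len(models)) yields the scores in model-index order before .to_frame() produces the one-column summary. A minimal sketch with fabricated scores:

import numpy
import pandas

# Hypothetical per-model MSE scores, keyed by model index in arbitrary order.
individual_model_scores = {2: 0.29, 0: 0.12, 1: 0.34}
num_models = 3

# Label-indexing with arange(num_models) returns the scores ordered by
# model index; .to_frame() makes the one-column summary returned above.
summary_df = pandas.Series(individual_model_scores)[
    numpy.arange(num_models)
].to_frame()
summary_df.columns = ['mse_score']
print(summary_df)
#    mse_score
# 0       0.12
# 1       0.34
# 2       0.29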
...
@@ -15,8 +15,6 @@ from functools import partial
 import numpy
 import pandas
 import yaml
-from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.model_selection import StratifiedKFold
 from mhcnames import normalize_allele_name
 import tqdm  # progress bar
 tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
@@ -28,11 +26,8 @@ from .parallelism import (
     add_worker_pool_args,
     worker_pool_with_gpu_assignments_from_args,
     call_wrapped_kwargs)
-from .hyperparameters import HyperparameterDefaults
 from .allele_encoding import AlleleEncoding
 from .encodable_sequences import EncodableSequences
-from .regression_target import to_ic50, from_ic50
-from .import custom_loss
 
 
 # To avoid pickling large matrices to send to child processes when running in
@@ -173,10 +168,6 @@ def assign_folds(df, num_folds, held_out_fraction, held_out_max):
 
     print("Test points per fold")
     print((~result_df).sum())
 
-    result_df["allele"] = df["allele"]
-    result_df["peptide"] = df["peptide"]
-
     return result_df
 
@@ -304,10 +295,13 @@ def main(args):
     predictor = Class1AffinityPredictor(
         allele_to_sequence=allele_encoding.allele_to_sequence,
         metadata_dataframes={
-            'train_data': df,
-            'training_folds': folds_df,
+            'train_data': pandas.merge(
+                df,
+                folds_df,
+                left_index=True,
+                right_index=True)
         })
-    serial_run = args.num_jobs == 1
+    serial_run = args.num_jobs == 0
 
     work_items = []
     for (h, hyperparameters) in enumerate(hyperparameters_lst):
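The metadata change above merges the fold assignments into train_data on the shared row index, which is also why assign_folds no longer needs to copy the allele and peptide columns into its result. A minimal sketch of the merge, with fabricated rows:

import pandas

# Hypothetical training data and per-fold membership flags, sharing a row index.
df = pandas.DataFrame({
    "allele": ["HLA-A*02:01", "HLA-B*07:02"],
    "peptide": ["SIINFEKLM", "AAAWYLWEV"],
    "measurement_value": [120.0, 5300.0],
})
folds_df = pandas.DataFrame({
    "fold_0": [True, False],
    "fold_1": [False, True],
})

# Index-on-index merge: each training measurement now carries its fold flags.
train_data = pandas.merge(df, folds_df, left_index=True, right_index=True)
print(train_data)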
@@ -353,6 +347,7 @@ def main(args):
 
     if worker_pool:
         print("Processing %d work items in parallel." % len(work_items))
+        assert not serial_run
 
         results_generator = worker_pool.imap_unordered(
             partial(call_wrapped_kwargs, train_model),
@@ -389,6 +384,7 @@ def main(args):
         # which it adds models to, so no merging is required. It also saves
         # as it goes so no saving is required at the end.
         print("Processing %d work items in serial." % len(work_items))
+        assert serial_run
         for _ in tqdm.trange(len(work_items)):
             item = work_items.pop(0)  # want to keep freeing up memory
             work_predictor = train_model(**item)
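The two asserts pair with the serial_run redefinition above: per this commit, num_jobs == 0 now means in-process execution, and each branch guards against being reached in the wrong mode. A minimal sketch of that guarded dispatch, with hypothetical helpers (run_one stands in for train_model; worker_pool would come from worker_pool_with_gpu_assignments_from_args in the real script):

def run_one(item):
    return item  # stand-in for train_model

def dispatch(work_items, num_jobs, worker_pool=None):
    # num_jobs == 0 means run everything in-process (serial).
    serial_run = num_jobs == 0
    if worker_pool:
        # A pool should never be paired with serial mode.
        assert not serial_run
        return list(worker_pool.imap_unordered(run_one, work_items))
    assert serial_run
    return [run_one(item) for item in work_items]

print(dispatch([{"fold": 0}, {"fold": 1}], num_jobs=0))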
...