Skip to content
Snippets Groups Projects
Commit d47ba9ca authored by Tim O'Donnell
Browse files

fixes

parent 06b6ecc6
No related branches found
No related tags found
No related merge requests found
......@@ -10,7 +10,6 @@ import pandas
from .hyperparameters import HyperparameterDefaults
from .encodable_sequences import EncodableSequences, EncodingError
from .amino_acid import available_vector_encodings, vector_encoding_length
from .regression_target import to_ic50, from_ic50
from .common import random_peptides, amino_acid_distribution
from .custom_loss import get_loss
......@@ -156,7 +155,6 @@ class Class1NeuralNetwork(object):
hyperparameters[to_name] = value
return hyperparameters
def __init__(self, **hyperparameters):
self.hyperparameters = self.hyperparameter_defaults.with_defaults(
self.apply_hyperparameter_renames(hyperparameters))
......@@ -420,7 +418,6 @@ class Class1NeuralNetwork(object):
allele_encoding.allele_representations(
self.hyperparameters['allele_amino_acid_encoding']))
def fit_generator(
self,
generator,
......@@ -544,7 +541,6 @@ class Class1NeuralNetwork(object):
fit_info["num_points"] = yielded_values_box[0]
self.fit_info.append(dict(fit_info))
def fit(
self,
peptides,
......
......@@ -232,19 +232,31 @@ def run(argv=sys.argv[1:]):
chunksize=1)
models_by_fold = {}
summary_dfs = []
for result in tqdm.tqdm(results, total=len(work_items)):
pprint(result)
fold_num = result['fold_num']
(all_models_for_fold, _) = folds_to_predictors[fold_num]
models = [
folds_to_predictors[fold_num][0][i]
all_models_for_fold[i]
for i in result['selected_indices']
]
summary_df = result['summary'].copy()
summary_df.index = summary_df.index.map(
lambda idx: all_models_for_fold[idx])
summary_dfs.append(summary_df)
print("Selected %d models for fold %d: %s" % (
len(models), fold_num, result['selected_indices']))
models_by_fold[fold_num] = models
for model in models:
result_predictor.add_pan_allele_model(model)
summary_df = pandas.concat(summary_dfs, ignore_index=False)
summary_df["model_config"] = summary_df.index.map(lambda m: m.get_config())
result_predictor.metadata_dataframes["model_selection_summary"] = (
summary_df.reset_index(drop=True))
result_predictor.save(args.out_models_dir)
model_selection_time = time.time() - start
......@@ -312,11 +324,16 @@ def model_select(fold_num, models, min_models, max_models):
break
assert selected
summary_df = pandas.Series(individual_model_scores)[
numpy.arange(len(models))
].to_frame()
summary_df.columns = ['mse_score']
return {
'fold_num': fold_num,
'selected_indices': selected,
'individual_model_scores': pandas.Series(
individual_model_scores)[numpy.arange(len(models))],
'summary': summary_df, # indexed by model index
}
......
......@@ -15,8 +15,6 @@ from functools import partial
import numpy
import pandas
import yaml
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import StratifiedKFold
from mhcnames import normalize_allele_name
import tqdm # progress bar
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
......@@ -28,11 +26,8 @@ from .parallelism import (
add_worker_pool_args,
worker_pool_with_gpu_assignments_from_args,
call_wrapped_kwargs)
from .hyperparameters import HyperparameterDefaults
from .allele_encoding import AlleleEncoding
from .encodable_sequences import EncodableSequences
from .regression_target import to_ic50, from_ic50
from .import custom_loss
# To avoid pickling large matrices to send to child processes when running in
......@@ -173,10 +168,6 @@ def assign_folds(df, num_folds, held_out_fraction, held_out_max):
print("Test points per fold")
print((~result_df).sum())
result_df["allele"] = df["allele"]
result_df["peptide"] = df["peptide"]
return result_df
......@@ -304,10 +295,13 @@ def main(args):
predictor = Class1AffinityPredictor(
allele_to_sequence=allele_encoding.allele_to_sequence,
metadata_dataframes={
'train_data': df,
'training_folds': folds_df,
'train_data': pandas.merge(
df,
folds_df,
left_index=True,
right_index=True)
})
serial_run = args.num_jobs == 1
serial_run = args.num_jobs == 0
work_items = []
for (h, hyperparameters) in enumerate(hyperparameters_lst):
......@@ -353,6 +347,7 @@ def main(args):
if worker_pool:
print("Processing %d work items in parallel." % len(work_items))
assert not serial_run
results_generator = worker_pool.imap_unordered(
partial(call_wrapped_kwargs, train_model),
......@@ -389,6 +384,7 @@ def main(args):
# which it adds models to, so no merging is required. It also saves
# as it goes so no saving is required at the end.
print("Processing %d work items in serial." % len(work_items))
assert serial_run
for _ in tqdm.trange(len(work_items)):
item = work_items.pop(0) # want to keep freeing up memory
work_predictor = train_model(**item)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment