Skip to content
Snippets Groups Projects
Commit ab85ec98 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

fixes

parent 4dbe94d4
No related merge requests found
......@@ -458,6 +458,9 @@ class Class1NeuralNetwork(object):
"""
import keras
from keras import backend as K
fit_info = collections.defaultdict(list)
loss = get_loss(self.hyperparameters['loss'])
......@@ -478,6 +481,13 @@ class Class1NeuralNetwork(object):
network._make_predict_function()
self.set_allele_representations(allele_representations)
if self.hyperparameters['learning_rate'] is not None:
K.set_value(
self.network().optimizer.lr,
self.hyperparameters['learning_rate'])
fit_info["learning_rate"] = float(
K.get_value(self.network().optimizer.lr))
validation_x_dict = {
'peptide': self.peptides_to_network_input(
validation_peptide_encoding),
......@@ -513,7 +523,8 @@ class Class1NeuralNetwork(object):
yielded_values_box[0] += len(affinities)
start = time.time()
result = network.fit_generator(
fit_history = network.fit_generator(
wrapped_generator(),
steps_per_epoch=steps_per_epoch,
epochs=epochs,
......@@ -526,10 +537,12 @@ class Class1NeuralNetwork(object):
patience=patience,
verbose=verbose)]
)
if verbose > 0:
print("fit_generator completed in %0.2f sec (%d total points)" % (
time.time() - start, yielded_values_box[0]))
return result
for (key, value) in fit_history.history.items():
fit_info[key].extend(value)
fit_info["time"] = time.time() - start
fit_info["num_points"] = yielded_values_box[0]
self.fit_info.append(dict(fit_info))
def fit(
......@@ -585,8 +598,10 @@ class Class1NeuralNetwork(object):
How often (in seconds) to print progress update. Set to None to
disable.
"""
from keras import backend as K
encodable_peptides = EncodableSequences.create(peptides)
peptide_encoding = self.peptides_to_network_input(encodable_peptides)
fit_info = collections.defaultdict(list)
length_counts = (
pandas.Series(encodable_peptides.sequences)
......@@ -687,10 +702,11 @@ class Class1NeuralNetwork(object):
loss=loss.loss, optimizer=self.hyperparameters['optimizer'])
if self.hyperparameters['learning_rate'] is not None:
from keras import backend as K
K.set_value(
self.network().optimizer.lr,
self.hyperparameters['learning_rate'])
fit_info["learning_rate"] = float(
K.get_value(self.network().optimizer.lr))
if loss.supports_inequalities:
# Do not sample negative affinities: just use an inequality.
......@@ -762,7 +778,6 @@ class Class1NeuralNetwork(object):
min_val_loss_iteration = None
min_val_loss = None
fit_info = collections.defaultdict(list)
start = time.time()
last_progress_print = None
x_dict_with_random_negatives = {}
......
......@@ -43,13 +43,6 @@ parser.add_argument(
help=(
"Model selection data CSV. Expected columns: "
"allele, peptide, measurement_value"))
parser.add_argument(
"--exclude-data",
metavar="FILE.csv",
required=False,
help=(
"Data to EXCLUDE from model selection. Useful to specify the original "
"training data used"))
parser.add_argument(
"--models-dir",
metavar="DIR",
......@@ -60,24 +53,6 @@ parser.add_argument(
metavar="DIR",
required=True,
help="Directory to write selected models")
parser.add_argument(
"--out-unselected-predictions",
metavar="FILE.csv",
help="Write predictions for validation data using unselected predictor to "
"FILE.csv")
parser.add_argument(
"--unselected-accuracy-scorer",
metavar="SCORER",
default="combined:mass-spec,mse")
parser.add_argument(
"--unselected-accuracy-scorer-num-samples",
type=int,
default=1000)
parser.add_argument(
"--unselected-accuracy-percentile-threshold",
type=float,
metavar="X",
default=95)
parser.add_argument(
"--min-models",
type=int,
......@@ -122,15 +97,14 @@ def run(argv=sys.argv[1:]):
print("Loaded: %s" % input_predictor)
alleles = input_predictor.supported_alleles
(min_peptide_length, max_peptide_length) = (
input_predictor.supported_peptide_lengths)
metadata_dfs = {}
df = pandas.read_csv(args.data)
print("Loaded data: %s" % (str(df.shape)))
(min_peptide_length, max_peptide_length) = (
input_predictor.supported_peptide_lengths)
df = df.ix[
df = df.loc[
(df.peptide.str.len() >= min_peptide_length) &
(df.peptide.str.len() <= max_peptide_length)
]
......@@ -141,26 +115,10 @@ def run(argv=sys.argv[1:]):
# Allele names in data are assumed to be already normalized.
df = df.loc[df.allele.isin(alleles)].dropna()
print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
if args.exclude_data:
exclude_df = pandas.read_csv(args.exclude_data)
metadata_dfs["model_selection_exclude"] = exclude_df
print("Loaded exclude data: %s" % (str(df.shape)))
df["_key"] = df.allele + "__" + df.peptide
exclude_df["_key"] = exclude_df.allele + "__" + exclude_df.peptide
df["_excluded"] = df._key.isin(exclude_df._key.unique())
print("Excluding measurements per allele (counts): ")
print(df.groupby("allele")._excluded.sum())
print("Subselected to supported alleles: %s" % str(df.shape))
print("Excluding measurements per allele (fractions): ")
print(df.groupby("allele")._excluded.mean())
df = df.loc[~df._excluded]
del df["_excluded"]
del df["_key"]
print("Reduced data to: %s" % (str(df.shape)))
print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
metadata_dfs["model_selection_data"] = df
......@@ -168,101 +126,9 @@ def run(argv=sys.argv[1:]):
args.mass_spec_regex)
if args.out_unselected_predictions:
df["unselected_prediction"] = input_predictor.predict(
alleles=df.allele.values,
peptides=df.peptide.values)
df.to_csv(args.out_unselected_predictions)
print("Wrote: %s" % args.out_unselected_predictions)
selectors = {}
selector_to_model_selection_kwargs = {}
def make_selector(
        scoring,
        combined_min_contribution_percent=args.combined_min_contribution_percent):
    """
    Instantiate (and memoize) a model selector for the given scoring scheme.

    Parameters
    ----------
    scoring : string
        One of "mse", "mass-spec", "consensus", or a composite of the form
        "combined:<name1>,<name2>,..." which recursively builds the named
        component selectors and wraps them in a CombinedModelSelector.
    combined_min_contribution_percent : float
        Passed through to CombinedModelSelector. Defaults to the
        command-line value; callers pass 0.0 to force all components to run.

    Returns
    -------
    (selector, model_selection_kwargs) : tuple
        The selector instance and the min/max model-count kwargs to use
        with it.

    Raises
    ------
    ValueError
        If `scoring` is not a supported scheme.

    NOTE(review): relies on closure variables from the enclosing run():
    `args`, `selectors`, `selector_to_model_selection_kwargs`, `df`,
    `input_predictor`.
    """
    # Memoized: return the previously built selector for this scheme.
    if scoring in selectors:
        return (
            selectors[scoring], selector_to_model_selection_kwargs[scoring])
    start = time.time()
    if scoring.startswith("combined:"):
        model_selection_kwargs = {
            'min_models': args.combined_min_models,
            'max_models': args.combined_max_models,
        }
        # Recursively build each comma-separated component selector named
        # after the "combined:" prefix; keep only the selector (index 0),
        # not its kwargs.
        component_selectors = []
        for component_selector in scoring.split(":", 1)[1].split(","):
            component_selectors.append(
                make_selector(
                    component_selector)[0])
        selector = CombinedModelSelector(
            component_selectors,
            min_contribution_percent=combined_min_contribution_percent)
    elif scoring == "mse":
        model_selection_kwargs = {
            'min_models': args.mse_min_models,
            'max_models': args.mse_max_models,
        }
        min_measurements = args.mse_min_measurements
        # MSE selection uses only non-mass-spec (affinity) measurements.
        selector = MSEModelSelector(
            df=df.loc[~df.mass_spec],
            predictor=input_predictor,
            min_measurements=min_measurements)
    elif scoring == "mass-spec":
        # Mass-spec selection uses only mass-spec hits.
        mass_spec_df = df.loc[df.mass_spec]
        model_selection_kwargs = {
            'min_models': args.mass_spec_min_models,
            'max_models': args.mass_spec_max_models,
        }
        min_measurements = args.mass_spec_min_measurements
        selector = MassSpecModelSelector(
            df=mass_spec_df,
            predictor=input_predictor,
            min_measurements=min_measurements)
    elif scoring == "consensus":
        model_selection_kwargs = {
            'min_models': args.consensus_min_models,
            'max_models': args.consensus_max_models,
        }
        # Consensus selection needs no measurement data: it scores models
        # against the ensemble's own predictions on random peptides.
        selector = ConsensusModelSelector(
            predictor=input_predictor,
            num_peptides_per_length=args.consensus_num_peptides_per_length)
    else:
        raise ValueError("Unsupported scoring method: %s" % scoring)
    print("Instantiated model selector %s in %0.2f sec." % (
        scoring, time.time() - start))
    return (selector, model_selection_kwargs)
for scoring in args.scoring:
(selector, model_selection_kwargs) = make_selector(scoring)
selectors[scoring] = selector
selector_to_model_selection_kwargs[scoring] = model_selection_kwargs
unselected_accuracy_scorer = None
if args.unselected_accuracy_scorer:
# Force running all selectors by setting combined_min_contribution_percent=0.
unselected_accuracy_scorer = make_selector(
args.unselected_accuracy_scorer,
combined_min_contribution_percent=0.0)[0]
print("Using unselected accuracy scorer: %s" % unselected_accuracy_scorer)
GLOBAL_DATA["unselected_accuracy_scorer"] = unselected_accuracy_scorer
print("Selectors for alleles:")
allele_to_selector = {}
allele_to_model_selection_kwargs = {}
for allele in alleles:
selector = None
for possible_selector in args.scoring:
if selectors[possible_selector].usable_for_allele(allele=allele):
selector = selectors[possible_selector]
print("%20s %s" % (allele, selector.plan_summary(allele)))
break
if selector is None:
raise ValueError("No selectors usable for allele: %s" % allele)
allele_to_selector[allele] = selector
allele_to_model_selection_kwargs[allele] = (
selector_to_model_selection_kwargs[possible_selector])
GLOBAL_DATA["args"] = args
GLOBAL_DATA["input_predictor"] = input_predictor
......
......@@ -9,6 +9,7 @@ import time
import traceback
import random
import pprint
import hashlib
from functools import partial
import numpy
......@@ -130,6 +131,7 @@ add_worker_pool_args(parser)
def assign_folds(df, num_folds, held_out_fraction, held_out_max):
result_df = pandas.DataFrame(index=df.index)
for fold in range(num_folds):
result_df["fold_%d" % fold] = True
for (allele, sub_df) in df.groupby("allele"):
......@@ -172,6 +174,9 @@ def assign_folds(df, num_folds, held_out_fraction, held_out_max):
print("Test points per fold")
print((~result_df).sum())
result_df["allele"] = df["allele"]
result_df["peptide"] = df["peptide"]
return result_df
......@@ -422,8 +427,6 @@ def train_model(
progress_print_interval,
predictor,
save_to):
import keras.backend as K
import keras
df = GLOBAL_DATA["train_data"]
folds_df = GLOBAL_DATA["folds_df"]
......@@ -484,10 +487,10 @@ def train_model(
epochs=pretrain_max_epochs,
verbose=verbose,
)
if model.hyperparameters['learning_rate']:
model.hyperparameters['learning_rate'] /= 10
else:
model.hyperparameters['learning_rate'] = 0.0001
# Use a smaller learning rate for training on real data
learning_rate = model.fit_info[-1]["learning_rate"]
model.hyperparameters['learning_rate'] = learning_rate / 10
model.fit(
peptides=train_peptides,
......@@ -500,6 +503,20 @@ def train_model(
progress_print_interval=progress_print_interval,
verbose=verbose)
# Save model-specific training info
train_peptide_hash = hashlib.sha1()
for peptide in train_data.peptide.values:
train_peptide_hash.update(peptide.encode())
model.fit_info[-1]["training_info"] = {
"fold_num": fold_num,
"num_folds": num_folds,
"replicate_num": replicate_num,
"num_replicates": num_replicates,
"architecture_num": architecture_num,
"num_architectures": num_architectures,
"train_peptide_hash": train_peptide_hash.hexdigest(),
}
numpy.testing.assert_equal(
predictor.manifest_df.shape[0], len(predictor.class1_pan_allele_models))
predictor.add_pan_allele_model(model, models_dir_for_save=save_to)
......
......@@ -101,7 +101,7 @@ HYPERPARAMETERS_LIST = [
][1:]
def run_and_check(n_jobs=0):
def run_and_check(n_jobs=0, delete=True):
models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
hyperparameters_filename = os.path.join(
models_dir, "hyperparameters.yaml")
......@@ -140,8 +140,9 @@ def run_and_check(n_jobs=0):
alleles=["HLA-A*02:01"])
print(df)
print("Deleting: %s" % models_dir)
shutil.rmtree(models_dir)
if delete:
print("Deleting: %s" % models_dir)
shutil.rmtree(models_dir)
if os.environ.get("KERAS_BACKEND") != "theano":
......@@ -153,5 +154,6 @@ if os.environ.get("KERAS_BACKEND") != "theano":
def test_run_serial():
    # Run the end-to-end training/check pipeline in-process (n_jobs=0
    # disables the worker pool), so failures surface directly in this test.
    run_and_check(n_jobs=0)
if __name__ == "__main__":
test_run_serial()
\ No newline at end of file
run_and_check(n_jobs=0, delete=False)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment