Skip to content
Snippets Groups Projects
Commit 4a2f6a18 authored by Tim O'Donnell
Browse files

fix

parent a351f3ff
No related merge requests found
......@@ -61,7 +61,7 @@ do
--allele-sequences "$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" \
--pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \
--held-out-measurements-per-allele-fraction-and-max 0.25 100 \
--ensemble-size 4 \
--num-folds 4 \
--hyperparameters hyperparameters.yaml \
--out-models-dir $(pwd)/models.${kind} \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
......
......@@ -73,7 +73,7 @@ do
--allele-sequences "$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" \
--pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \
--held-out-measurements-per-allele-fraction-and-max 0.25 100 \
--ensemble-size 4 \
--num-folds 4 \
--hyperparameters hyperparameters.yaml \
--out-models-dir models.${kind} \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
......
......@@ -51,11 +51,6 @@ parser.add_argument(
help=(
"Model selection data CSV. Expected columns: "
"allele, peptide, measurement_value"))
parser.add_argument(
"--folds",
metavar="FILE.csv",
required=False,
help=(""))
parser.add_argument(
"--models-dir",
metavar="DIR",
......@@ -161,19 +156,6 @@ def run(argv=sys.argv[1:]):
metadata_dfs = {}
if args.folds:
folds_df = pandas.read_csv(args.folds)
matches = all([
len(folds_df) == len(df),
(folds_df.peptide == df.peptide).all(),
(folds_df.allele == df.allele).all(),
])
if not matches:
raise ValueError("Training data and fold data do not match")
fold_cols = [c for c in folds_df if c.startswith("fold_")]
for col in fold_cols:
df[col] = folds_df[col]
fold_cols = [c for c in df if c.startswith("fold_")]
num_folds = len(fold_cols)
if num_folds <= 1:
......@@ -193,8 +175,6 @@ def run(argv=sys.argv[1:]):
df = df.loc[df.allele.isin(alleles)].dropna()
print("Subselected to supported alleles: %s" % str(df.shape))
print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
metadata_dfs["model_selection_data"] = df
df["mass_spec"] = df.measurement_source.str.contains(
......@@ -248,13 +228,13 @@ def run(argv=sys.argv[1:]):
if serial_run:
# Serial run
print("Running in serial.")
results = (do_model_select_task(item) for item in work_items)
results = (model_select(**item) for item in work_items)
elif args.cluster_parallelism:
# Run using separate processes HPC cluster.
print("Running on cluster.")
results = cluster_results_from_args(
args,
work_function=do_model_select_task,
work_function=model_select,
work_items=work_items,
constant_data=GLOBAL_DATA,
result_serialization_method="pickle")
......@@ -268,7 +248,9 @@ def run(argv=sys.argv[1:]):
# Parallel run
results = worker_pool.imap_unordered(
do_model_select_task, work_items, chunksize=1)
do_model_select_task,
work_items,
chunksize=1)
models_by_fold = {}
summary_dfs = []
......
......@@ -84,11 +84,11 @@ parser.add_argument(
default=False,
help="Do not use affinity value inequalities even when present in data")
parser.add_argument(
"--ensemble-size",
"--num-folds",
type=int,
default=4,
metavar="N",
help="Ensemble size, i.e. how many models to retain the final predictor. "
"In the current implementation, this is also the number of training folds.")
help="Number of training folds.")
parser.add_argument(
"--num-replicates",
type=int,
......@@ -296,7 +296,7 @@ def initialize_training(args):
"data",
"out_models_dir",
"hyperparameters",
"ensemble_size",
"num_folds",
]
for arg in required_arguments:
if getattr(args, arg) is None:
......@@ -338,7 +338,7 @@ def initialize_training(args):
folds_df = assign_folds(
df=df,
num_folds=args.ensemble_size,
num_folds=args.num_folds,
held_out_fraction=held_out_fraction,
held_out_max=held_out_max)
......@@ -387,14 +387,14 @@ def initialize_training(args):
if not args.pretrain_data:
raise ValueError("--pretrain-data is required")
for fold in range(args.ensemble_size):
for fold in range(args.num_folds):
for replicate in range(args.num_replicates):
work_dict = {
'work_item_name': str(uuid.uuid4()),
'architecture_num': h,
'num_architectures': len(hyperparameters_lst),
'fold_num': fold,
'num_folds': args.ensemble_size,
'num_folds': args.num_folds,
'replicate_num': replicate,
'num_replicates': args.num_replicates,
'hyperparameters': hyperparameters,
......
......@@ -2,6 +2,11 @@
Tests for training and predicting using Class1 pan-allele models.
"""
import logging
logging.getLogger('tensorflow').disabled = True
logging.getLogger('matplotlib').disabled = True
import json
import os
import shutil
......@@ -36,7 +41,7 @@ HYPERPARAMETERS_LIST = [
'learning_rate': None,
'locally_connected_layers': [],
'loss': 'custom:mse_with_inequalities',
'max_epochs': 5,
'max_epochs': 0, # never selected
'minibatch_size': 256,
'optimizer': 'rmsprop',
'output_activation': 'sigmoid',
......@@ -100,7 +105,7 @@ HYPERPARAMETERS_LIST = [
},
'validation_split': 0.1,
},
][1:]
]
def run_and_check(n_jobs=0, delete=True, additional_args=[]):
......@@ -114,37 +119,47 @@ def run_and_check(n_jobs=0, delete=True, additional_args=[]):
get_path("data_curated", "curated_training_data.no_mass_spec.csv.bz2"))
selected_data_df = data_df.loc[data_df.allele.str.startswith("HLA-A")]
selected_data_df.to_csv(
os.path.join(models_dir, "train_data.csv"), index=False)
os.path.join(models_dir, "_train_data.csv"), index=False)
args = [
"mhcflurry-class1-train-pan-allele-models",
"--data", os.path.join(models_dir, "train_data.csv"),
"--data", os.path.join(models_dir, "_train_data.csv"),
"--allele-sequences", get_path("allele_sequences", "allele_sequences.csv"),
"--hyperparameters", hyperparameters_filename,
"--out-models-dir", models_dir,
"--num-jobs", str(n_jobs),
"--ensemble-size", "2",
"--num-folds", "2",
"--verbosity", "1",
# "--pretrain-data", get_path(
# "random_peptide_predictions", "predictions.csv.bz2"),
] + additional_args
print("Running with args: %s" % args)
subprocess.check_call(args)
result = Class1AffinityPredictor.load(models_dir)
predictions = result.predict(
peptides=["SLYNTVATL"],
# Run model selection
models_dir_selected = tempfile.mkdtemp(
prefix="mhcflurry-test-models-selected")
args = [
"mhcflurry-class1-select-pan-allele-models",
"--data", os.path.join(models_dir, "train_data.csv.bz2"),
"--models-dir", models_dir,
"--out-models-dir", models_dir_selected,
"--max-models", "1",
"--num-jobs", str(n_jobs),
] + additional_args
print("Running with args: %s" % args)
subprocess.check_call(args)
result = Class1AffinityPredictor.load(
models_dir_selected, optimization_level=0)
assert_equal(len(result.neural_networks), 2)
predictions = result.predict(peptides=["SLYNTVATL"],
alleles=["HLA-A*02:01"])
assert_equal(predictions.shape, (1,))
assert_array_less(predictions, 1000)
df = result.predict_to_dataframe(
peptides=["SLYNTVATL"],
alleles=["HLA-A*02:01"])
print(df)
if delete:
print("Deleting: %s" % models_dir)
shutil.rmtree(models_dir)
shutil.rmtree(models_dir_selected)
if os.environ.get("KERAS_BACKEND") != "theano":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment