Skip to content
Snippets Groups Projects
Commit 1cbb85cf authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

first cut on downloads-generation/models_class1_pan_refined

parent 5304b975
No related branches found
No related tags found
No related merge requests found
......@@ -36,42 +36,58 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir -p "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
#LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
#exec > >(tee -ia "$LOG")
#exec 2> >(tee -ia "$LOG" >&2)
LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
exec > >(tee -ia "$LOG")
exec 2> >(tee -ia "$LOG" >&2)
# Log some environment info
#echo "Invocation: $0 $@"
#date
#pip freeze
#git status
echo "Invocation: $0 $@"
date
pip freeze
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
cd /var/folders/jc/fyrvcrcs3sb8g4mkdg6nl_t80000gq/T/mhcflurry-downloads-generation/models_class1_pan_refined
export OMP_NUM_THREADS=1
export PYTHONUNBUFFERED=1
# CAHNGE TO CP
ln -s $SCRIPT_DIR/make_multiallelic_training_data.py .
cp $SCRIPT_DIR/make_multiallelic_training_data.py .
cp $SCRIPT_DIR/hyperparameters.yaml .
time python make_multiallelic_training_data.py \
--hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
--expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
--out train.multiallelic.csv
MONOALLELIC_TRAIN="$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec/train_data.csv.bz2"
ALLELE_LIST=$(bzcat "$MONOALLELIC_TRAIN" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
ALLELE_LIST+=$(cat train.multiallelic.csv | cut -f 7 -d , | gerp -v hla | uniq | tr ' ' '\n' | sort | uniq)
ALLELE_LIST+=$(echo " " $(cat $SCRIPT_DIR/additional_alleles.txt | grep -v '#') )
time mhcflurry-multiallelic-refinement \
--monoallelic-data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
--monoallelic-data "$MONOALLELIC_TRAIN" \
--multiallelic-data train.multiallelic.csv \
--models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec"
--models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec" \
--hyperparameters hyperparameters.yaml \
--out-affinity-predictor-dir $(pwd)/models.affinity \
--out-presentation-predictor-dir $(pwd)/models.presentation \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
$PARALLELISM_ARGS
time mhcflurry-calibrate-percentile-ranks \
--models-dir $(pwd)/models.affinity \
--match-amino-acid-distribution-data "$MONOALLELIC_TRAIN" \
--motif-summary \
--num-peptides-per-length 100000 \
--allele $ALLELE_LIST \
--verbosity 1 \
$PARALLELISM_ARGS
echo "Done training."
bzip2 train.multiallelic.csv
rm train.multiallelic.csv
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 -f "$LOG"
......@@ -79,4 +95,3 @@ for i in $(ls LOG-worker.*.txt) ; do bzip2 -f $i ; done
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $RESULT"
../models_class1_pan/additional_alleles.txt
\ No newline at end of file
../models_class1_pan/cluster_submit_script_header.mssm_hpc.lsf
\ No newline at end of file
-
#########################
# Batch generation
#########################
batch_generator_validation_split: 0.1,
batch_generator_batch_size: 1024,
batch_generator_affinity_fraction: 0.5
\ No newline at end of file
#########################
# Batch generation
#########################
batch_generator_validation_split: 0.1
batch_generator_batch_size: 1024
batch_generator_affinity_fraction: 0.5
max_epochs: 500
......@@ -194,7 +194,7 @@ class MultiallelicMassSpecBatchGenerator(object):
experiment_names,
alleles_matrix,
is_binder,
potential_validation_mask=None):
validation_weights=None):
affinities_mask = numpy.array(affinities_mask, copy=False, dtype=bool)
experiment_names = numpy.array(experiment_names, copy=False)
alleles_matrix = numpy.array(alleles_matrix, copy=False)
......@@ -206,14 +206,18 @@ class MultiallelicMassSpecBatchGenerator(object):
numpy.testing.assert_equal(len(is_binder), n)
numpy.testing.assert_equal(
affinities_mask, pandas.isnull(experiment_names))
if potential_validation_mask is not None:
numpy.testing.assert_equal(len(potential_validation_mask), n)
if validation_weights is not None:
validation_weights = numpy.array(
validation_weights, copy=True, dtype=float)
numpy.testing.assert_equal(len(validation_weights), n)
validation_weights /= validation_weights.sum()
validation_items = numpy.random.choice(
n if potential_validation_mask is None
else numpy.where(potential_validation_mask)[0],
int(self.hyperparameters['batch_generator_validation_split'] * n),
replace=False)
n,
int((self.hyperparameters['batch_generator_validation_split']) * n),
replace=False,
p=validation_weights)
validation_mask = numpy.zeros(n, dtype=bool)
validation_mask[validation_items] = True
......
......@@ -239,7 +239,7 @@ class Class1PresentationNeuralNetwork(object):
def logit(x):
import tensorflow as tf
return - tf.log(
return - tf.math.log(
tf.maximum(
tf.math.divide_no_nan(1., x) - 1.,
0.0))
......@@ -325,6 +325,7 @@ class Class1PresentationNeuralNetwork(object):
allele_encoding,
affinities_mask=None, # True when a peptide/label is actually a peptide and an affinity
inequalities=None, # interpreted only for elements where affinities_mask is True, otherwise ignored
validation_weights=None,
verbose=1,
progress_callback=None,
progress_preamble="",
......@@ -349,6 +350,10 @@ class Class1PresentationNeuralNetwork(object):
affinities_mask = numpy.array(affinities_mask, copy=False)
else:
affinities_mask = numpy.tile(False, len(labels))
if validation_weights is None:
validation_weights = numpy.tile(1.0, len(labels))
else:
validation_weights = numpy.array(validation_weights, copy=False)
inequalities[~affinities_mask] = "="
random_negatives_planner = RandomNegativePeptides(
......@@ -486,9 +491,9 @@ class Class1PresentationNeuralNetwork(object):
numpy.tile(False, num_random_negatives),
numpy.where(affinities_mask, labels, to_ic50(labels)) < 1000.0
]),
potential_validation_mask=numpy.concatenate([
numpy.tile(False, num_random_negatives),
numpy.tile(True, len(labels))
validation_weights=numpy.concatenate([
numpy.tile(0.0, num_random_negatives),
validation_weights
]),
)
if verbose:
......
......@@ -25,7 +25,8 @@ from .regression_target import from_ic50, to_ic50
from .allele_encoding import MultipleAlleleEncoding
from .downloads import get_default_class1_presentation_models_dir
from .class1_presentation_neural_network import Class1PresentationNeuralNetwork
from .common import save_weights, load_weights
from .common import save_weights, load_weights, NumpyJSONEncoder
class Class1PresentationPredictor(object):
def __init__(
......@@ -55,7 +56,7 @@ class Class1PresentationPredictor(object):
model_config = model.get_config()
rows.append((
self.model_name(i),
json.dumps(model_config),
json.dumps(model_config, cls=NumpyJSONEncoder),
model
))
self._manifest_df = pandas.DataFrame(
......@@ -234,7 +235,7 @@ class Class1PresentationPredictor(object):
updated_network_config_jsons = []
for (_, row) in sub_manifest_df.iterrows():
updated_network_config_jsons.append(
json.dumps(row.model.get_config()))
json.dumps(row.model.get_config(), cls=NumpyJSONEncoder))
weights_path = self.weights_path(models_dir, row.model_name)
save_weights(row.model.get_weights(), weights_path)
logging.info("Wrote: %s", weights_path)
......
......@@ -4,6 +4,7 @@ import logging
import sys
import os
import warnings
import json
import numpy
import pandas
......@@ -206,3 +207,19 @@ def load_weights(filename):
with numpy.load(filename) as loaded:
weights = [loaded["array_%d" % i] for i in range(len(loaded.keys()))]
return weights
class NumpyJSONEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, (
numpy.int_, numpy.intc, numpy.intp, numpy.int8,
numpy.int16, numpy.int32, numpy.int64, numpy.uint8,
numpy.uint16, numpy.uint32, numpy.uint64)):
return int(obj)
elif isinstance(obj, (
numpy.float_, numpy.float16, numpy.float32,
numpy.float64)):
return float(obj)
if isinstance(obj, numpy.ndarray):
return obj.tolist()
return json.JSONEncoder.default(self, obj)
\ No newline at end of file
......@@ -8,7 +8,8 @@ import sys
import time
import traceback
import hashlib
from pprint import pprint
import yaml
import pickle
import numpy
import pandas
......@@ -17,8 +18,9 @@ import tqdm # progress bar
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
from .class1_affinity_predictor import Class1AffinityPredictor
from .encodable_sequences import EncodableSequences
from .allele_encoding import AlleleEncoding
from .class1_presentation_neural_network import Class1PresentationNeuralNetwork
from .class1_presentation_predictor import Class1PresentationPredictor
from .allele_encoding import MultipleAlleleEncoding
from .common import configure_logging
from .local_parallelism import (
worker_pool_with_gpu_assignments_from_args,
......@@ -26,7 +28,6 @@ from .local_parallelism import (
from .cluster_parallelism import (
add_cluster_parallelism_args,
cluster_results_from_args)
from .regression_target import from_ic50
# To avoid pickling large matrices to send to child processes when running in
......@@ -47,8 +48,7 @@ parser.add_argument(
"--monoallelic-data",
metavar="FILE.csv",
required=False,
help=(
"Affinity meaurements and monoallelic mass spec data."))
help="Affinity meaurements and monoallelic mass spec data.")
parser.add_argument(
"--models-dir",
metavar="DIR",
......@@ -87,6 +87,8 @@ def run(argv=sys.argv[1:]):
args = parser.parse_args(argv)
hyperparameters = yaml.load(open(args.hyperparameters))
args.out_affinity_predictor_dir = os.path.abspath(
args.out_affinity_predictor_dir)
args.out_presentation_predictor_dir = os.path.abspath(
......@@ -94,7 +96,7 @@ def run(argv=sys.argv[1:]):
configure_logging(verbose=args.verbosity > 1)
multiallelic_df = pandas.read_csv(args.multiallelic_df)
multiallelic_df = pandas.read_csv(args.multiallelic_data)
print("Loaded multiallelic data: %s" % (str(multiallelic_df.shape)))
monoallelic_df = pandas.read_csv(args.monoallelic_data)
......@@ -104,91 +106,69 @@ def run(argv=sys.argv[1:]):
args.models_dir, optimization_level=0)
print("Loaded: %s" % input_predictor)
import ipdb ; ipdb.set_trace()
alleles = input_predictor.supported_alleles
(min_peptide_length, max_peptide_length) = (
input_predictor.supported_peptide_lengths)
metadata_dfs = {}
sample_table = multiallelic_df.drop_duplicates(
"sample_id").set_index("sample_id").loc[
multiallelic_df.sample_id.unique()
]
grouped = multiallelic_df.groupby("sample_id").nunique()
for col in sample_table.columns:
if (grouped[col] > 1).any():
del sample_table[col]
sample_table["alleles"] = sample_table.hla.str.split()
fold_cols = [c for c in df if c.startswith("fold_")]
fold_cols = [c for c in monoallelic_df if c.startswith("fold_")]
num_folds = len(fold_cols)
if num_folds <= 1:
raise ValueError("Too few folds: ", num_folds)
df = df.loc[
(df.peptide.str.len() >= min_peptide_length) &
(df.peptide.str.len() <= max_peptide_length)
]
print("Subselected to %d-%dmers: %s" % (
min_peptide_length, max_peptide_length, str(df.shape)))
print("Num folds: ", num_folds, "fraction included:")
print(df[fold_cols].mean())
# Allele names in data are assumed to be already normalized.
df = df.loc[df.allele.isin(alleles)].dropna()
print("Subselected to supported alleles: %s" % str(df.shape))
metadata_dfs["model_selection_data"] = df
df["mass_spec"] = df.measurement_source.str.contains(
args.mass_spec_regex)
def make_train_peptide_hash(sub_df):
train_peptide_hash = hashlib.sha1()
for peptide in sorted(sub_df.peptide.values):
train_peptide_hash.update(peptide.encode())
return train_peptide_hash.hexdigest()
folds_to_predictors = dict(
(int(col.split("_")[-1]), (
[],
make_train_peptide_hash(df.loc[df[col] == 1])))
for col in fold_cols)
print(folds_to_predictors)
work_items = []
for model in input_predictor.class1_pan_allele_models:
training_info = model.fit_info[-1]['training_info']
fold_num = training_info['fold_num']
assert num_folds == training_info['num_folds']
(lst, hash) = folds_to_predictors[fold_num]
train_peptide_hash = training_info['train_peptide_hash']
numpy.testing.assert_equal(hash, train_peptide_hash)
lst.append(model)
work_items = []
for (fold_num, (models, _)) in folds_to_predictors.items():
fold_col = "fold_%d" % fold_num
observed_hash = make_train_peptide_hash(
monoallelic_df.loc[monoallelic_df[fold_col] == 1])
saved_hash = training_info['train_peptide_hash']
#numpy.testing.assert_equal(observed_hash, saved_hash)
work_items.append({
'fold_num': fold_num,
'models': models,
'min_models': args.min_models_per_fold,
'max_models': args.max_models_per_fold,
"work_item_num": len(work_items),
"affinity_model": model,
"fold_num": fold_num,
})
GLOBAL_DATA["data"] = df
GLOBAL_DATA["input_predictor"] = input_predictor
work_items_dict = dict((item['work_item_num'], item) for item in work_items)
if not os.path.exists(args.out_models_dir):
print("Attempting to create directory: %s" % args.out_models_dir)
os.mkdir(args.out_models_dir)
print("Done.")
GLOBAL_DATA["monoallelic_data"] = monoallelic_df
GLOBAL_DATA["multiallelic_data"] = multiallelic_df
GLOBAL_DATA["multiallelic_sample_table"] = sample_table
GLOBAL_DATA["hyperparameters"] = hyperparameters
GLOBAL_DATA["allele_to_sequence"] = input_predictor.allele_to_sequence
result_predictor = Class1AffinityPredictor(
allele_to_sequence=input_predictor.allele_to_sequence,
metadata_dataframes=metadata_dfs)
out_dirs = [
args.out_affinity_predictor_dir,
args.out_presentation_predictor_dir
]
for out in out_dirs:
if not os.path.exists(out):
print("Attempting to create directory:", out)
os.mkdir(out)
print("Done.")
metadata_dfs = {
"monoallelic_train_data": monoallelic_df,
"multiallelic_train_data": multiallelic_df,
}
affinity_models = []
presentation_models = []
serial_run = not args.cluster_parallelism and args.num_jobs == 0
worker_pool = None
......@@ -196,13 +176,13 @@ def run(argv=sys.argv[1:]):
if serial_run:
# Serial run
print("Running in serial.")
results = (model_select(**item) for item in work_items)
results = (refine_model(**item) for item in work_items)
elif args.cluster_parallelism:
# Run using separate processes HPC cluster.
print("Running on cluster.")
results = cluster_results_from_args(
args,
work_function=model_select,
work_function=refine_model,
work_items=work_items,
constant_data=GLOBAL_DATA,
result_serialization_method="pickle")
......@@ -216,131 +196,109 @@ def run(argv=sys.argv[1:]):
# Parallel run
results = worker_pool.imap_unordered(
do_model_select_task,
do_refine_model_task,
work_items,
chunksize=1)
models_by_fold = {}
summary_dfs = []
for result in tqdm.tqdm(results, total=len(work_items)):
pprint(result)
fold_num = result['fold_num']
(all_models_for_fold, _) = folds_to_predictors[fold_num]
models = [
all_models_for_fold[i]
for i in result['selected_indices']
]
summary_df = result['summary'].copy()
summary_df.index = summary_df.index.map(
lambda idx: all_models_for_fold[idx])
summary_dfs.append(summary_df)
print("Selected %d models for fold %d: %s" % (
len(models), fold_num, result['selected_indices']))
models_by_fold[fold_num] = models
for model in models:
model.clear_allele_representations()
result_predictor.add_pan_allele_model(model)
summary_df = pandas.concat(summary_dfs, ignore_index=False)
summary_df["model_config"] = summary_df.index.map(lambda m: m.get_config())
result_predictor.metadata_dataframes["model_selection_summary"] = (
summary_df.reset_index(drop=True))
result_predictor.save(args.out_models_dir)
model_selection_time = time.time() - start
work_item_num = result['work_item_num']
work_item = work_items_dict[work_item_num]
affinity_model = work_item['affinity_model']
presentation_model = pickle.loads(result['presentation_model'])
presentation_model.copy_weights_to_affinity_model(affinity_model)
affinity_models.append(affinity_model)
presentation_models.append(presentation_model)
if worker_pool:
worker_pool.close()
worker_pool.join()
print("Model selection time %0.2f min." % (model_selection_time / 60.0))
print("Predictor [%d models] written to: %s" % (
len(result_predictor.neural_networks),
args.out_models_dir))
print("Done model fitting. Writing predictors.")
result_affinity_predictor = Class1AffinityPredictor(
class1_pan_allele_models=affinity_models,
allele_to_sequence=input_predictor.allele_to_sequence)
result_affinity_predictor.save(args.out_affinity_predictor_dir)
print("Wrote", args.out_affinity_predictor_dir)
result_presentation_predictor = Class1PresentationPredictor(
models=presentation_models,
allele_to_sequence=input_predictor.allele_to_sequence,
metadata_dataframes=metadata_dfs)
result_presentation_predictor.save(args.out_presentation_predictor_dir)
print("Wrote", args.out_presentation_predictor_dir)
def do_model_select_task(item, constant_data=GLOBAL_DATA):
return model_select(constant_data=constant_data, **item)
def do_refine_model_task(item, constant_data=GLOBAL_DATA):
return refine_model(constant_data=constant_data, **item)
def model_select(
fold_num, models, min_models, max_models, constant_data=GLOBAL_DATA):
def refine_model(
work_item_num, affinity_model, fold_num, constant_data=GLOBAL_DATA):
"""
Model select for a fold.
Parameters
----------
fold_num : int
models : list of Class1NeuralNetwork
min_models : int
max_models : int
constant_data : dict
Returns
-------
dict with keys 'fold_num', 'selected_indices', 'summary'
Refine a model.
"""
full_data = constant_data["data"]
input_predictor = constant_data["input_predictor"]
df = full_data.loc[
full_data["fold_%d" % fold_num] == 0
]
peptides = EncodableSequences.create(df.peptide.values)
alleles = AlleleEncoding(
df.allele.values,
borrow_from=input_predictor.master_allele_encoding)
predictions_df = df.copy()
for (i, model) in enumerate(models):
predictions_df[i] = from_ic50(model.predict(peptides, alleles))
actual = from_ic50(predictions_df.measurement_value)
selected = []
selected_score = 0
remaining_models = set(numpy.arange(len(models)))
individual_model_scores = {}
while remaining_models and len(selected) < max_models:
best_model = None
best_model_score = 0
for i in remaining_models:
possible_ensemble = list(selected) + [i]
predictions = predictions_df[possible_ensemble].mean(1)
mse_score = 1 - mse(
predictions,
actual,
inequalities=(
predictions_df.measurement_inequality
if 'measurement_inequality' in predictions_df.columns
else None),
affinities_are_already_01_transformed=True)
if mse_score >= best_model_score:
best_model = i
best_model_score = mse_score
if not selected:
# First iteration. Store individual model scores.
individual_model_scores[i] = mse_score
if len(selected) < min_models or best_model_score > selected_score:
selected_score = best_model_score
remaining_models.remove(best_model)
selected.append(best_model)
else:
break
assert selected
summary_df = pandas.Series(individual_model_scores)[
numpy.arange(len(models))
].to_frame()
summary_df.columns = ['mse_score']
monoallelic_df = constant_data["monoallelic_data"]
multiallelic_df = constant_data["multiallelic_data"]
sample_table = constant_data["multiallelic_sample_table"]
hyperparameters = constant_data["hyperparameters"]
allele_to_sequence = constant_data["allele_to_sequence"]
fold_col = "fold_%d" % fold_num
multiallelic_train_df = multiallelic_df[
["sample_id", "peptide", "hit"]
].rename(columns={"hit": "label"}).copy()
multiallelic_train_df["is_affinity"] = False
multiallelic_train_df["validation_weight"] = 1.0
monoallelic_train_df = monoallelic_df[
["peptide", "allele", "measurement_inequality", "measurement_value"]
].copy()
monoallelic_train_df["label"] = monoallelic_train_df["measurement_value"]
del monoallelic_train_df["measurement_value"]
monoallelic_train_df["is_affinity"] = True
# We force all validation affinities to be from the validation set used
# originally to train the predictor. To ensure proportional sampling between
# the affinity and multiallelic mass spec data, we set their weight to
# as follows.
monoallelic_train_df["validation_weight"] = (
(monoallelic_df[fold_col] == 0).astype(float) * (
(monoallelic_df[fold_col] == 1).sum() /
(monoallelic_df[fold_col] == 0).sum()))
combined_train_df = pandas.concat(
[multiallelic_train_df, monoallelic_train_df],
ignore_index=True,
sort=False)
allele_encoding = MultipleAlleleEncoding(
experiment_names=multiallelic_train_df.sample_id.values,
experiment_to_allele_list=sample_table.alleles.to_dict(),
allele_to_sequence=allele_to_sequence,
)
allele_encoding.append_alleles(monoallelic_train_df.allele.values)
allele_encoding = allele_encoding.compact()
presentation_model = Class1PresentationNeuralNetwork(**hyperparameters)
presentation_model.load_from_class1_neural_network(affinity_model)
presentation_model.fit(
peptides=combined_train_df.peptide.values,
labels=combined_train_df.label.values,
allele_encoding=allele_encoding,
affinities_mask=combined_train_df.is_affinity.values,
inequalities=combined_train_df.measurement_inequality.values,
validation_weights=combined_train_df.validation_weight.values)
return {
'fold_num': fold_num,
'selected_indices': selected,
'summary': summary_df, # indexed by model index
'work_item_num': work_item_num,
# We pickle it here so it always gets pickled, even when running in one
# process. This prevents tensorflow errors when using thread-level
# parallelism.
'presentation_model': pickle.dumps(
presentation_model, pickle.HIGHEST_PROTOCOL),
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment