diff --git a/downloads-generation/models_class1_pan/GENERATE.sh b/downloads-generation/models_class1_pan/GENERATE.sh new file mode 100755 index 0000000000000000000000000000000000000000..c3aa7fad9441be3d33ba7815855857b7409aab9d --- /dev/null +++ b/downloads-generation/models_class1_pan/GENERATE.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Model select pan-allele MHCflurry Class I models and calibrate percentile ranks. +# +set -e +set -x + +DOWNLOAD_NAME=models_class1_pan +SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation +SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" +SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH") + +mkdir -p "$SCRATCH_DIR" +rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" +mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" + +# Send stdout and stderr to a logfile included with the archive. +exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") +exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) + +# Log some environment info +date +pip freeze +git status + +cd $SCRATCH_DIR/$DOWNLOAD_NAME + +cp $SCRIPT_ABSOLUTE_PATH . + +GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0 +echo "Detected GPUS: $GPUS" + +PROCESSORS=$(getconf _NPROCESSORS_ONLN) +echo "Detected processors: $PROCESSORS" + +NUM_JOBS=$GPUS +if [ "$NUM_JOBS" -eq "0" ]; then + NUM_JOBS=1 +fi +echo "Num jobs: $NUM_JOBS" + +export PYTHONUNBUFFERED=1 + +UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)" + +for kind in with_mass_spec #no_mass_spec +do + MODELS_DIR="$UNSELECTED_PATH/models.${kind}" + time mhcflurry-class1-select-pan-allele-models \ + --data "$MODELS_DIR/train_data.csv.bz2" \ + --models-dir "$MODELS_DIR" \ + --out-models-dir models.${kind} \ + --min-models 8 \ + --max-models 32 \ + --num-jobs 0 \ + --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1 + + time mhcflurry-calibrate-percentile-ranks \ + --models-dir models.${kind} \ + --num-peptides-per-length 10000 \ + --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1 +done + +bzip2 LOG.txt +for i in $(ls LOG-worker.*.txt) ; do bzip2 $i ; done +RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2" +tar -cjf "$RESULT" * +echo "Created archive: $RESULT" diff --git a/downloads-generation/models_class1_pan_unselected/GENERATE.sh b/downloads-generation/models_class1_pan_unselected/GENERATE.sh index b70d5b8d3d1a82aa911f68b504cce1d4b5ba6d2d..4be7648aec01ce91adae8370f3e0512fecd31dd8 100755 --- a/downloads-generation/models_class1_pan_unselected/GENERATE.sh +++ b/downloads-generation/models_class1_pan_unselected/GENERATE.sh @@ -34,9 +34,10 @@ echo "Detected GPUS: $GPUS" PROCESSORS=$(getconf _NPROCESSORS_ONLN) echo "Detected processors: $PROCESSORS" -NUM_JOBS=$GPUS -if [ "$NUM_JOBS" -eq "0" ]; then - NUM_JOBS=1 +if [ "$GPUS" -eq "0" ]; then + NUM_JOBS=${NUM_JOBS-1} +else + NUM_JOBS=${NUM_JOBS-$GPUS} fi echo "Num jobs: $NUM_JOBS" diff --git a/mhcflurry/calibrate_percentile_ranks_command.py b/mhcflurry/calibrate_percentile_ranks_command.py index 87243bdbf3cdaeba3a84277b03056d1acf0ea5eb..43cc9d8664c12d23fdf59880110af96e2b2d60f9 100644 --- a/mhcflurry/calibrate_percentile_ranks_command.py +++ b/mhcflurry/calibrate_percentile_ranks_command.py @@ -39,8 +39,8 @@ parser.add_argument( "--allele", default=None, nargs="+", - help="Alleles to train models for. If not specified, all alleles with " - "enough measurements will be used.") + help="Alleles to calibrate percentile ranks for. If not specified all " + "alleles are used") parser.add_argument( "--num-peptides-per-length", type=int, @@ -56,6 +56,7 @@ parser.add_argument( add_local_parallelism_args(parser) + def run(argv=sys.argv[1:]): global GLOBAL_DATA @@ -78,7 +79,8 @@ def run(argv=sys.argv[1:]): start = time.time() - print("Performing percent rank calibration. Encoding peptides.") + print("Performing percent rank calibration for %d alleles: encoding peptides" % ( + len(alleles))) encoded_peptides = predictor.calibrate_percentile_ranks( alleles=[], # don't actually do any calibration, just return peptides num_peptides_per_length=args.num_peptides_per_length) diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py index 3497e25d8a2437103d70dfe48d968578b53ed9e6..0c38373927c00da15ff86f835e688d75f605f075 100644 --- a/mhcflurry/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_predictor.py @@ -80,7 +80,10 @@ class Class1AffinityPredictor(object): if class1_pan_allele_models is None: class1_pan_allele_models = [] - self.allele_to_sequence = allele_to_sequence + self.allele_to_sequence = ( + dict(allele_to_sequence) + if allele_to_sequence is not None else None) # make a copy + self.master_allele_encoding = None if class1_pan_allele_models: assert self.allele_to_sequence @@ -112,8 +115,7 @@ class Class1AffinityPredictor(object): json.dumps(model.get_config()), model )) - for (allele, - models) in self.allele_to_allele_specific_models.items(): + for (allele, models) in self.allele_to_allele_specific_models.items(): for (i, model) in enumerate(models): rows.append(( self.model_name(allele, i), @@ -126,6 +128,18 @@ class Class1AffinityPredictor(object): columns=["model_name", "allele", "config_json", "model"]) return self._manifest_df + def subselect_alleles(self, alleles): + if self.allele_to_sequence: + alleles = [ + mhcnames.normalize_allele_name(allele) + for allele in set(alleles) + ] + + allele_to_sequence = dict( + (a, self.allele_to_sequence[a]) for a in alleles) + self.allele_to_sequence = allele_to_sequence + self.clear_cache() + def clear_cache(self): """ Clear values cached based on the neural networks in this predictor. @@ -439,7 +453,7 @@ class Class1AffinityPredictor(object): if exists(join(models_dir, "allele_sequences.csv")): allele_to_fixed_length_sequence = pandas.read_csv( join(models_dir, "allele_sequences.csv"), - index_col="allele").sequence.to_dict() + index_col=0).iloc[:,0].to_dict() allele_to_percent_rank_transform = {} percent_ranks_path = join(models_dir, "percent_ranks.csv") diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py index 6e1b3148c381b2b838c7cf9ce42b337ce3d03a08..0be671aca5634d988114722396aabcb6a533c40c 100644 --- a/mhcflurry/class1_neural_network.py +++ b/mhcflurry/class1_neural_network.py @@ -1197,12 +1197,42 @@ class Class1NeuralNetwork(object): allele_representations """ - reshaped = allele_representations.reshape((allele_representations.shape[0], -1)) - layer = self.network().get_layer("allele_representation") - (existing,) = layer.get_weights() - if existing.shape == reshaped.shape: - layer.set_weights([reshaped]) - else: - raise NotImplementedError( - "Network surgery required: %s != %s" % ( - str(existing.shape), str(reshaped.shape))) + from keras.models import model_from_json + reshaped = allele_representations.reshape( + (allele_representations.shape[0], -1)) + original_model = self.network() + layer = original_model.get_layer("allele_representation") + (existing_weights,) = layer.get_weights() + + # Only changes to the number of supported alleles (not the length of + # the allele sequences) are allowed. + assert existing_weights.shape[1:] == reshaped.shape[1:] + + if existing_weights.shape != reshaped.shape: + # Network surgery required. Make a new network with this layer's + # dimensions changed. Kind of a hack. + layer.input_dim = reshaped.shape[0] + new_model = model_from_json(original_model.to_json()) + + # copy weights for other layers over + for layer in new_model.layers: + if layer.name != "allele_representation": + layer.set_weights( + original_model.get_layer(name=layer.name).get_weights()) + + self._network = new_model + self.update_network_description() + + layer = new_model.get_layer("allele_representation") + (existing_weights,) = layer.get_weights() + + # Disable the old model to catch bugs. + def throw(*args, **kwargs): + raise RuntimeError("Using a disabled model!") + original_model.predict = \ + original_model.fit = \ + original_model.fit_generator = throw + + assert existing_weights.shape == reshaped.shape, ( + existing_weights.shape, reshaped.shape) + layer.set_weights([reshaped]) diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml index afc109f36c1324a93729c179572e91f62fce9766..b9ecff5259f3eefe3fc8a810e027622cd65222eb 100644 --- a/mhcflurry/downloads.yml +++ b/mhcflurry/downloads.yml @@ -20,7 +20,7 @@ releases: 2.0.0: compatibility-version: 2 downloads: - - name: model_class1_pan_unselected + - name: models_class1_pan_unselected url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/model_class1_pan_unselected.manual_build.20190730.tar.bz2 default: false diff --git a/mhcflurry/select_pan_allele_models_command.py b/mhcflurry/select_pan_allele_models_command.py index 12f53a3699655912c3fc2c30f6d6f6d146e79800..8102e7fed96285bcc14c0bc36c3d65a8cf40fdc9 100644 --- a/mhcflurry/select_pan_allele_models_command.py +++ b/mhcflurry/select_pan_allele_models_command.py @@ -123,6 +123,9 @@ def run(argv=sys.argv[1:]): configure_logging(verbose=args.verbosity > 1) + df = pandas.read_csv(args.data) + print("Loaded data: %s" % (str(df.shape))) + input_predictor = Class1AffinityPredictor.load(args.models_dir) print("Loaded: %s" % input_predictor) @@ -131,8 +134,6 @@ def run(argv=sys.argv[1:]): input_predictor.supported_peptide_lengths) metadata_dfs = {} - df = pandas.read_csv(args.data) - print("Loaded data: %s" % (str(df.shape))) if args.folds: folds_df = pandas.read_csv(args.folds) @@ -191,7 +192,7 @@ def run(argv=sys.argv[1:]): assert num_folds == training_info['num_folds'] (lst, hash) = folds_to_predictors[fold_num] train_peptide_hash = training_info['train_peptide_hash'] - #numpy.testing.assert_equal(hash, train_peptide_hash) #enable later + numpy.testing.assert_equal(hash, train_peptide_hash) lst.append(model) work_items = [] diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py index a2bf3944912767fbb07dee585487afc7ba28b4ce..8e56bbd468e9508c0f95c6ae96c87bea42a6ebb7 100644 --- a/mhcflurry/train_pan_allele_models_command.py +++ b/mhcflurry/train_pan_allele_models_command.py @@ -129,6 +129,7 @@ parser.add_argument( add_local_parallelism_args(parser) add_cluster_parallelism_args(parser) + def assign_folds(df, num_folds, held_out_fraction, held_out_max): result_df = pandas.DataFrame(index=df.index) @@ -284,6 +285,14 @@ def main(args): print("Will use %d / %d allele sequences" % ( len(allele_sequences_in_use), len(allele_sequences))) + # All alleles, not just those with training data. + full_allele_encoding = AlleleEncoding( + alleles=allele_sequences.index.values, + allele_to_sequence=allele_sequences.to_dict() + ) + + # Only alleles with training data. For efficiency we perform model training + # using only these alleles in the neural network embedding layer. allele_encoding = AlleleEncoding( alleles=allele_sequences_in_use.index.values, allele_to_sequence=allele_sequences_in_use.to_dict()) @@ -366,9 +375,6 @@ def main(args): results_generator = worker_pool.imap_unordered( partial(call_wrapped_kwargs, train_model), - - - work_items, chunksize=1) else: @@ -411,7 +417,11 @@ def main(args): predictor.merge_in_place(unsaved_predictors) print("Saving final predictor to: %s" % args.out_models_dir) - predictor.save(args.out_models_dir) # write all models just to be sure + # We want the final predictor to support all alleles with sequences, not + # just those we actually used for model training. + predictor.allele_to_sequence = full_allele_encoding.allele_to_sequence + predictor.clear_cache() + predictor.save(args.out_models_dir) print("Done.") print("*" * 30) diff --git a/test/test_changing_allele_representations.py b/test/test_changing_allele_representations.py new file mode 100644 index 0000000000000000000000000000000000000000..84492fad51b2b1dca59d0ddc81a5e80efb68f15d --- /dev/null +++ b/test/test_changing_allele_representations.py @@ -0,0 +1,100 @@ +import time +import pandas + +from mhcflurry.allele_encoding import AlleleEncoding +from mhcflurry.amino_acid import BLOSUM62_MATRIX +from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor +from mhcflurry.downloads import get_path + +from numpy.testing import assert_equal + +ALLELE_TO_SEQUENCE = pandas.read_csv( + get_path( + "allele_sequences", "allele_sequences.csv"), + index_col=0).sequence.to_dict() + +HYPERPARAMETERS = { + 'activation': 'tanh', + 'allele_dense_layer_sizes': [], + 'batch_normalization': False, + 'dense_layer_l1_regularization': 0.0, + 'dense_layer_l2_regularization': 0.0, + 'dropout_probability': 0.5, + 'early_stopping': True, + 'init': 'glorot_uniform', + 'layer_sizes': [4], + 'learning_rate': None, + 'locally_connected_layers': [], + 'loss': 'custom:mse_with_inequalities', + 'max_epochs': 40, + 'minibatch_size': 128, + 'optimizer': 'rmsprop', + 'output_activation': 'sigmoid', + 'patience': 2, + 'peptide_allele_merge_activation': '', + 'peptide_allele_merge_method': 'concatenate', + 'peptide_amino_acid_encoding': 'BLOSUM62', + 'peptide_dense_layer_sizes': [], + 'peptide_encoding': { + 'alignment_method': 'left_pad_centered_right_pad', + 'max_length': 15, + 'vector_encoding_name': 'BLOSUM62', + }, + 'random_negative_affinity_max': 50000.0, + 'random_negative_affinity_min': 20000.0, + 'random_negative_constant': 0, + 'random_negative_distribution_smoothing': 0.0, + 'random_negative_match_distribution': True, + 'random_negative_rate': 0.0, + 'train_data': {}, + 'validation_split': 0.1, +} + + +def test_changing_allele_representations(): + allele1 = "HLA-A*02:01" + allele2 = "HLA-C*03:04" + allele3 = "HLA-B*07:01" + + peptide = "SIINFEKL" + + allele_to_sequence = {} + for allele in [allele1, allele2]: + allele_to_sequence[allele] = ALLELE_TO_SEQUENCE[allele] + + data1 = [] + for i in range(5000): + data1.append((allele1, peptide, 0, "=")) + data1.append((allele2, peptide, 50000, "=")) + data1 = pandas.DataFrame( + data1, columns=["allele", "peptide", "affinity", "inequality"]) + + predictor = Class1AffinityPredictor(allele_to_sequence=allele_to_sequence) + predictor.fit_class1_pan_allele_models( + n_models=1, + architecture_hyperparameters=HYPERPARAMETERS, + alleles=data1.allele.values, + peptides=data1.peptide.values, + affinities=data1.affinity.values, + inequalities=data1.inequality.values) + + (value1, value2) = predictor.predict([peptide, peptide], alleles=[allele1, allele2]) + assert value1 < 100, value1 + assert value2 > 4000, value2 + + allele_to_sequence[allele3] = ALLELE_TO_SEQUENCE[allele3] + predictor.allele_to_sequence = allele_to_sequence + predictor.clear_cache() + + (value1, value2, value3) = predictor.predict( + [peptide, peptide, peptide], + alleles=[allele1, allele2, allele3]) + assert value1 < 100, value1 + assert value2 > 4000, value2 + + + + + + +