diff --git a/downloads-generation/models_class1_presentation/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/models_class1_presentation/GENERATE.WITH_HPC_CLUSTER.sh
new file mode 100755
index 0000000000000000000000000000000000000000..53125eb7bec329ecbbd0d230b8afe809c1064204
--- /dev/null
+++ b/downloads-generation/models_class1_presentation/GENERATE.WITH_HPC_CLUSTER.sh
@@ -0,0 +1 @@
+bash GENERATE.sh cluster
diff --git a/downloads-generation/models_class1_presentation/GENERATE.sh b/downloads-generation/models_class1_presentation/GENERATE.sh
new file mode 100755
index 0000000000000000000000000000000000000000..29f27c866e7b9e6f180c6a71958c00975219dffa
--- /dev/null
+++ b/downloads-generation/models_class1_presentation/GENERATE.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+#
+#
+# Usage: GENERATE.sh <local|cluster> <fresh|continue-incomplete>
+#
+# cluster mode uses an HPC cluster (the Mount Sinai "chimera" cluster, which
+# uses the LSF job scheduler). This would need to be modified for other sites.
+#
+set -e
+set -x
+
+DOWNLOAD_NAME=models_class1_presentation
+SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
+SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
+SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
+
+if [ "$1" != "cluster" ]
+then
+    GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
+    echo "Detected GPUS: $GPUS"
+
+    PROCESSORS=$(getconf _NPROCESSORS_ONLN)
+    echo "Detected processors: $PROCESSORS"
+
+    if [ "$GPUS" -eq "0" ]; then
+       NUM_JOBS=${NUM_JOBS-1}
+    else
+        NUM_JOBS=${NUM_JOBS-$GPUS}
+    fi
+    echo "Num jobs: $NUM_JOBS"
+    PARALLELISM_ARGS+=" --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1"
+else
+    PARALLELISM_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir $HOME/mhcflurry-scratch --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf"
+fi
+
+mkdir -p "$SCRATCH_DIR"
+if [ "$2" != "continue-incomplete" ]
+then
+    echo "Fresh run"
+    rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
+    mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
+else
+    echo "Continuing incomplete run"
+fi
+
+# Send stdout and stderr to a logfile included with the archive.
+LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
+exec > >(tee -ia "$LOG")
+exec 2> >(tee -ia "$LOG" >&2)
+
+# Log some environment info
+echo "Invocation: $0 $@"
+date
+pip freeze
+git status
+mhcflurry-downloads info
+
+cd $SCRATCH_DIR/$DOWNLOAD_NAME
+
+export OMP_NUM_THREADS=1
+export PYTHONUNBUFFERED=1
+
+if [ "$2" == "continue-incomplete" ] && [ -f "hits_with_tpm.csv.bz2" ]
+then
+    echo "Reusing existing expression-annotated hits data"
+else
+    cp $SCRIPT_DIR/annotate_hits_with_expression.py .
+    time python annotate_hits_with_expression.py \
+        --hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
+        --expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
+        --out "$(pwd)/hits_with_tpm.csv"
+    bzip2 -f hits_with_tpm.csv
+fi
+
+if [ "$2" == "continue-incomplete" ] && [ -f "train_data.csv.bz2" ]
+then
+    echo "Reusing existing training data"
+else
+    cp $SCRIPT_DIR/make_benchmark.py .
+    time python make_benchmark.py \
+        --hits "$(pwd)/hits_with_tpm.csv.bz2" \
+        --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
+        --decoys-per-hit 99 \
+        --exclude-pmid 31844290 31495665 31154438 \
+        --out "$(pwd)/train_data.csv"
+    bzip2 -f train_data.csv
+fi
+
+mhcflurry-class1-train-presentation-models \
+    --data "$(pwd)/train_data.csv.bz2" \
+    --affinity-predictor "$(mhcflurry-downloads path models_class1_pan)/models.combined" \
+    --cleavage-predictor-with-flanking "$(mhcflurry-downloads path models_class1_cleavage)/models.selected" \
+    --cleavage-predictor-without-flanking "$(mhcflurry-downloads path models_class1_cleavage_variants)/models.selected.no_flank" \
+    --out-models-dir "$(pwd)/models" \
+    --only-format MULTIALLELIC \
+    --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
+    $PARALLELISM_ARGS
+
+cp $SCRIPT_ABSOLUTE_PATH .
+bzip2 -f "$LOG"
+for i in $(ls LOG-worker.*.txt) ; do bzip2 -f $i ; done
+RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
+tar -cjf "$RESULT" *
+echo "Created archive: $RESULT"
diff --git a/downloads-generation/models_class1_presentation/annotate_hits_with_expression.py b/downloads-generation/models_class1_presentation/annotate_hits_with_expression.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca54da7244c7866aab24a280d7ada9989e32a353
--- /dev/null
+++ b/downloads-generation/models_class1_presentation/annotate_hits_with_expression.py
@@ -0,0 +1,68 @@
+"""
+Annotate hits with expression (TPM), and roll up to just the highest-expressed
+gene for each hit.
+"""
+import sys
+import argparse
+import os
+import numpy
+import json
+import collections
+from six.moves import StringIO
+
+import pandas
+import tqdm
+
+import mhcnames
+
+
+parser = argparse.ArgumentParser(usage=__doc__)
+
+parser.add_argument(
+    "--hits",
+    metavar="CSV",
+    required=True,
+    help="Multiallelic mass spec")
+parser.add_argument(
+    "--expression",
+    metavar="CSV",
+    required=True,
+    help="Expression data")
+parser.add_argument(
+    "--out",
+    metavar="CSV",
+    required=True,
+    help="File to write")
+
+
+def run():
+    args = parser.parse_args(sys.argv[1:])
+    args.out = os.path.abspath(args.out)
+
+    hit_df = pandas.read_csv(args.hits)
+    hit_df = hit_df.loc[
+        (~hit_df.protein_ensembl.isnull())
+    ]
+    print("Loaded hits from %d samples" % hit_df.sample_id.nunique())
+    expression_df = pandas.read_csv(args.expression, index_col=0).fillna(0)
+
+    # Add a column to hit_df giving expression value for that sample and that gene
+    print("Annotating expression.")
+    hit_df["tpm"] = [
+        expression_df.reindex(
+            row.protein_ensembl.split())[row.expression_dataset].sum()
+        for _, row in tqdm.tqdm(
+            hit_df.iterrows(), total=len(hit_df), ascii=True, maxinterval=10000)
+    ]
+
+    # Discard hits except those that have max expression for each hit_id
+    print("Selecting max-expression transcripts for each hit.")
+    max_gene_hit_df = hit_df.loc[
+        hit_df.tpm == hit_df.hit_id.map(hit_df.groupby("hit_id").tpm.max())
+    ].sample(frac=1.0).drop_duplicates("hit_id")
+
+    max_gene_hit_df.to_csv(args.out, index=False)
+    print("Wrote", args.out)
+
+if __name__ == '__main__':
+    run()
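The rollup idiom in annotate_hits_with_expression.py (keep the max-TPM row per hit_id, shuffling first so exact ties break at random) is easiest to see on toy data. A minimal illustrative sketch; the hit IDs, gene IDs, and TPM values below are invented:

```python
# Illustrative only: toy data exercising the max-expression rollup above.
import pandas

hits = pandas.DataFrame({
    "hit_id": ["h1", "h1", "h2"],
    "protein_ensembl": ["ENSG_A", "ENSG_B", "ENSG_C"],
    "tpm": [5.0, 20.0, 7.0],
})

# Keep only rows achieving the per-hit max TPM; sample(frac=1.0) shuffles so
# drop_duplicates breaks exact ties at random.
max_gene = hits.loc[
    hits.tpm == hits.hit_id.map(hits.groupby("hit_id").tpm.max())
].sample(frac=1.0).drop_duplicates("hit_id")
print(max_gene)  # h1 resolves to ENSG_B (tpm=20.0); h2 keeps ENSG_C
```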
diff --git a/downloads-generation/models_class1_presentation/cluster_submit_script_header.mssm_hpc.lsf b/downloads-generation/models_class1_presentation/cluster_submit_script_header.mssm_hpc.lsf
new file mode 100644
index 0000000000000000000000000000000000000000..b1eec1a69a81d2c49a8feea9ec61c222eead5480
--- /dev/null
+++ b/downloads-generation/models_class1_presentation/cluster_submit_script_header.mssm_hpc.lsf
@@ -0,0 +1,44 @@
+#!/bin/bash
+#BSUB -J MHCf-{work_item_num} # Job name
+#BSUB -P acc_nkcancer # allocation account or Unix group
+#BSUB -q gpu # queue
+#BSUB -R rusage[ngpus_excl_p=1] # 1 exclusive GPU
+#BSUB -R span[hosts=1] # one node
+#BSUB -n 1 # number of compute cores
+#BSUB -W 10:00 # walltime in HH:MM
+#BSUB -R rusage[mem=20000] # MB of memory requested
+#BSUB -o {work_dir}/%J.stdout # output log (%J : JobID)
+#BSUB -eo {work_dir}/STDERR # error log
+#BSUB -L /bin/bash # Initialize the execution environment
+#
+
+set -e
+set -x
+
+echo "Subsequent stderr output redirected to stdout" >&2
+exec 2>&1
+
+export TMPDIR=/local/JOBS/mhcflurry-{work_item_num}
+export PATH=$HOME/.conda/envs/py36b/bin/:$PATH
+export PYTHONUNBUFFERED=1
+export KMP_SETTINGS=1
+
+free -m
+
+module add cuda/10.0.130
+module list
+
+export CUDNN_HOME=/hpc/users/odonnt02/oss/cudnn/cuda
+export LD_LIBRARY_PATH=$CUDNN_HOME/lib64:$LD_LIBRARY_PATH
+export CMAKE_LIBRARY_PATH=$CUDNN_HOME/lib64:$CMAKE_LIBRARY_PATH
+export INCLUDE_PATH=$CUDNN_HOME/include:$INCLUDE_PATH
+export C_INCLUDE_PATH=$CUDNN_HOME/include:$C_INCLUDE_PATH
+export CPLUS_INCLUDE_PATH=$CUDNN_HOME/include:$CPLUS_INCLUDE_PATH
+export CMAKE_INCLUDE_PATH=$CUDNN_HOME/include:$CMAKE_INCLUDE_PATH
+
+python -c 'import tensorflow as tf ; print("GPU AVAILABLE" if tf.test.is_gpu_available() else "GPU NOT AVAILABLE")'
+
+env
+
+cd {work_dir}
+
diff --git a/downloads-generation/models_class1_presentation/generate_hyperparameters.py b/downloads-generation/models_class1_presentation/generate_hyperparameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..890b814bc070962fa4664eb9eab57db29a00ed3e
--- /dev/null
+++ b/downloads-generation/models_class1_presentation/generate_hyperparameters.py
@@ -0,0 +1,52 @@
+"""
+Generate a grid of hyperparameters.
+"""
+from __future__ import print_function
+from sys import stdout, stderr
+from copy import deepcopy
+from yaml import dump
+
+base_hyperparameters = dict(
+    convolutional_filters=64,
+    convolutional_kernel_size=8,
+    convolutional_kernel_l1_l2=(0.00, 0.0),
+    flanking_averages=True,
+    n_flank_length=15,
+    c_flank_length=15,
+    post_convolutional_dense_layer_sizes=[],
+    minibatch_size=512,
+    dropout_rate=0.5,
+    convolutional_activation="relu",
+    patience=20,
+    learning_rate=0.001)
+
+grid = []
+
+
+def hyperparameters_grid():
+    for learning_rate in [0.001]:
+        for convolutional_activation in ["tanh", "relu"]:
+            for convolutional_filters in [256, 512]:
+                for flanking_averages in [True]:
+                    for convolutional_kernel_size in [11, 13, 15, 17]:
+                        for l1 in [0.0, 1e-6]:
+                            for s in [[8], [16]]:
+                                for d in [0.3, 0.5]:
+                                    new = deepcopy(base_hyperparameters)
+                                    new["learning_rate"] = learning_rate
+                                    new["convolutional_activation"] = convolutional_activation
+                                    new["convolutional_filters"] = convolutional_filters
+                                    new["flanking_averages"] = flanking_averages
+                                    new["convolutional_kernel_size"] = convolutional_kernel_size
+                                    new["convolutional_kernel_l1_l2"] = (l1, 0.0)
+                                    new["post_convolutional_dense_layer_sizes"] = s
+                                    new["dropout_rate"] = d
+                                    yield new
+
+
+for new in hyperparameters_grid():
+    if new not in grid:
+        grid.append(new)
+
+print("Hyperparameters grid size: %d" % len(grid), file=stderr)
+dump(grid, stdout)
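Since the base settings are fixed and the nested loops form a full cross product, the grid size is just the product of the option counts. A quick check of that arithmetic (counts read off the loops above):

```python
# Option counts from the nested loops in generate_hyperparameters.py:
# learning_rate (1) x activation (2) x filters (2) x flanking (1)
# x kernel size (4) x l1 (2) x dense layer sizes (2) x dropout (2)
size = 1
for count in [1, 2, 2, 1, 4, 2, 2, 2]:
    size *= count
print(size)  # 128; every combination is distinct, so the `if new not in grid`
             # deduplication keeps all of them
```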
diff --git a/downloads-generation/models_class1_presentation/make_benchmark.py b/downloads-generation/models_class1_presentation/make_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..532474a6abf55142700415dcf3864d7620abe363
--- /dev/null
+++ b/downloads-generation/models_class1_presentation/make_benchmark.py
@@ -0,0 +1,177 @@
+"""
+Make presentation model training data by pairing each mass spec hit with
+decoy peptides sampled from the same proteins.
+"""
+import sys
+import argparse
+import os
+import numpy
+
+import pandas
+import tqdm
+
+import mhcflurry
+
+parser = argparse.ArgumentParser(usage=__doc__)
+
+parser.add_argument(
+    "--hits",
+    metavar="CSV",
+    required=True,
+    help="Multiallelic mass spec")
+parser.add_argument(
+    "--proteome-peptides",
+    metavar="CSV",
+    required=True,
+    help="Proteome peptides")
+parser.add_argument(
+    "--decoys-per-hit",
+    type=float,
+    metavar="N",
+    default=99,
+    help="Decoys per hit")
+parser.add_argument(
+    "--exclude-pmid",
+    nargs="+",
+    default=[],
+    help="Exclude given PMID")
+parser.add_argument(
+    "--only-pmid",
+    nargs="+",
+    default=[],
+    help="Include only the given PMID")
+parser.add_argument(
+    "--only-format",
+    choices=("MONOALLELIC", "MULTIALLELIC"),
+    help="Include only data of the given format")
+parser.add_argument(
+    "--out",
+    metavar="CSV",
+    required=True,
+    help="File to write")
+
+
+def run():
+    args = parser.parse_args(sys.argv[1:])
+    hit_df = pandas.read_csv(args.hits)
+    numpy.testing.assert_equal(hit_df.hit_id.nunique(), len(hit_df))
+    hit_df = hit_df.loc[
+        (hit_df.mhc_class == "I") &
+        (hit_df.peptide.str.len() <= 11) &
+        (hit_df.peptide.str.len() >= 8) &
+        (~hit_df.protein_ensembl.isnull()) &
+        (hit_df.peptide.str.match("^[%s]+$" % "".join(
+            mhcflurry.amino_acid.COMMON_AMINO_ACIDS)))
+    ]
+    print("Loaded hits from %d samples" % hit_df.sample_id.nunique())
+    if args.only_format:
+        hit_df = hit_df.loc[hit_df.format == args.only_format].copy()
+        print("Subselected to %d %s samples" % (
+            hit_df.sample_id.nunique(), args.only_format))
+
+    hit_df["pmid"] = hit_df["pmid"].astype(str)
+
+    if args.only_pmid or args.exclude_pmid:
+        assert not (args.only_pmid and args.exclude_pmid)
+
+        pmids = list(args.only_pmid) + list(args.exclude_pmid)
+        hit_pmids = hit_df.pmid.unique()
+        missing = [pmid for pmid in pmids if pmid not in hit_pmids]
+        assert not missing, missing
+
+        mask = hit_df.pmid.isin(pmids)
+        if args.exclude_pmid:
+            mask = ~mask
+
+        new_hit_df = hit_df.loc[mask]
+        print(
+            "Selecting by pmids",
+            pmids,
+            "reduced dataset from",
+            len(hit_df),
+            "to",
+            len(new_hit_df))
+        hit_df = new_hit_df.copy()
+        print("Subselected by pmid to %d samples" % hit_df.sample_id.nunique())
+
+    sample_table = hit_df.drop_duplicates("sample_id").set_index("sample_id")
+    grouped = hit_df.groupby("sample_id").nunique()
+    for col in sample_table.columns:
+        if (grouped[col] > 1).any():
+            del sample_table[col]
+
+    print("Loading proteome peptides")
+    all_peptides_df = pandas.read_csv(args.proteome_peptides)
+    print("Loaded: ", all_peptides_df.shape)
+
+    all_peptides_df = all_peptides_df.loc[
+        all_peptides_df.protein_accession.isin(hit_df.protein_accession.unique()) &
+        all_peptides_df.peptide.str.match("^[%s]+$" % "".join(
+            mhcflurry.amino_acid.COMMON_AMINO_ACIDS))
+    ].copy()
+    all_peptides_df["length"] = all_peptides_df.peptide.str.len()
+    print("Subselected proteome peptides by accession: ", all_peptides_df.shape)
+
+    all_peptides_by_length = dict(iter(all_peptides_df.groupby("length")))
+
+    columns_to_keep = [
+        "hit_id",
+        "protein_accession",
+        "n_flank",
+        "c_flank",
+        "peptide",
+        "sample_id",
+        "affinity_prediction",
+        "hit",
+    ]
+
+    print("Selecting decoys.")
+
+    lengths = [8, 9, 10, 11]
+    result_df = []
+
+    for sample_id, sub_df in tqdm.tqdm(
+            hit_df.loc[
+                (hit_df.peptide.str.len() <= 11) & (hit_df.peptide.str.len() >= 8)
+            ].groupby("sample_id"), total=hit_df.sample_id.nunique()):
+        result_df.append(
+            sub_df[[
+                "protein_accession",
+                "peptide",
+                "sample_id",
+                "n_flank",
+                "c_flank",
+            ]].copy())
+        result_df[-1]["hit"] = 1
+        for length in lengths:
+            universe = all_peptides_by_length[length]
+            result_df.append(universe.loc[
+                (~universe.peptide.isin(sub_df.peptide.unique())) & (
+                    universe.protein_accession.isin(
+                        sub_df.protein_accession.unique()))].sample(
+                n=int(len(sub_df) * args.decoys_per_hit / len(lengths)))[
+                ["protein_accession", "peptide", "n_flank", "c_flank"]])
+            result_df[-1]["hit"] = 0
+            result_df[-1]["sample_id"] = sample_id
+
+    result_df = pandas.concat(result_df, ignore_index=True, sort=False)
+    result_df["hla"] = result_df.sample_id.map(sample_table.hla)
+
+    print(result_df)
+    print("Counts:")
+    print(result_df.groupby(["sample_id", "hit"]).peptide.nunique())
+
+    print("Hit counts:")
+    print(
+        result_df.loc[
+            result_df.hit == 1
+        ].groupby("sample_id").hit.count().sort_values())
+
+    print("Hit rates:")
+    print(result_df.groupby("sample_id").hit.mean().sort_values())
+
+    result_df.to_csv(args.out, index=False)
+    print("Wrote: ", args.out)
+
+
+if __name__ == '__main__':
+    run()
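One detail of the decoy selection worth spelling out: decoys are drawn per sample and split evenly across the four lengths, so each length contributes int(len(sub_df) * decoys_per_hit / len(lengths)) peptides. A back-of-envelope check with invented numbers:

```python
# Invented numbers: a sample with 1000 hits and the default --decoys-per-hit 99.
hits = 1000
decoys_per_hit = 99
lengths = [8, 9, 10, 11]

decoys = sum(int(hits * decoys_per_hit / len(lengths)) for _ in lengths)
print(decoys)                  # 99000 (24750 per length)
print(hits / (hits + decoys))  # 0.01 hit rate, i.e. 1 hit per 99 decoys
```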
diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py
index 525ce6a1c21feaab8ede3099a8764c1ab11623c6..d15d818024984274c40477b16a5c5854fc277e3d 100644
--- a/mhcflurry/__init__.py
+++ b/mhcflurry/__init__.py
@@ -4,8 +4,8 @@ Class I MHC ligand prediction package
 from .class1_affinity_predictor import Class1AffinityPredictor
 from .class1_neural_network import Class1NeuralNetwork
+from .class1_cleavage_predictor import Class1CleavagePredictor
 from .class1_presentation_predictor import Class1PresentationPredictor
-from .class1_presentation_neural_network import Class1PresentationNeuralNetwork
 from .version import __version__
@@ -13,6 +13,6 @@ __all__ = [
     "__version__",
     "Class1AffinityPredictor",
     "Class1NeuralNetwork",
+    "Class1CleavagePredictor",
     "Class1PresentationPredictor",
-    "Class1PresentationNeuralNetwork",
 ]
diff --git a/mhcflurry/allele_encoding.py b/mhcflurry/allele_encoding.py
index 4feb41a82c9be97d619c9950613cfa1722cf75f8..32a4d0369125c36aa09416c5358fc7f8106847b3 100644
--- a/mhcflurry/allele_encoding.py
+++ b/mhcflurry/allele_encoding.py
@@ -1,8 +1,5 @@
-import numpy
 import pandas
 
-from copy import copy
-
 from . 
import amino_acid @@ -145,88 +142,3 @@ class AlleleEncoding(object): result = vector_encoded[self.indices] self.encoding_cache[cache_key] = result return self.encoding_cache[cache_key] - - -class MultipleAlleleEncoding(object): - def __init__( - self, - experiment_names=[], - experiment_to_allele_list={}, - max_alleles_per_experiment=6, - allele_to_sequence=None, - borrow_from=None): - - padded_experiment_to_allele_list = {} - for (name, alleles) in experiment_to_allele_list.items(): - assert len(alleles) > 0 - assert len(alleles) <= max_alleles_per_experiment - alleles_with_mask = alleles + [None] * ( - max_alleles_per_experiment - len(alleles)) - padded_experiment_to_allele_list[name] = alleles_with_mask - - flattened_allele_list = [] - for name in experiment_names: - flattened_allele_list.extend(padded_experiment_to_allele_list[name]) - - self.allele_encoding = AlleleEncoding( - alleles=flattened_allele_list, - allele_to_sequence=allele_to_sequence, - borrow_from=borrow_from - ) - self.max_alleles_per_experiment = max_alleles_per_experiment - self.experiment_names = numpy.array(experiment_names) - - def append_alleles(self, alleles): - extended_alleles = list(self.allele_encoding.alleles) - for allele in alleles: - extended_alleles.append(allele) - extended_alleles.extend( - [None] * (self.max_alleles_per_experiment - 1)) - - assert len(extended_alleles) % self.max_alleles_per_experiment == 0, ( - len(extended_alleles)) - - self.allele_encoding = AlleleEncoding( - alleles=extended_alleles, - borrow_from=self.allele_encoding) - - self.experiment_names = numpy.concatenate([ - self.experiment_names, - numpy.tile(None, len(alleles)) - ]) - - @property - def indices(self): - return self.allele_encoding.indices.values.reshape( - (-1, self.max_alleles_per_experiment)) - - @property - def alleles(self): - return numpy.reshape( - self.allele_encoding.alleles.values, - (-1, self.max_alleles_per_experiment)) - - def compact(self): - result = copy(self) - result.allele_encoding = self.allele_encoding.compact() - return result - - def allele_representations(self, encoding_name): - return self.allele_encoding.allele_representations(encoding_name) - - @property - def allele_to_sequence(self): - return self.allele_encoding.allele_to_sequence - - def fixed_length_vector_encoded_sequences(self, encoding_name): - raise NotImplementedError() - - def shuffle_in_place(self, shuffle_permutation=None): - alleles_matrix = self.alleles - if shuffle_permutation is None: - shuffle_permutation = numpy.random.permutation(len(alleles_matrix)) - self.allele_encoding = AlleleEncoding( - alleles=alleles_matrix[shuffle_permutation].flatten(), - borrow_from=self.allele_encoding - ) - self.experiment_names = self.experiment_names[shuffle_permutation] \ No newline at end of file diff --git a/mhcflurry/class1_presentation_neural_network.py b/mhcflurry/class1_presentation_neural_network.py deleted file mode 100644 index 531fc0a0302bdcc145d4fc7b3d870c3f0db03fa0..0000000000000000000000000000000000000000 --- a/mhcflurry/class1_presentation_neural_network.py +++ /dev/null @@ -1,667 +0,0 @@ -from __future__ import print_function - -import time -import collections -from six import string_types - -import numpy -import pandas -import mhcnames -import hashlib -from copy import copy - -from .hyperparameters import HyperparameterDefaults -from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SIZE -from .class1_cleavage_neural_network import Class1CleavageNeuralNetwork -from .encodable_sequences import 
EncodableSequences -from .allele_encoding import MultipleAlleleEncoding, AlleleEncoding -from .auxiliary_input import AuxiliaryInputEncoder -from .flanking_encoding import FlankingEncoding - - -def combine_layer_kernel_initializer(shape, dtype=None): - import keras.backend as K - return K.constant([[1.0, 1e-3, 1e-4]], dtype=dtype, shape=shape) - -class Class1PresentationNeuralNetwork(object): - network_hyperparameter_defaults = HyperparameterDefaults( - max_alleles=6, - ) - """ - Hyperparameters (and their default values) that affect the neural network - architecture. - """ - - fit_hyperparameter_defaults = HyperparameterDefaults( - trainable_cleavage_predictor=False, - trainable_affinity_predictor=False, - max_epochs=500, - validation_split=0.1, - early_stopping=True, - minibatch_size=256, - ) - """ - Hyperparameters for neural network training. - """ - - early_stopping_hyperparameter_defaults = HyperparameterDefaults( - patience=20, - min_delta=0.0, - ) - """ - Hyperparameters for early stopping. - """ - - compile_hyperparameter_defaults = HyperparameterDefaults( - optimizer="adam", - learning_rate=None, - ) - """ - Optimizer hyperparameters. Any values supported by keras may be used. - """ - - auxiliary_input_hyperparameter_defaults = HyperparameterDefaults( - auxiliary_input_features=["gene"], - auxiliary_input_feature_parameters={}, - include_cleavage=True, - ) - """ - Allele feature hyperparameters. - """ - - hyperparameter_defaults = network_hyperparameter_defaults.extend( - fit_hyperparameter_defaults).extend( - early_stopping_hyperparameter_defaults).extend( - compile_hyperparameter_defaults).extend( - auxiliary_input_hyperparameter_defaults) - - def __init__(self, **hyperparameters): - self.hyperparameters = self.hyperparameter_defaults.with_defaults( - hyperparameters) - self.network = None - self.fit_info = [] - self.allele_representation_hash = None - self.affinity_model = None - self.cleavage_model = None - - def build(self, affinity_model, cleavage_model=None): - import keras.backend as K - import keras.models - - assert isinstance(affinity_model, Class1NeuralNetwork), affinity_model - affinity_model = copy(affinity_model) - - self.affinity_model = affinity_model - affinity_network = affinity_model.network() - - model_inputs = {} - - peptide_shape = tuple( - int(x) for x in K.int_shape(affinity_network.inputs[0])[1:]) - - model_inputs['allele_set'] = keras.layers.Input( - shape=(self.hyperparameters['max_alleles'],), name="allele_set") - model_inputs['peptide'] = keras.layers.Input( - shape=peptide_shape, - dtype='float32', - name='peptide') - - peptides_flattened = keras.layers.Flatten()(model_inputs['peptide']) - peptides_repeated = keras.layers.RepeatVector( - self.hyperparameters['max_alleles'])( - peptides_flattened) - - allele_representation = keras.layers.Embedding( - name="allele_representation", - input_dim=64, # arbitrary, how many alleles to have room for - output_dim=affinity_network.get_layer( - "allele_representation").output_shape[-1], - input_length=self.hyperparameters['max_alleles'], - trainable=False, - mask_zero=False)(model_inputs['allele_set']) - - allele_flat = allele_representation - - allele_peptide_merged = keras.layers.concatenate( - [peptides_repeated, allele_flat], name="allele_peptide_merged") - - layer_names = [ - layer.name for layer in affinity_network.layers - ] - - pan_allele_layer_initial_names = [ - 'allele', 'peptide', - 'allele_representation', 'flattened_0', 'allele_flat', - 'allele_peptide_merged', 'dense_0', 'dropout_0', - ] - - def 
startswith(lst, prefix): - return lst[:len(prefix)] == prefix - - assert startswith( - layer_names, pan_allele_layer_initial_names), layer_names - - layers = affinity_network.layers[ - pan_allele_layer_initial_names.index( - "allele_peptide_merged") + 1: - ] - node = allele_peptide_merged - affinity_predictor_layer_name_to_new_node = { - "allele_peptide_merged": allele_peptide_merged, - } - - for layer in layers: - assert layer.name not in affinity_predictor_layer_name_to_new_node - input_layer_names = [] - for inbound_node in layer._inbound_nodes: - for inbound_layer in inbound_node.inbound_layers: - input_layer_names.append(inbound_layer.name) - input_nodes = [ - affinity_predictor_layer_name_to_new_node[name] - for name in input_layer_names - ] - - if len(input_nodes) == 1: - lifted = keras.layers.TimeDistributed(layer, name=layer.name) - node = lifted(input_nodes[0]) - else: - node = layer(input_nodes) - affinity_predictor_layer_name_to_new_node[layer.name] = node - - def logit(x): - import tensorflow as tf - return -tf.log(1. / x - 1.) - - node = keras.layers.Lambda(logit, name="logit")(node) - affinity_prediction_and_other_signals = [node] - if self.hyperparameters['include_cleavage']: - assert isinstance(cleavage_model, Class1CleavageNeuralNetwork) - cleavage_model = copy(cleavage_model) - self.cleavage_model = cleavage_model - cleavage_network = cleavage_model.network() - - model_inputs['sequence'] = keras.layers.Input( - shape=cleavage_network.get_layer("sequence").output_shape[1:], - dtype='float32', - name='sequence') - model_inputs['peptide_length'] = keras.layers.Input( - shape=(1,), - dtype='int32', - name='peptide_length') - cleavage_network.name = "cleavage_predictor" - cleavage_prediction = cleavage_network([ - model_inputs['peptide_length'], - model_inputs['sequence'], - ]) - cleavage_prediction.trainable = False - cleavage_prediction_repeated = keras.layers.RepeatVector( - self.hyperparameters['max_alleles'])(cleavage_prediction) - affinity_prediction_and_other_signals.append( - cleavage_prediction_repeated) - - if self.hyperparameters['auxiliary_input_features']: - model_inputs['auxiliary'] = keras.layers.Input( - shape=( - self.hyperparameters['max_alleles'], - len( - AuxiliaryInputEncoder.get_columns( - self.hyperparameters['auxiliary_input_features'], - feature_parameters=self.hyperparameters[ - 'auxiliary_input_feature_parameters']))), - dtype="float32", - name="auxiliary") - affinity_prediction_and_other_signals.append( - model_inputs['auxiliary']) - - if len(affinity_prediction_and_other_signals) > 1: - node = keras.layers.concatenate( - affinity_prediction_and_other_signals, - name="affinity_prediction_and_other_signals") - - def show(x): - import tensorflow as tf - return x - #return tf.Print(x, [x], summarize=100, message="combine_input") - - node = keras.layers.Lambda(show)(node) - - layer = keras.layers.Dense( - 1, - activation="sigmoid", - kernel_initializer=combine_layer_kernel_initializer, - name="combine") - lifted = keras.layers.TimeDistributed(layer, name="per_allele_output") - node = lifted(node) - else: - (node,) = affinity_prediction_and_other_signals - - # Apply allele mask: zero out all outputs corresponding to alleles - # with the special index 0. 
- def alleles_to_mask(x): - import keras.backend as K - result = K.expand_dims( - K.cast(K.not_equal(x, 0), "float32"), axis=-1) - return result - - allele_mask = keras.layers.Lambda( - alleles_to_mask, name="allele_mask")(model_inputs['allele_set']) - - node = keras.layers.Multiply( - name="masked_per_allele_outputs")( - [allele_mask, node]) - - presentation_output = keras.layers.Reshape( - target_shape=(self.hyperparameters['max_alleles'],))( - node) - - self.network = keras.models.Model( - inputs=list(model_inputs.values()), - outputs=presentation_output, - name="presentation", - ) - - if not self.hyperparameters['trainable_cleavage_predictor']: - if self.hyperparameters['include_cleavage']: - self.network.get_layer("cleavage_predictor").trainable = False - - self.affinity_predictor_layer_names = list( - affinity_predictor_layer_name_to_new_node) - - self.set_trainable( - trainable_affinity_predictor=( - self.hyperparameters['trainable_affinity_predictor'])) - - def set_trainable(self, trainable_affinity_predictor=None): - if trainable_affinity_predictor is not None: - for name in self.affinity_predictor_layer_names: - self.network.get_layer(name).trainable = trainable_affinity_predictor - - - @staticmethod - def loss(y_true, y_pred): - # Binary cross entropy. - # We take the weighted average of the per-allele predictions, where the - # weighting is the softmax of the predictions. - from keras import backend as K - import tensorflow as tf - - y_pred = K.constant(y_pred) if not K.is_tensor(y_pred) else y_pred - y_true = K.cast(y_true, y_pred.dtype) - - logit_y_pred = -tf.log(1. / y_pred - 1.) - #logit_y_pred = tf.Print(logit_y_pred, [logit_y_pred], message="logit_y_pred", summarize=50) - - softmax = K.softmax(5 * logit_y_pred, axis=-1) - #softmax = tf.Print(softmax, [softmax], message="softmax", summarize=50) - - product = softmax * y_pred - #product = tf.Print(product, [product], message="product", summarize=50) - - result = tf.reduce_sum(product, axis=-1) - #result = tf.Print(result, [result], message="result", summarize=50) - - # Alternative to all of the above: - #result = tf.reduce_max(y_pred, axis=-1) - - return K.mean( - K.binary_crossentropy(y_true, result), - axis=-1) - - def network_input( - self, peptides, allele_encoding, flanking_encoding=None): - """ - - Parameters - ---------- - peptides : EncodableSequences or list of string - - allele_encoding : AlleleEncoding - - flanking_encoding: Flank - - - Returns - ------- - numpy.array - """ - assert self.affinity_model is not None - - (allele_input, allele_representations) = ( - self.affinity_model.allele_encoding_to_network_input( - allele_encoding)) - peptides = EncodableSequences.create(peptides) - x_dict = { - 'peptide': self.affinity_model.peptides_to_network_input(peptides), - 'allele_set': allele_input, - } - if self.hyperparameters['include_cleavage']: - assert self.cleavage_model is not None - numpy.testing.assert_array_equal( - peptides.sequences, - flanking_encoding.dataframe.peptide.values) - if flanking_encoding is None: - raise RuntimeError("flanking_encoding required") - cleavage_x_dict = self.cleavage_model.network_input( - flanking_encoding) - x_dict.update(cleavage_x_dict) - if self.hyperparameters['auxiliary_input_features']: - auxiliary_encoder = AuxiliaryInputEncoder( - alleles=allele_encoding.alleles, - peptides=peptides.sequences) - x_dict[ - 'auxiliary' - ] = auxiliary_encoder.get_array( - features=self.hyperparameters['auxiliary_input_features'], - feature_parameters=self.hyperparameters[ - 
'auxiliary_input_feature_parameters']) - return (x_dict, allele_representations) - - def fit( - self, - targets, - peptides, - allele_encoding, - flanking_encoding=None, - sample_weights=None, - shuffle_permutation=None, - verbose=1, - progress_callback=None, - progress_preamble="", - progress_print_interval=5.0): - - import keras.backend as K - - assert isinstance(allele_encoding, MultipleAlleleEncoding) - assert ( - allele_encoding.max_alleles_per_experiment == - self.hyperparameters['max_alleles']) - - (x_dict, allele_representations) = ( - self.network_input( - peptides=peptides, - allele_encoding=allele_encoding, - flanking_encoding=flanking_encoding)) - - # Shuffle - if shuffle_permutation is None: - shuffle_permutation = numpy.random.permutation(len(targets)) - targets = numpy.array(targets)[shuffle_permutation] - assert numpy.isnan(targets).sum() == 0, targets - if sample_weights is not None: - sample_weights = numpy.array(sample_weights)[shuffle_permutation] - for key in list(x_dict): - x_dict[key] = x_dict[key][shuffle_permutation] - del peptides - del allele_encoding - del flanking_encoding - - fit_info = collections.defaultdict(list) - - allele_representations_hash = self.set_allele_representations( - allele_representations) - - self.network.compile( - loss=self.loss, - optimizer=self.hyperparameters['optimizer']) - if self.hyperparameters['learning_rate'] is not None: - K.set_value( - self.network.optimizer.lr, - self.hyperparameters['learning_rate']) - fit_info["learning_rate"] = float(K.get_value(self.network.optimizer.lr)) - - if verbose: - self.network.summary() - - training_start = time.time() - - min_val_loss_iteration = None - min_val_loss = None - last_progress_print = 0 - for i in range(self.hyperparameters['max_epochs']): - epoch_start = time.time() - self.assert_allele_representations_hash(allele_representations_hash) - fit_history = self.network.fit( - x_dict, - targets, - validation_split=self.hyperparameters['validation_split'], - batch_size=self.hyperparameters['minibatch_size'], - epochs=i + 1, - sample_weight=sample_weights, - initial_epoch=i, - verbose=verbose) - epoch_time = time.time() - epoch_start - - for (key, value) in fit_history.history.items(): - fit_info[key].extend(value) - - if numpy.isnan(fit_info['loss'][-1]): - raise ValueError("NaN loss") - - # Print progress no more often than once every few seconds. - if progress_print_interval is not None and ( - not last_progress_print or ( - time.time() - last_progress_print - > progress_print_interval)): - print((progress_preamble + " " + - "Epoch %3d / %3d [%0.2f sec]: loss=%g val_loss=%g. " - "Min val loss (%s) at epoch %s" % ( - i, - self.hyperparameters['max_epochs'], - epoch_time, - fit_info['loss'][-1], - ( - fit_info['val_loss'][-1] - if 'val_loss' in fit_info else numpy.nan - ), - str(min_val_loss), - min_val_loss_iteration)).strip()) - last_progress_print = time.time() - - if self.hyperparameters['validation_split']: - val_loss = fit_info['val_loss'][-1] - - if min_val_loss is None or ( - val_loss < min_val_loss - self.hyperparameters['min_delta']): - min_val_loss = val_loss - min_val_loss_iteration = i - - if self.hyperparameters['early_stopping']: - threshold = ( - min_val_loss_iteration + - self.hyperparameters['patience']) - if i > threshold: - if progress_print_interval is not None: - print((progress_preamble + " " + - "Stopping at epoch %3d / %3d: loss=%g. 
" - "Min val loss (%g) at epoch %s" % ( - i, - self.hyperparameters['max_epochs'], - fit_info['loss'][-1], - ( - min_val_loss if min_val_loss is not None - else numpy.nan), - min_val_loss_iteration)).strip()) - break - - if progress_callback: - progress_callback() - - fit_info["time"] = time.time() - training_start - fit_info["num_points"] = len(targets) - self.fit_info.append(dict(fit_info)) - - def predict( - self, - peptides, - allele_encoding, - flanking_encoding=None, - batch_size=DEFAULT_PREDICT_BATCH_SIZE): - - peptides = EncodableSequences.create(peptides) - assert isinstance(allele_encoding, MultipleAlleleEncoding) - - (x_dict, allele_representations) = self.network_input( - peptides=peptides, - allele_encoding=allele_encoding, - flanking_encoding=flanking_encoding) - - self.set_allele_representations(allele_representations) - raw_predictions = self.network.predict(x_dict, batch_size=batch_size) - return raw_predictions - - def clear_allele_representations(self): - """ - Set allele representations to an empty array. Useful before saving to - save a smaller version of the model. - """ - layer = self.network.get_layer("allele_representation") - existing_weights_shape = (layer.input_dim, layer.output_dim) - self.set_allele_representations( - numpy.zeros(shape=(0,) + existing_weights_shape[1:]), - force_surgery=True) - - def set_allele_representations(self, allele_representations, force_surgery=False): - """ - """ - from keras.models import clone_model - import keras.backend as K - import tensorflow as tf - - reshaped = allele_representations.reshape(( - allele_representations.shape[0], - numpy.product(allele_representations.shape[1:]))) - original_model = self.network - - layer = original_model.get_layer("allele_representation") - existing_weights_shape = (layer.input_dim, layer.output_dim) - - # Only changes to the number of supported alleles (not the length of - # the allele sequences) are allowed. - assert existing_weights_shape[1:] == reshaped.shape[1:] - - if existing_weights_shape[0] > reshaped.shape[0] and not force_surgery: - # Extend with NaNs so we can avoid having to reshape the weights - # matrix, which is expensive. - reshaped = numpy.append( - reshaped, - numpy.ones([ - existing_weights_shape[0] - reshaped.shape[0], - reshaped.shape[1] - ]) * numpy.nan, - axis=0) - - if existing_weights_shape != reshaped.shape: - print( - "Performing network surgery", existing_weights_shape, reshaped.shape) - # Network surgery required. Make a new network with this layer's - # dimensions changed. Kind of a hack. - layer.input_dim = reshaped.shape[0] - new_model = clone_model(original_model) - - # copy weights for other layers over - for layer in new_model.layers: - if layer.name != "allele_representation": - layer.set_weights( - original_model.get_layer(name=layer.name).get_weights()) - - self.network = new_model - - layer = new_model.get_layer("allele_representation") - - # Disable the old model to catch bugs. - def throw(*args, **kwargs): - raise RuntimeError("Using a disabled model!") - original_model.predict = \ - original_model.fit = \ - original_model.fit_generator = throw - - layer.set_weights([reshaped]) - self.allele_representation_hash = hashlib.sha1( - allele_representations.tobytes()).hexdigest() - return self.allele_representation_hash - - def assert_allele_representations_hash(self, value): - numpy.testing.assert_equal(self.allele_representation_hash, value) - - def __getstate__(self): - """ - serialize to a dict. Model weights are included. For pickle support. 
-
-        Returns
-        -------
-        dict
-
-        """
-        result = self.get_config()
-        result['network_weights'] = self.get_weights()
-        return result
-
-    def __setstate__(self, state):
-        """
-        Deserialize. For pickle support.
-        """
-        network_json = state.pop("network_json")
-        network_weights = state.pop("network_weights")
-        self.__dict__.update(state)
-        if network_json is not None:
-            import keras.models
-            self.network = keras.models.model_from_json(network_json)
-            if network_weights is not None:
-                self.network.set_weights(network_weights)
-
-    def get_weights(self):
-        """
-        Get the network weights
-
-        Returns
-        -------
-        list of numpy.array giving weights for each layer or None if there is no
-        network
-        """
-        if self.network is None:
-            return None
-        return self.network.get_weights()
-
-    def get_config(self):
-        """
-        serialize to a dict all attributes except model weights
-
-        Returns
-        -------
-        dict
-        """
-        result = dict(self.__dict__)
-        del result['network']
-        result['network_json'] = None
-        if self.network:
-            result['network_json'] = self.network.to_json()
-        return result
-
-    @classmethod
-    def from_config(cls, config, weights=None):
-        """
-        deserialize from a dict returned by get_config().
-
-        Parameters
-        ----------
-        config : dict
-        weights : list of array, optional
-            Network weights to restore
-        weights_loader : callable, optional
-            Function to call (no arguments) to load weights when needed
-
-        Returns
-        -------
-        Class1NeuralNetwork
-        """
-        config = dict(config)
-        instance = cls(**config.pop('hyperparameters'))
-        network_json = config.pop('network_json')
-        instance.__dict__.update(config)
-        assert instance.network is None
-        if network_json is not None:
-            import keras.models
-            instance.network = keras.models.model_from_json(network_json)
-            if weights is not None:
-                instance.network.set_weights(weights)
-        return instance
diff --git a/mhcflurry/class1_presentation_predictor.py b/mhcflurry/class1_presentation_predictor.py
index fcea50530d0b5b83f8ad4679abc74d99607dbfd1..e1181a155134fe69a62513b4e49bca0cf63723eb 100644
--- a/mhcflurry/class1_presentation_predictor.py
+++ b/mhcflurry/class1_presentation_predictor.py
@@ -12,133 +12,226 @@ import hashlib
 import logging
 
 from six import string_types
-
 import numpy
 import pandas
+import sklearn
+import sklearn.linear_model
 
 import mhcnames
 
+try:
+    import tqdm
+except ImportError:
+    tqdm = None
+
 from .version import __version__
-from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SIZE
+from .class1_affinity_predictor import Class1AffinityPredictor
+from .class1_cleavage_predictor import Class1CleavagePredictor
+from .class1_neural_network import DEFAULT_PREDICT_BATCH_SIZE
 from .encodable_sequences import EncodableSequences
 from .regression_target import from_ic50, to_ic50
-from .allele_encoding import MultipleAlleleEncoding
+from .multiple_allele_encoding import MultipleAlleleEncoding
 from .downloads import get_default_class1_presentation_models_dir
-from .class1_presentation_neural_network import Class1PresentationNeuralNetwork
-from .common import save_weights, load_weights, NumpyJSONEncoder
-from .flanking_encoding import FlankingEncoding
+from .common import load_weights
+
+
+MAX_ALLELES_PER_SAMPLE = 6
+PREDICT_BATCH_SIZE = DEFAULT_PREDICT_BATCH_SIZE
 
 
 class Class1PresentationPredictor(object):
+    model_inputs = ["affinity_score", "cleavage_prediction"]
+
     def __init__(
             self,
-            models,
-            allele_to_sequence,
-            manifest_df=None,
+            affinity_predictor=None,
+            cleavage_predictor_with_flanks=None,
+            cleavage_predictor_without_flanks=None,
+
weights_dataframe=None, metadata_dataframes=None): - self.models = models - self.allele_to_sequence = allele_to_sequence - self._manifest_df = manifest_df + + self.affinity_predictor = affinity_predictor + self.cleavage_predictor_with_flanks = cleavage_predictor_with_flanks + self.cleavage_predictor_without_flanks = cleavage_predictor_without_flanks + self.weights_dataframe = weights_dataframe self.metadata_dataframes = ( dict(metadata_dataframes) if metadata_dataframes else {}) + self._models_cache = {} + + def get_affinity_predictions( + self, peptides, experiment_names, alleles, verbose=1): + + df = pandas.DataFrame({ + "peptide": numpy.array(peptides, copy=False), + "experiment_name": numpy.array(experiment_names, copy=False), + }) + + iterator = df.groupby("experiment_name") + if verbose > 0: + print("Predicting affinities.") + if tqdm is not None: + iterator = tqdm.tqdm( + iterator, total=df.experiment_name.nunique()) + + for (experiment, sub_df) in iterator: + predictions_df = pandas.DataFrame(index=sub_df.index) + experiment_peptides = EncodableSequences.create(sub_df.peptide.values) + for allele in alleles[experiment]: + predictions_df[allele] = self.affinity_predictor.predict( + peptides=experiment_peptides, + allele=allele, + model_kwargs={'batch_size': PREDICT_BATCH_SIZE}) + df.loc[ + sub_df.index, "tightest_affinity" + ] = predictions_df.min(1).values + df.loc[ + sub_df.index, "tightest_affinity_allele" + ] = predictions_df.idxmin(1).values + + return df + + def get_cleavage_predictions( + self, peptides, n_flanks=None, c_flanks=None, verbose=1): + + if verbose > 0: + print("Predicting cleavage.") + + if (n_flanks is None) != (c_flanks is None): + raise ValueError("Specify both or neither of n_flanks, c_flanks") + + if n_flanks is None: + if self.cleavage_predictor_without_flanks is None: + raise ValueError("No cleavage predictor without flanks") + predictor = self.cleavage_predictor_without_flanks + n_flanks = [""] * len(peptides) + c_flanks = n_flanks + else: + if self.cleavage_predictor_with_flanks is None: + raise ValueError("No cleavage predictor with flanks") + predictor = self.cleavage_predictor_with_flanks + + result = predictor.predict( + peptides=peptides, + n_flanks=n_flanks, + c_flanks=c_flanks, + batch_size=PREDICT_BATCH_SIZE) - @property - def manifest_df(self): - """ - A pandas.DataFrame describing the models included in this predictor. 
- - Returns - ------- - pandas.DataFrame - """ - if self._manifest_df is None: - rows = [] - for (i, model) in enumerate(self.models): - model_config = model.get_config() - rows.append(( - self.model_name(i), - json.dumps(model_config, cls=NumpyJSONEncoder), - model - )) - self._manifest_df = pandas.DataFrame( - rows, - columns=["model_name", "config_json", "model"]) - return self._manifest_df - - @property - def max_alleles(self): - max_alleles = self.models[0].hyperparameters['max_alleles'] - assert all( - n.hyperparameters['max_alleles'] == max_alleles - for n in self.models) - return max_alleles - - @staticmethod - def model_name(num): - """ - Generate a model name - - Returns - ------- - string - - """ - random_string = hashlib.sha1( - str(time.time()).encode()).hexdigest()[:16] - return "LIGANDOME-CLASSI-%d-%s" % ( - num, - random_string) - - @staticmethod - def weights_path(models_dir, model_name): - """ - Generate the path to the weights file for a model + return result - Parameters - ---------- - models_dir : string - model_name : string + def fit( + self, + targets, + peptides, + experiment_names, + alleles, + n_flanks=None, + c_flanks=None, + verbose=1): - Returns - ------- - string - """ - return join(models_dir, "weights_%s.npz" % model_name) + df = self.get_affinity_predictions( + peptides=peptides, + experiment_names=experiment_names, + alleles=alleles, + verbose=verbose) + df["affinity_score"] = from_ic50(df.tightest_affinity) + df["target"] = numpy.array(targets, copy=False) + + if (n_flanks is None) != (c_flanks is None): + raise ValueError("Specify both or neither of n_flanks, c_flanks") + + with_flanks_list = [] + if self.cleavage_predictor_without_flanks is not None: + with_flanks_list.append(False) + + if n_flanks is not None and self.cleavage_predictor_with_flanks is not None: + with_flanks_list.append(True) + + if not with_flanks_list: + raise RuntimeError("Can't fit any models") + + if self.weights_dataframe is None: + self.weights_dataframe = pandas.DataFrame() + + for with_flanks in with_flanks_list: + model_name = 'with_flanks' if with_flanks else "without_flanks" + if verbose > 0: + print("Training variant", model_name) + + df["cleavage_prediction"] = self.get_cleavage_predictions( + peptides=df.peptide.values, + n_flanks=n_flanks if with_flanks else None, + c_flanks=c_flanks if with_flanks else None, + verbose=verbose) + + model = self.get_model() + if verbose > 0: + print("Fitting LR model.") + print(df) + + model.fit( + X=df[self.model_inputs].values, + y=df.target.astype(float)) + + self.weights_dataframe.loc[model_name, "intercept"] = model.intercept_ + for (name, value) in zip(self.model_inputs, numpy.squeeze(model.coef_)): + self.weights_dataframe.loc[model_name, name] = value + self._models_cache[model_name] = model + + def get_model(self, name=None): + if name is None or name not in self._models_cache: + model = sklearn.linear_model.LogisticRegression(solver="lbfgs") + if name is not None: + row = self.weights_dataframe.loc[name] + model.intercept_ = row.intercept + model.coef_ = numpy.expand_dims( + row[self.model_inputs].values, axis=0) + else: + model = self._models_cache[name] + return model + + def predict_sequences(self, alleles, sequences): + raise NotImplementedError def predict( self, peptides, alleles, + experiment_names=None, n_flanks=None, c_flanks=None, - batch_size=DEFAULT_PREDICT_BATCH_SIZE): + verbose=1): return self.predict_to_dataframe( peptides=peptides, alleles=alleles, + experiment_names=experiment_names, n_flanks=n_flanks, 
c_flanks=c_flanks, - batch_size=batch_size).score.values + verbose=verbose).score.values def predict_to_dataframe( self, peptides, alleles, + experiment_names=None, n_flanks=None, c_flanks=None, - flanking_encoding=None, - include_details=False, - batch_size=DEFAULT_PREDICT_BATCH_SIZE): + verbose=1): if isinstance(peptides, string_types): - raise TypeError("peptides must be a list or array, not a string") + raise TypeError("peptides must be a list not a string") if isinstance(alleles, string_types): - raise TypeError( - "alleles must be an iterable or MultipleAlleleEncoding") + raise TypeError("alleles must be a list or dict") - peptides = EncodableSequences.create(peptides) - - if not isinstance(alleles, MultipleAlleleEncoding): - if len(alleles) > self.max_alleles: + if isinstance(alleles, dict): + if experiment_names is None: + raise ValueError( + "experiment_names must be supplied when alleles is a dict") + else: + if experiment_names is not None: + raise ValueError( + "alleles must be a dict when experiment_names is specified") + alleles = numpy.array(alleles, copy=False) + if len(alleles) > MAX_ALLELES_PER_SAMPLE: raise ValueError( "When alleles is a list, it must have at most %d elements. " "These alleles are taken to be a genotype for an " @@ -146,151 +239,79 @@ class Class1PresentationPredictor(object): "will be taken for each peptide. Note that this differs " "from Class1AffinityPredictor.predict(), where alleles " "is expected to be the same length as peptides." - % ( - self.max_alleles)) - alleles = MultipleAlleleEncoding( - experiment_names=numpy.tile("experiment", len(peptides)), - experiment_to_allele_list={ - "experiment": [ - mhcnames.normalize_allele_name(a) for a in alleles - ], - }, - allele_to_sequence=self.allele_to_sequence, - max_alleles_per_experiment=self.max_alleles) - - if n_flanks is not None: - if flanking_encoding is not None: - raise ValueError( - "Specify either n_flanks/c_flanks or flanking_encoding, not" - "both.") - if c_flanks is None: - raise ValueError("Both flanks required") - flanking_encoding = FlankingEncoding( - peptides=peptides.sequences, - n_flanks=n_flanks, - c_flanks=c_flanks) - - score_array = [] - - for (i, network) in enumerate(self.models): - predictions = network.predict( - peptides=peptides, - allele_encoding=alleles, - flanking_encoding=flanking_encoding, - batch_size=batch_size) - score_array.append(predictions) - - score_array = numpy.array(score_array) - - ensemble_scores = numpy.mean(score_array, axis=0) - top_allele_index = numpy.argmax(ensemble_scores, axis=-1) - top_allele_flat_indices = ( - numpy.arange(len(peptides)) * self.max_alleles + top_allele_index) - top_score = ensemble_scores.flatten()[top_allele_flat_indices] - result_df = pandas.DataFrame({"peptide": peptides.sequences}) - result_df["allele"] = alleles.alleles.flatten()[top_allele_flat_indices] - result_df["score"] = top_score - - if include_details: - for i in range(self.max_alleles): - result_df["allele%d" % (i + 1)] = alleles.allele[:, i] - result_df["allele%d score" % (i + 1)] = ensemble_scores[:, i] - result_df["allele%d score low" % (i + 1)] = numpy.percentile( - score_array[:, :, i], 5.0, axis=0) - result_df["allele%d score high" % (i + 1)] = numpy.percentile( - score_array[:, :, i], 95.0, axis=0) - return result_df - - def check_consistency(self): - """ - Verify that self.manifest_df is consistent with instance variables. + % MAX_ALLELES_PER_SAMPLE) - Currently only checks for agreement on the total number of models. 
+ experiment_names = ["experiment1"] * len(peptides) + alleles = { + "experiment1": alleles, + } - Throws AssertionError if inconsistent. - """ - assert len(self.manifest_df) == len(self.models), ( - "Manifest seems out of sync with models: %d vs %d entries: \n%s"% ( - len(self.manifest_df), - len(self.models), - str(self.manifest_df))) + df = self.get_affinity_predictions( + peptides=peptides, + experiment_names=experiment_names, + alleles=alleles, + verbose=verbose) + df["affinity_score"] = from_ic50(df.tightest_affinity) + + if (n_flanks is None) != (c_flanks is None): + raise ValueError("Specify both or neither of n_flanks, c_flanks") - def save(self, models_dir, model_names_to_write=None, write_metadata=True): + df["cleavage_prediction"] = self.get_cleavage_predictions( + peptides=df.peptide.values, + n_flanks=n_flanks, + c_flanks=c_flanks, + verbose=verbose) + + model_name = 'with_flanks' if n_flanks is not None else "without_flanks" + model = self.get_model(model_name) + df["score"] = model.predict_proba(df[self.model_inputs].values)[:,1] + return df + + def save(self, models_dir): """ Serialize the predictor to a directory on disk. If the directory does not exist it will be created. - The serialization format consists of a file called "manifest.csv" with - the configurations of each Class1PresentationNeuralNetwork, along with - per-network files giving the model weights. - Parameters ---------- models_dir : string Path to directory. It will be created if it doesn't exist. """ - self.check_consistency() - if model_names_to_write is None: - # Write all models - model_names_to_write = self.manifest_df.model_name.values + if self.weights_dataframe is None: + raise RuntimeError("Can't save before fitting") if not exists(models_dir): mkdir(models_dir) - sub_manifest_df = self.manifest_df.loc[ - self.manifest_df.model_name.isin(model_names_to_write) - ].copy() - - # Network JSON configs may have changed since the models were added, - # for example due to changes to the allele representation layer. - # So we update the JSON configs here also. 
- updated_network_config_jsons = [] - for (_, row) in sub_manifest_df.iterrows(): - updated_network_config_jsons.append( - json.dumps(row.model.get_config(), cls=NumpyJSONEncoder)) - weights_path = self.weights_path(models_dir, row.model_name) - save_weights(row.model.get_weights(), weights_path) - logging.info("Wrote: %s", weights_path) - sub_manifest_df["config_json"] = updated_network_config_jsons - self.manifest_df.loc[ - sub_manifest_df.index, - "config_json" - ] = updated_network_config_jsons - - write_manifest_df = self.manifest_df[[ - c for c in self.manifest_df.columns if c != "model" - ]] - manifest_path = join(models_dir, "manifest.csv") - write_manifest_df.to_csv(manifest_path, index=False) - logging.info("Wrote: %s", manifest_path) - - if write_metadata: - # Write "info.txt" - info_path = join(models_dir, "info.txt") - rows = [ - ("trained on", time.asctime()), - ("package ", "mhcflurry %s" % __version__), - ("hostname ", gethostname()), - ("user ", getuser()), - ] - pandas.DataFrame(rows).to_csv( - info_path, sep="\t", header=False, index=False) - - if self.metadata_dataframes: - for (name, df) in self.metadata_dataframes.items(): - metadata_df_path = join(models_dir, "%s.csv.bz2" % name) - df.to_csv(metadata_df_path, index=False, compression="bz2") - - # Save allele sequences - if self.allele_to_sequence is not None: - allele_to_sequence_df = pandas.DataFrame( - list(self.allele_to_sequence.items()), - columns=['allele', 'sequence'] - ) - allele_to_sequence_df.to_csv( - join(models_dir, "allele_sequences.csv"), index=False) - logging.info("Wrote: %s", join(models_dir, "allele_sequences.csv")) + # Save underlying predictors + self.affinity_predictor.save(join(models_dir, "affinity_predictor")) + if self.cleavage_predictor_with_flanks is not None: + self.cleavage_predictor_with_flanks.save( + join(models_dir, "cleavage_predictor_with_flanks")) + if self.cleavage_predictor_without_flanks is not None: + self.cleavage_predictor_without_flanks.save( + join(models_dir, "cleavage_predictor_without_flanks")) + + # Save model coefficients. + self.weights_dataframe.to_csv(join(models_dir, "weights.csv")) + + # Write "info.txt" + info_path = join(models_dir, "info.txt") + rows = [ + ("trained on", time.asctime()), + ("package ", "mhcflurry %s" % __version__), + ("hostname ", gethostname()), + ("user ", getuser()), + ] + pandas.DataFrame(rows).to_csv( + info_path, sep="\t", header=False, index=False) + + if self.metadata_dataframes: + for (name, df) in self.metadata_dataframes.items(): + metadata_df_path = join(models_dir, "%s.csv.bz2" % name) + df.to_csv(metadata_df_path, index=False, compression="bz2") + @classmethod def load(cls, models_dir=None, max_models=None): @@ -304,7 +325,8 @@ class Class1PresentationPredictor(object): used. 
        max_models : int, optional
-            Maximum number of models to load
+            Maximum number of affinity and cleavage models (counted
+            separately) to load
 
         Returns
         -------
@@ -313,30 +335,28 @@
         if models_dir is None:
             models_dir = get_default_class1_presentation_models_dir()
 
-        manifest_path = join(models_dir, "manifest.csv")
-        manifest_df = pandas.read_csv(manifest_path, nrows=max_models)
+        affinity_predictor = Class1AffinityPredictor.load(
+            join(models_dir, "affinity_predictor"), max_models=max_models)
 
-        models = []
-        for (_, row) in manifest_df.iterrows():
-            weights_filename = cls.weights_path(models_dir, row.model_name)
-            config = json.loads(row.config_json)
-            model = Class1PresentationNeuralNetwork.from_config(
-                config,
-                weights=load_weights(abspath(weights_filename)))
-            models.append(model)
+        cleavage_predictor_with_flanks = None
+        if exists(join(models_dir, "cleavage_predictor_with_flanks")):
+            cleavage_predictor_with_flanks = Class1CleavagePredictor.load(
+                join(models_dir, "cleavage_predictor_with_flanks"),
+                max_models=max_models)
 
-        manifest_df["model"] = models
+        cleavage_predictor_without_flanks = None
+        if exists(join(models_dir, "cleavage_predictor_without_flanks")):
+            cleavage_predictor_without_flanks = Class1CleavagePredictor.load(
+                join(models_dir, "cleavage_predictor_without_flanks"),
+                max_models=max_models)
 
-        # Load allele sequences
-        allele_to_sequence = None
-        if exists(join(models_dir, "allele_sequences.csv")):
-            allele_to_sequence = pandas.read_csv(
-                join(models_dir, "allele_sequences.csv"),
-                index_col=0).iloc[:, 0].to_dict()
+        weights_dataframe = pandas.read_csv(
+            join(models_dir, "weights.csv"),
+            index_col=0)
 
-        logging.info("Loaded %d class1 presentation models", len(models))
         result = cls(
-            models=models,
-            allele_to_sequence=allele_to_sequence,
-            manifest_df=manifest_df)
+            affinity_predictor=affinity_predictor,
+            cleavage_predictor_with_flanks=cleavage_predictor_with_flanks,
+            cleavage_predictor_without_flanks=cleavage_predictor_without_flanks,
+            weights_dataframe=weights_dataframe)
         return result
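End-to-end, the refactor turns the presentation predictor into a thin logistic regression over the affinity and cleavage predictors' outputs. A hypothetical usage sketch of the API introduced in this diff; it assumes trained models have been fetched (e.g. via mhcflurry-downloads), and the alleles, peptides, and flanks are invented:

```python
from mhcflurry import Class1PresentationPredictor

predictor = Class1PresentationPredictor.load()

# List-of-alleles form: treated as one genotype (up to MAX_ALLELES_PER_SAMPLE
# = 6 alleles); the tightest-binding allele is reported per peptide.
df = predictor.predict_to_dataframe(
    peptides=["SIINFEKL", "NLVPMVATV"],
    alleles=["HLA-A*02:01", "HLA-B*07:02"],
    n_flanks=["AAAG", "TMSL"],  # omit n_flanks/c_flanks to fall back to the
    c_flanks=["GLLM", "QVDA"])  # "without_flanks" cleavage model
print(df[["peptide", "tightest_affinity_allele", "score"]])
```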
diff --git a/mhcflurry/multiple_allele_encoding.py b/mhcflurry/multiple_allele_encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b51529b3a7c0dac78265c092fba31cc7046cb6d
--- /dev/null
+++ b/mhcflurry/multiple_allele_encoding.py
@@ -0,0 +1,95 @@
+import numpy
+
+from copy import copy
+
+from .allele_encoding import AlleleEncoding
+
+
+class MultipleAlleleEncoding(object):
+    def __init__(
+            self,
+            experiment_names=None,
+            experiment_to_allele_list=None,
+            max_alleles_per_experiment=6,
+            allele_to_sequence=None,
+            borrow_from=None):
+        # Guard against the mutable-default-argument pitfall.
+        if experiment_names is None:
+            experiment_names = []
+        if experiment_to_allele_list is None:
+            experiment_to_allele_list = {}
+
+        padded_experiment_to_allele_list = {}
+        for (name, alleles) in experiment_to_allele_list.items():
+            assert len(alleles) > 0
+            assert len(alleles) <= max_alleles_per_experiment
+            alleles_with_mask = alleles + [None] * (
+                max_alleles_per_experiment - len(alleles))
+            padded_experiment_to_allele_list[name] = alleles_with_mask
+
+        flattened_allele_list = []
+        for name in experiment_names:
+            flattened_allele_list.extend(padded_experiment_to_allele_list[name])
+
+        self.allele_encoding = AlleleEncoding(
+            alleles=flattened_allele_list,
+            allele_to_sequence=allele_to_sequence,
+            borrow_from=borrow_from
+        )
+        self.max_alleles_per_experiment = max_alleles_per_experiment
+        self.experiment_names = numpy.array(experiment_names)
+
+    def append_alleles(self, alleles):
+        extended_alleles = list(self.allele_encoding.alleles)
+        for allele in alleles:
+            extended_alleles.append(allele)
+            extended_alleles.extend(
+                [None] * (self.max_alleles_per_experiment - 1))
+
+        assert len(extended_alleles) % self.max_alleles_per_experiment == 0, (
+            len(extended_alleles))
+
+        self.allele_encoding = AlleleEncoding(
+            alleles=extended_alleles,
+            borrow_from=self.allele_encoding)
+
+        self.experiment_names = numpy.concatenate([
+            self.experiment_names,
+            numpy.tile(None, len(alleles))
+        ])
+
+    @property
+    def indices(self):
+        return self.allele_encoding.indices.values.reshape(
+            (-1, self.max_alleles_per_experiment))
+
+    @property
+    def alleles(self):
+        return numpy.reshape(
+            self.allele_encoding.alleles.values,
+            (-1, self.max_alleles_per_experiment))
+
+    def compact(self):
+        result = copy(self)
+        result.allele_encoding = self.allele_encoding.compact()
+        return result
+
+    def allele_representations(self, encoding_name):
+        return self.allele_encoding.allele_representations(encoding_name)
+
+    @property
+    def allele_to_sequence(self):
+        return self.allele_encoding.allele_to_sequence
+
+    def fixed_length_vector_encoded_sequences(self, encoding_name):
+        raise NotImplementedError()
+
+    def shuffle_in_place(self, shuffle_permutation=None):
+        alleles_matrix = self.alleles
+        if shuffle_permutation is None:
+            shuffle_permutation = numpy.random.permutation(len(alleles_matrix))
+        self.allele_encoding = AlleleEncoding(
+            alleles=alleles_matrix[shuffle_permutation].flatten(),
+            borrow_from=self.allele_encoding
+        )
+        self.experiment_names = self.experiment_names[shuffle_permutation]
\ No newline at end of file
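The None-padding in __init__ is the central trick of this new module: each experiment's allele list is padded out to max_alleles_per_experiment, so the per-experiment lists flatten into one rectangular matrix that the indices and alleles properties reshape per experiment. A short construction sketch, assuming the models_class1_pan download is present (the sample names and allele lists are illustrative):

from mhcflurry import Class1AffinityPredictor
from mhcflurry.downloads import get_path
from mhcflurry.multiple_allele_encoding import MultipleAlleleEncoding

# The allele-to-sequence mapping comes from a pan-allele affinity predictor.
affinity_predictor = Class1AffinityPredictor.load(
    get_path("models_class1_pan", "models.combined"), max_models=1)

# Three peptides: two from sample1, one from sample2. Each experiment's
# allele list is padded with None to width 6.
encoding = MultipleAlleleEncoding(
    experiment_names=["sample1", "sample1", "sample2"],
    experiment_to_allele_list={
        "sample1": ["HLA-A*02:01", "HLA-B*27:01"],
        "sample2": ["HLA-A*03:01"],
    },
    max_alleles_per_experiment=6,
    allele_to_sequence=affinity_predictor.allele_to_sequence).compact()

print(encoding.alleles.shape)   # (3, 6): one None-padded row per peptide
print(encoding.indices.shape)   # (3, 6): integer indices into the encoding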
diff --git a/test/data/multiallelic.benchmark.small.csv.bz2 b/test/data/multiallelic.benchmark.small.csv.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..bb5033ed65d0b2c3f652e66ca7787c4b404a16cf
Binary files /dev/null and b/test/data/multiallelic.benchmark.small.csv.bz2 differ
diff --git a/test/test_class1_presentation_neural_network.py b/test/test_class1_presentation_neural_network.py
deleted file mode 100644
index 1e05949ca1093904bb6518d57b233398b54db305..0000000000000000000000000000000000000000
--- a/test/test_class1_presentation_neural_network.py
+++ /dev/null
@@ -1,410 +0,0 @@
-import logging
-logging.getLogger('tensorflow').disabled = True
-logging.getLogger('matplotlib').disabled = True
-
-import pandas
-import argparse
-import sys
-import copy
-import os
-import tempfile
-import pickle
-
-from numpy.testing import assert_, assert_equal, assert_allclose, assert_array_equal
-from nose.tools import assert_greater, assert_less
-import numpy
-from random import shuffle
-
-from sklearn.metrics import roc_auc_score
-
-from mhcflurry import Class1AffinityPredictor
-from mhcflurry.class1_cleavage_predictor import Class1CleavagePredictor
-from mhcflurry.allele_encoding import MultipleAlleleEncoding
-from mhcflurry.class1_presentation_neural_network import Class1PresentationNeuralNetwork
-from mhcflurry.class1_presentation_predictor import Class1PresentationPredictor
-from mhcflurry.encodable_sequences import EncodableSequences
-from mhcflurry.downloads import get_path
-from mhcflurry.regression_target import from_ic50
-from mhcflurry.common import random_peptides, positional_frequency_matrix
-from mhcflurry.testing_utils import cleanup, startup
-from mhcflurry.amino_acid import COMMON_AMINO_ACIDS
-from mhcflurry.custom_loss import MultiallelicMassSpecLoss
-from mhcflurry.regression_target import to_ic50
-
-
-# disable
-#sys.exit(0)
-
-###################################################
-# SETUP
-###################################################
-
-COMMON_AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS)
-
-AFFINITY_PREDICTOR = None
-CLEAVAGE_PREDICTOR = None
-
-
-def setup():
-    global AFFINITY_PREDICTOR
-    global CLEAVAGE_PREDICTOR
-    startup()
-    AFFINITY_PREDICTOR = Class1AffinityPredictor.load(
-        get_path("models_class1_pan_variants", "models.affinity_only"),
-        optimization_level=0,
-        max_models=1)
-    CLEAVAGE_PREDICTOR = Class1CleavagePredictor.load(max_models=1)
-
-
-
-def teardown():
-    global AFFINITY_PREDICTOR
-    global CLEAVAGE_PREDICTOR
-    AFFINITY_PREDICTOR = None
-    CLEAVAGE_PREDICTOR = None
-    cleanup()
-
-
-def data_path(name):
-    '''
-    Return the absolute path to a file in the test/data directory.
-    The name specified should be relative to test/data.
-    '''
-    return os.path.join(os.path.dirname(__file__), "data", name)
-
-#disable
-#sys.exit(0)
-
-
-###################################################
-# UTILITY FUNCTIONS
-###################################################
-
-def scramble_peptide(peptide):
-    lst = list(peptide)
-    shuffle(lst)
-    return "".join(lst)
-
-
-def make_motif(presentation_predictor, allele, peptides, frac=0.01, master_allele_encoding=None):
-    if master_allele_encoding is not None:
-        alleles = MultipleAlleleEncoding(borrow_from=master_allele_encoding)
-        alleles.append_alleles([allele] * len(peptides))
-    else:
-        alleles = [allele]
-
-    peptides = EncodableSequences.create(peptides)
-    predictions = presentation_predictor.predict(
-        peptides=peptides,
-        alleles=alleles,
-    )
-    random_predictions_df = pandas.DataFrame({"peptide": peptides.sequences})
-    random_predictions_df["prediction"] = predictions
-    random_predictions_df = random_predictions_df.sort_values(
-        "prediction", ascending=False)
-    top = random_predictions_df.iloc[:int(len(random_predictions_df) * frac)]
-    matrix = positional_frequency_matrix(top.peptide.values)
-    return matrix
-
-
-###################################################
-# TESTS
-###################################################
-
-def Xtest_build():
-    global AFFINITY_PREDICTOR
-    global CLEAVAGE_PREDICTOR
-
-    for include_cleavage in [False, True, False, True]:
-        print("Include cleavage: %s" % include_cleavage)
-        model = Class1PresentationNeuralNetwork(
-            include_cleavage=include_cleavage)
-        model.build(
-            affinity_model=AFFINITY_PREDICTOR.class1_pan_allele_models[0],
-            cleavage_model=CLEAVAGE_PREDICTOR.models[0])
-        network = model.network
-        print(network.summary())
-
-
-def test_synthetic_allele_refinement():
-    """
-    Test that in a synthetic example the model is able to learn that
-    HLA-C*01:02 prefers P at position 3.
- """ - refine_allele = "HLA-C*01:02" - alleles = ["HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01", "HLA-A*03:01", - "HLA-B*15:01", refine_allele] - peptides_per_allele = [2000, 1000, 500, 1500, 1200, 800, ] - - allele_to_peptides = dict(zip(alleles, peptides_per_allele)) - - length = 9 - - train_with_ms = pandas.read_csv(get_path("data_curated", - "curated_training_data.csv.bz2")) - train_no_ms = pandas.read_csv( - get_path("data_curated", "curated_training_data.affinity.csv.bz2")) - - def filter_df(df): - return df.loc[ - (df.allele.isin(alleles)) & (df.peptide.str.len() == length)] - - train_with_ms = filter_df(train_with_ms) - train_no_ms = filter_df(train_no_ms) - - ms_specific = train_with_ms.loc[ - ~train_with_ms.peptide.isin(train_no_ms.peptide)] - - train_peptides = [] - train_true_alleles = [] - for allele in alleles: - peptides = ms_specific.loc[ms_specific.allele == allele].peptide.sample( - n=allele_to_peptides[allele]) - train_peptides.extend(peptides) - train_true_alleles.extend([allele] * len(peptides)) - - hits_df = pandas.DataFrame({"peptide": train_peptides}) - hits_df["true_allele"] = train_true_alleles - hits_df["hit"] = 1.0 - - decoys_df = hits_df.copy() - decoys_df["peptide"] = decoys_df.peptide.map(scramble_peptide) - decoys_df["true_allele"] = "" - decoys_df["hit"] = 0.0 - - train_df = pandas.concat( - [hits_df, decoys_df], ignore_index=True).sample(frac=1.0) - - (affinity_model,) = AFFINITY_PREDICTOR.class1_pan_allele_models - presentation_model = Class1PresentationNeuralNetwork( - include_cleavage=False, - trainable_affinity_predictor=False, - auxiliary_input_features=["gene"], - minibatch_size=1024, - max_epochs=10, - learning_rate=0.001, - patience=5, - min_delta=0.0) - presentation_model.build(affinity_model) - print(presentation_model.network.summary()) - - allele_encoding = MultipleAlleleEncoding( - experiment_names=["experiment1"] * len(train_df), - experiment_to_allele_list={ - "experiment1": alleles, - }, max_alleles_per_experiment=6, - allele_to_sequence=AFFINITY_PREDICTOR.allele_to_sequence, ) - - allele_encoding = allele_encoding.compact() - - presentation_predictor = Class1PresentationPredictor( - models=[presentation_model], - allele_to_sequence=allele_encoding.allele_encoding.allele_to_sequence) - - pre_predictions = presentation_model.predict( - peptides=train_df.peptide.values, - allele_encoding=allele_encoding) - - #expected_pre_predictions = from_ic50(affinity_model.predict( - # peptides=numpy.repeat(train_df.peptide.values, len(alleles)), - # allele_encoding=mms_allele_encoding.allele_encoding, )).reshape( - # (-1, len(alleles))) - #assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4) - - random_peptides_encodable = EncodableSequences.create( - random_peptides(20000, 9)) - - original_motif = make_motif( - presentation_predictor=presentation_predictor, - peptides=random_peptides_encodable, - allele=refine_allele) - print("Original motif proline-3 rate: ", original_motif.loc[3, "P"]) - assert_less(original_motif.loc[3, "P"], 0.1) - - iteration_box = [0] - - def progress(label = None): - if label is None: - label = str(iteration_box[0]) - iteration_box[0] += 1 - print("*** iteration ", label, "***") - predictions_df = presentation_predictor.predict_to_dataframe( - peptides=train_df.peptide.values, - alleles=allele_encoding) - merged_df = pandas.merge(train_df, predictions_df, on="peptide") - merged_hit_df = merged_df.loc[merged_df.hit == 1.0] - correct_allele_fraction = ( - merged_hit_df.allele == merged_hit_df.true_allele).mean() - 
print("Correct allele fraction", correct_allele_fraction) - print( - "Mean score for hit", - merged_df.loc[merged_df.hit == 1.0].score.mean()) - print( - "Mean score for decoy", - merged_df.loc[merged_df.hit == 0.0].score.mean()) - print("Scores for hit", - merged_df.loc[merged_df.hit == 1.0].score.values) - print("Scores for decoy", - merged_df.loc[merged_df.hit == 0.0].score.values) - print("Weights", presentation_model.network.get_layer("per_allele_output").get_weights()) - auc = roc_auc_score(merged_df.hit.values, merged_df.score.values) - print("AUC", auc) - return (auc, correct_allele_fraction) - - (pre_auc, pre_correct_allele_fraction) = progress(label="Pre fitting") - """ - presentation_model.fit( - peptides=train_df.peptide.values, - targets=train_df.hit.values, - allele_encoding=allele_encoding, - progress_callback=progress) - - progress(label="Done fitting first round") - """ - - presentation_model.set_trainable(trainable_affinity_predictor=True) - presentation_model.hyperparameters['learning_rate'] = 1e-4 - presentation_model.fit( - peptides=train_df.peptide.values, - targets=train_df.hit.values, - allele_encoding=allele_encoding, - progress_callback=progress) - - (post_auc, post_correct_allele_fraction) = progress(label="Done fitting second round") - - final_motif = make_motif( - presentation_predictor=presentation_predictor, - peptides=random_peptides_encodable, - allele=refine_allele) - print("Final motif proline-3 rate: ", final_motif.loc[3, "P"]) - - assert_greater(post_auc, pre_auc) - assert_greater( - post_correct_allele_fraction, pre_correct_allele_fraction - 0.05) - assert_greater(final_motif.loc[3, "P"], original_motif.loc[3, "P"]) - - - -def Xtest_real_data_multiallelic_refinement(max_epochs=10): - """ - Test on real data that we can learn that HLA-A*02:20 has a preference K at - position 1. 
- """ - (affinity_model,) = AFFINITY_PREDICTOR.class1_pan_allele_models - presentation_model = Class1PresentationNeuralNetwork( - auxiliary_input_features=["gene"], - batch_generator_batch_size=1024, - max_epochs=max_epochs, - learning_rate=0.001, - patience=5, - min_delta=0.0, - random_negative_rate=0, - random_negative_constant=0) - presentation_model.load_from_class1_neural_network(affinity_model) - - presentation_predictor = Class1PresentationPredictor( - models=[presentation_model], - allele_to_sequence=AFFINITY_PREDICTOR.allele_to_sequence) - - ms_df = pandas.read_csv( - get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2")) - ms_df = ms_df.loc[ - (ms_df.mhc_class == "I") & (~ms_df.protein_ensembl.isnull())].copy() - - sample_table = ms_df.drop_duplicates( - "sample_id").set_index("sample_id").loc[ms_df.sample_id.unique()] - grouped = ms_df.groupby("sample_id").nunique() - for col in sample_table.columns: - if (grouped[col] > 1).any(): - del sample_table[col] - sample_table["alleles"] = sample_table.hla.str.split() - - multi_train_hit_df = ms_df.loc[ - ms_df.sample_id == "RA957" - ].drop_duplicates("peptide")[["peptide", "sample_id"]].reset_index(drop=True) - multi_train_hit_df["label"] = 1.0 - - multi_train_decoy_df = ms_df.loc[ - (ms_df.sample_id == "CD165") & - (~ms_df.peptide.isin(multi_train_hit_df.peptide.unique())) - ].drop_duplicates("peptide")[["peptide"]] - (multi_train_decoy_df["sample_id"],) = multi_train_hit_df.sample_id.unique() - multi_train_decoy_df["label"] = 0.0 - - multi_train_df = pandas.concat( - [multi_train_hit_df, multi_train_decoy_df], ignore_index=True) - multi_train_df["is_affinity"] = False - - multi_train_alleles = set() - for alleles in sample_table.loc[multi_train_df.sample_id.unique()].alleles: - multi_train_alleles.update(alleles) - multi_train_alleles = sorted(multi_train_alleles) - - pan_train_df = pandas.read_csv( - get_path( - "models_class1_pan", "models.combined/train_data.csv.bz2")) - - pan_sub_train_df = pan_train_df.loc[ - #pan_train_df.allele.isin(multi_train_alleles), - :, - ["peptide", "allele", "measurement_inequality", "measurement_value"] - ] - pan_sub_train_df["label"] = pan_sub_train_df["measurement_value"] - del pan_sub_train_df["measurement_value"] - pan_sub_train_df["is_affinity"] = True - - allele_encoding = MultipleAlleleEncoding( - experiment_names=multi_train_df.sample_id.values, - experiment_to_allele_list=sample_table.alleles.to_dict(), - max_alleles_per_experiment=sample_table.alleles.str.len().max(), - allele_to_sequence=AFFINITY_PREDICTOR.allele_to_sequence, - ) - allele_encoding.append_alleles(pan_sub_train_df.allele.values) - allele_encoding = allele_encoding.compact() - - combined_train_df = pandas.concat([multi_train_df, pan_sub_train_df]) - - refine_allele = "HLA-A*02:20" - random_peptides_encodable = EncodableSequences.create( - random_peptides(10000, 9)) - - original_motif = make_motif( - presentation_predictor=presentation_predictor, - peptides=random_peptides_encodable, - allele=refine_allele) - print( - refine_allele, - "original motif lysine-1 rate: ", - original_motif.loc[1, "K"]) - - def progress(): - motif = make_motif( - presentation_predictor=presentation_predictor, - peptides=random_peptides_encodable, - allele=refine_allele, - master_allele_encoding=allele_encoding.allele_encoding) - print( - refine_allele, - "current motif lysine-1 rate: ", - motif.loc[1, "K"]) - - presentation_model.fit( - peptides=combined_train_df.peptide.values, - targets=combined_train_df.label.values, - 
allele_encoding=allele_encoding, - affinities_mask=combined_train_df.is_affinity.values, - inequalities=combined_train_df.measurement_inequality.values, - progress_callback=progress) - - final_motif = make_motif( - presentation_predictor=presentation_predictor, - peptides=random_peptides_encodable, - allele=refine_allele) - print(refine_allele, "final motif lysine-1 rate: ", final_motif.loc[1, "K"]) - - assert_greater(final_motif.loc[1, "K"], original_motif.loc[1, "K"]) - -if __name__ == "__main__": - setup() - test_real_data_multiallelic_refinement() - teardown() \ No newline at end of file diff --git a/test/test_class1_presentation_predictor.py b/test/test_class1_presentation_predictor.py index 55d160141ec30027263f3ea4629efe513ef7d214..b1046b7222708232f5c06ab7958e04b4a60004c0 100644 --- a/test/test_class1_presentation_predictor.py +++ b/test/test_class1_presentation_predictor.py @@ -10,160 +10,117 @@ from numpy.testing import assert_, assert_equal, assert_allclose, assert_array_e from nose.tools import assert_greater, assert_less import numpy -from mhcflurry import Class1AffinityPredictor -from mhcflurry.allele_encoding import MultipleAlleleEncoding -from mhcflurry.class1_presentation_neural_network import Class1PresentationNeuralNetwork +from sklearn.metrics import roc_auc_score + +from mhcflurry import Class1AffinityPredictor, Class1CleavagePredictor from mhcflurry.class1_presentation_predictor import Class1PresentationPredictor from mhcflurry.downloads import get_path from mhcflurry.common import random_peptides from mhcflurry.testing_utils import cleanup, startup from mhcflurry.regression_target import to_ic50 +from . import data_path + AFFINITY_PREDICTOR = None +CLEAVAGE_PREDICTOR = None +CLEAVAGE_PREDICTOR_NO_FLANKING = None + def setup(): global AFFINITY_PREDICTOR + global CLEAVAGE_PREDICTOR + global CLEAVAGE_PREDICTOR_NO_FLANKING startup() AFFINITY_PREDICTOR = Class1AffinityPredictor.load( get_path("models_class1_pan", "models.combined"), optimization_level=0, max_models=1) + CLEAVAGE_PREDICTOR = Class1CleavagePredictor.load( + get_path("models_class1_cleavage", "models"), max_models=1) + CLEAVAGE_PREDICTOR_NO_FLANKING = Class1CleavagePredictor.load( + get_path("models_class1_cleavage_variants", "models.selected.no_flank"), + max_models=1) def teardown(): global AFFINITY_PREDICTOR + global CLEAVAGE_PREDICTOR + global CLEAVAGE_PREDICTOR_NO_FLANKING AFFINITY_PREDICTOR = None + CLEAVAGE_PREDICTOR = None + CLEAVAGE_PREDICTOR_NO_FLANKING = None cleanup() -def Xtest_basic(): - affinity_predictor = AFFINITY_PREDICTOR - models = [] - for affinity_network in affinity_predictor.class1_pan_allele_models: - presentation_network = Class1PresentationNeuralNetwork( - optimizer="adam", - random_negative_rate=0.0, - random_negative_constant=0, - max_epochs=25, - learning_rate=0.001, - batch_generator_batch_size=256) - presentation_network.load_from_class1_neural_network(affinity_network) - models.append(presentation_network) +def test_basic(): + df = pandas.read_csv(data_path("multiallelic.benchmark.small.csv.bz2")) + train_df = df.loc[ + df.sample_id.isin(sorted(df.sample_id.unique())[:3]) + ] + test_df = df.loc[ + ~df.sample_id.isin(train_df.sample_id.unique()) + ] + test_df = test_df.sample(frac=0.01, weights=test_df.hit + 0.01) + + experiment_to_alleles = ( + df.drop_duplicates("sample_id").set_index("sample_id").hla.str.split().to_dict()) predictor = Class1PresentationPredictor( - models=models, - allele_to_sequence=affinity_predictor.allele_to_sequence) - - alleles = ["HLA-A*02:01", 
"HLA-B*27:01", "HLA-C*07:02"] - - df = pandas.DataFrame(index=numpy.unique(random_peptides(1000, length=9))) - for allele in alleles: - df[allele] = affinity_predictor.predict( - df.index.values, allele=allele) - df["tightest_affinity"] = df.min(1) - df["tightest_allele"] = df.idxmin(1) - - # Test untrained predictor gives predictions that match the affinity - # predictor - df_predictor = predictor.predict_to_dataframe( - peptides=df.index.values, - alleles=alleles) - merged_df = pandas.merge( - df, df_predictor.set_index("peptide"), left_index=True, right_index=True) - - print(merged_df) - - assert_allclose( - merged_df["tightest_affinity"], merged_df["affinity"], rtol=1e-5) - assert_allclose( - merged_df["tightest_affinity"], to_ic50(merged_df["score"]), rtol=1e-5) - assert_array_equal(merged_df["tightest_allele"], merged_df["allele"]) - - # Test saving and loading + affinity_predictor=AFFINITY_PREDICTOR, + cleavage_predictor_without_flanks=CLEAVAGE_PREDICTOR_NO_FLANKING, + cleavage_predictor_with_flanks=CLEAVAGE_PREDICTOR) + + predictor.fit( + targets=train_df.hit.values, + peptides=train_df.peptide.values, + experiment_names=train_df.sample_id.values, + alleles=experiment_to_alleles, + n_flanks=train_df.n_flank.values, + c_flanks=train_df.c_flank.values, + verbose=2) + + def add_prediction_cols(test_df, predictor): + test_df["prediction1"] = predictor.predict( + peptides=test_df.peptide.values, + experiment_names=test_df.sample_id.values, + alleles=experiment_to_alleles, + n_flanks=test_df.n_flank.values, + c_flanks=test_df.c_flank.values, + verbose=2) + + test_df["prediction2"] = predictor.predict( + peptides=test_df.peptide.values, + experiment_names=test_df.sample_id.values, + alleles=experiment_to_alleles, + verbose=2) + + add_prediction_cols(test_df, predictor) + + score1 = roc_auc_score(test_df.hit.values, test_df.prediction1.values) + score2 = roc_auc_score(test_df.hit.values, test_df.prediction2.values) + + print("AUC", score1, score2) + + assert_greater(score1, 0.8) + assert_greater(score2, 0.8) + + # Test saving, loading, pickling models_dir = tempfile.mkdtemp("_models") print(models_dir) predictor.save(models_dir) predictor2 = Class1PresentationPredictor.load(models_dir) - - df_predictor2 = predictor2.predict_to_dataframe( - peptides=df.index.values, - alleles=alleles) - assert_array_equal(df_predictor.values, df_predictor2.values) - - # Test pickling predictor3 = pickle.loads( pickle.dumps(predictor, protocol=pickle.HIGHEST_PROTOCOL)) predictor4 = pickle.loads( pickle.dumps(predictor2, protocol=pickle.HIGHEST_PROTOCOL)) - df_predictor3 = predictor3.predict_to_dataframe( - peptides=df.index.values, - alleles=alleles) - df_predictor4 = predictor4.predict_to_dataframe( - peptides=df.index.values, - alleles=alleles) - assert_array_equal(df_predictor.values, df_predictor3.values) - assert_array_equal(df_predictor.values, df_predictor4.values) - - # Test that fitting a model changes the predictor but not the original model - train_df = pandas.DataFrame({ - "peptide": numpy.concatenate([ - random_peptides(256, length=length) - for length in [9] - ]), - }) - train_df["allele"] = "HLA-A*02:20" - train_df["experiment"] = "experiment1" - train_df["label"] = train_df.peptide.str.match("^[KSEQ]").astype("float32") - train_df["pre_train_affinity_prediction"] = affinity_predictor.predict( - train_df.peptide.values, alleles=train_df.allele.values) - allele_encoding = MultipleAlleleEncoding( - experiment_names=train_df.experiment.values, - experiment_to_allele_list={ - "experiment1": 
["HLA-A*02:20", "HLA-A*02:01"], - }, - allele_to_sequence=predictor.allele_to_sequence) - model = predictor4.models[0] - new_predictor = Class1PresentationPredictor( - models=[model], - allele_to_sequence=predictor4.allele_to_sequence) - train_df["original_score"] = new_predictor.predict( - train_df.peptide.values, alleles=["HLA-A*02:20"]) - model.fit( - peptides=train_df.peptide.values, - targets=train_df.label.values, - allele_encoding=allele_encoding) - train_df["updated_score"] = new_predictor.predict( - train_df.peptide.values, - alleles=["HLA-A*02:20"]) - train_df["updated_affinity"] = new_predictor.predict_to_dataframe( - train_df.peptide.values, - alleles=["HLA-A*02:20"]).affinity.values - train_df["score_diff"] = train_df.updated_score - train_df.original_score - mean_change = train_df.groupby("label").score_diff.mean() - print("Mean change:") - print(mean_change) - assert_greater(mean_change[1.0], mean_change[0.0]) - print(train_df) - train_df["post_train_affinity_prediction"] = affinity_predictor.predict( - train_df.peptide.values, - alleles=train_df.allele.values) - assert_array_equal( - train_df.pre_train_affinity_prediction.values, - train_df.post_train_affinity_prediction.values) - - (affinity_model,) = affinity_predictor.class1_pan_allele_models - model.copy_weights_to_affinity_model(affinity_model) - train_df["post_copy_weights_prediction"] = affinity_predictor.predict( - train_df.peptide.values, alleles=train_df.allele.values) - assert_allclose( - train_df.updated_affinity.values, - train_df.post_copy_weights_prediction.values, - rtol=1e-5) - train_df["affinity_diff"] = ( - train_df.post_copy_weights_prediction - - train_df.post_train_affinity_prediction) - median_affinity_change = train_df.groupby("label").affinity_diff.median() - print("Median affinity change:") - print(median_affinity_change) - assert_less(median_affinity_change[1.0], median_affinity_change[0.0]) + for (i, other_predictor) in enumerate([predictor2, predictor3, predictor4]): + print("Testing identity", i + 1) + other_test_df = test_df.copy() + del other_test_df["prediction1"] + del other_test_df["prediction2"] + add_prediction_cols(other_test_df, other_predictor) + numpy.testing.assert_array_almost_equal( + test_df["prediction1"], other_test_df["prediction1"], decimal=6) + numpy.testing.assert_array_almost_equal( + test_df["prediction2"], other_test_df["prediction2"], decimal=6)