diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py index e5e59cf1acb5f84777fbd756b84448ad0ba62665..eba9de5f1c04b05b0a39c89dc30fc69612b4ad53 100644 --- a/mhcflurry/amino_acid.py +++ b/mhcflurry/amino_acid.py @@ -20,6 +20,11 @@ from __future__ import ( import collections from copy import copy +import pandas +import numpy +from six import StringIO + + COMMON_AMINO_ACIDS = collections.OrderedDict(sorted({ "A": "Alanine", "R": "Arginine", @@ -47,3 +52,105 @@ COMMON_AMINO_ACIDS_WITH_UNKNOWN["X"] = "Unknown" AMINO_ACID_INDEX = dict( (letter, i) for (i, letter) in enumerate(COMMON_AMINO_ACIDS_WITH_UNKNOWN)) + +AMINO_ACIDS = list(COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys()) + +BLOSUM62_MATRIX = pandas.read_table(StringIO(""" + A R N D C Q E G H I L K M F P S T W Y V X +A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0 +R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0 +N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0 +D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0 +C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0 +Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 +E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0 +G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0 +H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 +I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0 +L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0 +K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 +M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0 +F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0 +P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0 +S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 +T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0 +W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0 +Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0 +V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0 +X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 +"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS] +assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all() + +ENCODING_DFS = { + "BLOSUM62": BLOSUM62_MATRIX, + "one-hot": pandas.DataFrame([ + [1 if i == j else 0 for i in range(len(AMINO_ACIDS))] + for j in range(len(AMINO_ACIDS)) + ], index=AMINO_ACIDS, columns=AMINO_ACIDS) +} + + +def available_vector_encodings(): + """ + Return list of supported amino acid vector encodings. + + Returns + ------- + list of string + + """ + return list(ENCODING_DFS) + + +def vector_encoding_length(name): + """ + Return the length of the given vector encoding. + + Parameters + ---------- + name : string + + Returns + ------- + int + """ + return ENCODING_DFS[name].shape[1] + + +def index_encoding(sequences, letter_to_index_dict): + """ + Given a sequence of n strings all of length k, return a k * n array where + the (i, j)th element is letter_to_index_dict[sequence[i][j]]. + + Parameters + ---------- + sequences : list of length n of strings of length k + letter_to_index_dict : dict : string -> int + + Returns + ------- + numpy.array of integers with shape (k, n) + """ + df = pandas.DataFrame(iter(s) for s in sequences) + result = df.replace(letter_to_index_dict) + return result.values + + +def fixed_vectors_encoding(sequences, letter_to_vector_function): + """ + Given a sequence of n strings all of length k, return a n * k * m array where + the (i, j)th element is letter_to_vector_function(sequence[i][j]). + + Parameters + ---------- + sequences : list of length n of strings of length k + letter_to_vector_function : function : string -> vector of length m + + Returns + ------- + numpy.array of integers with shape (n, k, m) + """ + arr = numpy.array([list(s) for s in sequences]) + result = numpy.vectorize( + letter_to_vector_function, signature='()->(n)')(arr) + return result \ No newline at end of file diff --git a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py index bfba10cb52c896dd1bd139d08b5fe67e3918510c..21a92e72e1817667d5a5981946befa4240cb0c0f 100644 --- a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py @@ -5,14 +5,19 @@ import json from os.path import join, exists from six import string_types import logging +import warnings import numpy import pandas +from numpy.testing import assert_equal import mhcnames from ..encodable_sequences import EncodableSequences from ..downloads import get_path +from ..common import random_peptides +from ..percent_rank_transform import PercentRankTransform +from ..regression_target import to_ic50 from .class1_neural_network import Class1NeuralNetwork @@ -32,7 +37,8 @@ class Class1AffinityPredictor(object): allele_to_allele_specific_models=None, class1_pan_allele_models=None, allele_to_pseudosequence=None, - manifest_df=None): + manifest_df=None, + allele_to_percent_rank_transform=None): """ Parameters ---------- @@ -50,6 +56,9 @@ class Class1AffinityPredictor(object): Only required if you want to update an existing serialization of a Class1AffinityPredictor. Otherwise this dataframe will be generated automatically based on the supplied models. + + allele_to_percent_rank_transform : dict of string -> PercentRankTransform, optional + PercentRankTransform instances to use for each allele """ if allele_to_allele_specific_models is None: @@ -86,6 +95,11 @@ class Class1AffinityPredictor(object): columns=["model_name", "allele", "config_json", "model"]) self.manifest_df = manifest_df + if allele_to_percent_rank_transform is None: + allele_to_percent_rank_transform = {} + self.allele_to_percent_rank_transform = allele_to_percent_rank_transform + + @property def supported_alleles(self): """ @@ -165,6 +179,21 @@ class Class1AffinityPredictor(object): write_manifest_df.to_csv(manifest_path, index=False) logging.info("Wrote: %s" % manifest_path) + if self.allele_to_percent_rank_transform: + percent_ranks_df = None + for (allele, transform) in self.allele_to_percent_rank_transform.items(): + series = transform.to_series() + if percent_ranks_df is None: + percent_ranks_df = pandas.DataFrame(index=series.index) + assert_equal(series.index.values, percent_ranks_df.index.values) + percent_ranks_df[allele] = series + percent_ranks_path = join(models_dir, "percent_ranks.csv") + percent_ranks_df.to_csv( + percent_ranks_path, + index=True, + index_label="bin") + logging.info("Wrote: %s" % percent_ranks_path) + @staticmethod def load(models_dir=None, max_models=None): """ @@ -211,11 +240,20 @@ class Class1AffinityPredictor(object): join(models_dir, "pseudosequences.csv"), index_col="allele").to_dict() + allele_to_percent_rank_transform = {} + percent_ranks_path = join(models_dir, "percent_ranks.csv") + if exists(percent_ranks_path): + percent_ranks_df = pandas.read_csv(percent_ranks_path, index_col=0) + for allele in percent_ranks_df.columns: + allele_to_percent_rank_transform[allele] = ( + PercentRankTransform.from_series(percent_ranks_df[allele])) + logging.info( - "Loaded %d class1 pan allele predictors, %d pseudosequences, and " - "%d allele specific models: %s" % ( + "Loaded %d class1 pan allele predictors, %d pseudosequences, " + "%d percent rank distributions, and %d allele specific models: %s" % ( len(class1_pan_allele_models), len(pseudosequences) if pseudosequences else 0, + len(allele_to_percent_rank_transform), sum(len(v) for v in allele_to_allele_specific_models.values()), ", ".join( "%s (%d)" % (allele, len(v)) @@ -226,7 +264,9 @@ class Class1AffinityPredictor(object): allele_to_allele_specific_models=allele_to_allele_specific_models, class1_pan_allele_models=class1_pan_allele_models, allele_to_pseudosequence=pseudosequences, - manifest_df=manifest_df) + manifest_df=manifest_df, + allele_to_percent_rank_transform=allele_to_percent_rank_transform, + ) return result @staticmethod @@ -441,6 +481,88 @@ class Class1AffinityPredictor(object): verbose=verbose) yield model + def calibrate_percentile_ranks( + self, + peptides=None, + num_peptides_per_length=int(1e6), + alleles=None, + bins=None): + """ + Compute the cumulative distribution of ic50 values for a set of alleles + over a large universe of random peptides, to enable computing quantiles in + this distribution later. + + Parameters + ---------- + peptides : sequence of string, optional + Peptides to use + num_peptides_per_length : int, optional + If peptides argument is not specified, then num_peptides_per_length + peptides are randomly sampled from a uniform distribution for each + supported length + alleles : sequence of string, optional + Alleles to perform calibration for. If not specified all supported + alleles will be calibrated. + """ + if bins is None: + bins = to_ic50(numpy.linspace(1, 0, 1000)) + + if alleles is None: + alleles = self.supported_alleles + + if peptides is None: + peptides = [] + lengths = range( + self.supported_peptide_lengths[0], + self.supported_peptide_lengths[1] + 1) + for length in lengths: + peptides.extend( + random_peptides(num_peptides_per_length, length)) + + for allele in alleles: + predictions = self.predict(peptides, allele=allele) + transform = PercentRankTransform() + transform.fit(predictions, bins=bins) + self.allele_to_percent_rank_transform[allele] = transform + + def percentile_ranks(self, affinities, allele=None, alleles=None): + """ + Return percentile ranks for the given ic50 affinities and alleles. + + The 'allele' and 'alleles' argument are as in the predict() method. + Specify one of these. + + Parameters + ---------- + affinities : sequence of float + nM affinities + allele : string + alleles : sequence of string + + Returns + ------- + numpy.array of float + """ + if allele is not None: + try: + transform = self.allele_to_percent_rank_transform[allele] + return transform.transform(affinities) + except KeyError: + raise ValueError( + "Allele %s has no percentile rank information" % allele) + + if alleles is None: + raise ValueError("Specify allele or alleles") + + df = pandas.DataFrame({"affinity": affinities}) + df["allele"] = alleles + df["result"] = numpy.nan + for (allele, sub_df) in df.groupby("allele"): + df.loc[sub_df.index, "result"] = self.percentile_ranks\ + (sub_df.affinity, allele=allele) + assert not df.result.isnull().any() + return df.result.values + def predict(self, peptides, alleles=None, allele=None, throw=True): """ Predict nM binding affinities. @@ -472,6 +594,7 @@ class Class1AffinityPredictor(object): alleles=alleles, allele=allele, throw=throw, + include_percentile_ranks=False, ) return df.prediction.values @@ -481,7 +604,8 @@ class Class1AffinityPredictor(object): alleles=None, allele=None, throw=True, - include_individual_model_predictions=False): + include_individual_model_predictions=False, + include_percentile_ranks=True): """ Predict nM binding affinities. Gives more detailed output than `predict` method, including 5-95% prediction intervals. @@ -499,13 +623,17 @@ class Class1AffinityPredictor(object): peptides : EncodableSequences or list of string alleles : list of string allele : string - include_individual_model_predictions : boolean - If True, the predictions of each individual model are incldued as - columns in the result dataframe. throw : boolean If True, a ValueError will be raised in the case of unsupported alleles or peptide lengths. If False, a warning will be logged and the predictions for the unsupported alleles or peptides will be NaN. + include_individual_model_predictions : boolean + If True, the predictions of each individual model are included as + columns in the result dataframe. + include_percentile_ranks : boolean, default True + If True, a "prediction_percentile" column will be included giving the + percentile ranks. If no percentile rank information is available, + this will be ignored with a warning. Returns ------- @@ -619,8 +747,15 @@ class Class1AffinityPredictor(object): columns = [ c for c in df.columns if c not in df_predictions.columns ] - return df[columns] + result = df[columns].copy() + if include_percentile_ranks: + if self.allele_to_percent_rank_transform: + result["prediction_percentile"] = self.percentile_ranks( + df.prediction, alleles=df.allele.values) + else: + warnings.warn("No percentile rank information available.") + return result @staticmethod def save_weights(weights_list, filename): diff --git a/mhcflurry/class1_affinity_prediction/class1_neural_network.py b/mhcflurry/class1_affinity_prediction/class1_neural_network.py index 13a7f55c8250255a3fdc43a4c5017505801bfe6a..d685669cf43e2260e7b89d4c061510693e83aea9 100644 --- a/mhcflurry/class1_affinity_prediction/class1_neural_network.py +++ b/mhcflurry/class1_affinity_prediction/class1_neural_network.py @@ -16,10 +16,8 @@ from keras.layers.normalization import BatchNormalization from mhcflurry.hyperparameters import HyperparameterDefaults -from ..encodable_sequences import ( - EncodableSequences, - available_vector_encodings, - vector_encoding_length) +from ..encodable_sequences import EncodableSequences +from ..amino_acid import available_vector_encodings, vector_encoding_length from ..regression_target import to_ic50, from_ic50 from ..common import random_peptides, amino_acid_distribution diff --git a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py b/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py index 1128ca99a3dfaef999d2b131ea3f6865faf53127..aa12add41b3739a381a5dfe6a36ee52e5a3a8509 100644 --- a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py +++ b/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py @@ -6,6 +6,7 @@ import os import sys import argparse import yaml +import time import pandas @@ -49,6 +50,13 @@ parser.add_argument( action="store_true", default=False, help="Use only quantitative training data") +parser.add_argument( + "--percent-rank-calibration-num-peptides-per-length", + type=int, + default=int(1e5), + help="Number of peptides per length to use to calibrate percent ranks. " + "Set to 0 to disable percent rank calibration. The resulting models will " + "not support percent ranks") parser.add_argument( "--verbosity", type=int, @@ -126,6 +134,15 @@ def run(argv=sys.argv[1:]): affinities=train_data.measurement_value.values, models_dir_for_save=args.out_models_dir) + if args.percent_rank_calibration_num_peptides_per_length > 0: + start = time.time() + print("Performing percent rank calibration.") + predictor.calibrate_percentile_ranks( + num_peptides_per_length=args.percent_rank_calibration_num_peptides_per_length) + print("Finished calibrating percent ranks in %0.2f sec." % ( + time.time() - start)) + predictor.save(args.out_models_dir, model_names_to_write=[]) + if __name__ == '__main__': run() diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py index 4abc94a1032860594bc091ba9043cc94320ffd30..67553d0fd2ae9983b48e60727d81b3a2950ac97f 100644 --- a/mhcflurry/encodable_sequences.py +++ b/mhcflurry/encodable_sequences.py @@ -20,116 +20,12 @@ from __future__ import ( import math -import pandas import numpy -from six import StringIO import typechecks from . import amino_acid -AMINO_ACIDS = list(amino_acid.COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys()) - -BLOSUM62_MATRIX = pandas.read_table(StringIO(""" - A R N D C Q E G H I L K M F P S T W Y V X -A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0 -R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0 -N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0 -D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0 -C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0 -Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 -E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0 -G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0 -H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 -I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0 -L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0 -K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 -M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0 -F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0 -P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0 -S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 -T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0 -W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0 -Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0 -V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0 -X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 -"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS] -assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all() - -ENCODING_DFS = { - "BLOSUM62": BLOSUM62_MATRIX, - "one-hot": pandas.DataFrame([ - [1 if i == j else 0 for i in range(len(AMINO_ACIDS))] - for j in range(len(AMINO_ACIDS)) - ], index=AMINO_ACIDS, columns=AMINO_ACIDS) -} - - -def available_vector_encodings(): - """ - Return list of supported amino acid vector encodings. - - Returns - ------- - list of string - - """ - return list(ENCODING_DFS) - - -def vector_encoding_length(name): - """ - Return the length of the given vector encoding. - - Parameters - ---------- - name : string - - Returns - ------- - int - """ - return ENCODING_DFS[name].shape[1] - - -def index_encoding(sequences, letter_to_index_dict): - """ - Given a sequence of n strings all of length k, return a k * n array where - the (i, j)th element is letter_to_index_dict[sequence[i][j]]. - - Parameters - ---------- - sequences : list of length n of strings of length k - letter_to_index_dict : dict : string -> int - - Returns - ------- - numpy.array of integers with shape (k, n) - """ - df = pandas.DataFrame(iter(s) for s in sequences) - result = df.replace(letter_to_index_dict) - return result.values - - -def fixed_vectors_encoding(sequences, letter_to_vector_function): - """ - Given a sequence of n strings all of length k, return a n * k * m array where - the (i, j)th element is letter_to_vector_function(sequence[i][j]). - - Parameters - ---------- - sequences : list of length n of strings of length k - letter_to_vector_function : function : string -> vector of length m - - Returns - ------- - numpy.array of integers with shape (n, k, m) - """ - arr = numpy.array([list(s) for s in sequences]) - result = numpy.vectorize( - letter_to_vector_function, signature='()->(n)')(arr) - return result - class EncodableSequences(object): """ @@ -198,7 +94,7 @@ class EncodableSequences(object): max_length=max_length) for sequence in self.sequences ] - self.encoding_cache[cache_key] = index_encoding( + self.encoding_cache[cache_key] = amino_acid.index_encoding( fixed_length_sequences, amino_acid.AMINO_ACID_INDEX) return self.encoding_cache[cache_key] @@ -242,9 +138,9 @@ class EncodableSequences(object): max_length=max_length) for sequence in self.sequences ] - result = fixed_vectors_encoding( + result = amino_acid.fixed_vectors_encoding( fixed_length_sequences, - ENCODING_DFS[vector_encoding_name].loc.__getitem__) + amino_acid.ENCODING_DFS[vector_encoding_name].loc.__getitem__) assert result.shape[0] == len(self.sequences) self.encoding_cache[cache_key] = result return self.encoding_cache[cache_key] diff --git a/mhcflurry/percent_rank_transform.py b/mhcflurry/percent_rank_transform.py new file mode 100644 index 0000000000000000000000000000000000000000..80c3240965c9229825b7ff436d729a3b5241f979 --- /dev/null +++ b/mhcflurry/percent_rank_transform.py @@ -0,0 +1,77 @@ +import numpy +import pandas + +class PercentRankTransform(object): + """ + Transform arbitrary values into percent ranks. + """ + + def __init__(self): + self.cdf = None + self.bin_edges = None + + def fit(self, values, bins): + """ + Fit the transform using the given values, which are used to + establish percentiles. + """ + assert self.cdf is None + assert self.bin_edges is None + assert len(values) > 0 + (hist, self.bin_edges) = numpy.histogram(values, bins=bins) + self.cdf = numpy.ones(len(hist) + 3) * numpy.nan + self.cdf[0] = 0.0 + self.cdf[1] = 0.0 + self.cdf[-1] = 100.0 + numpy.cumsum(hist * 100.0 / numpy.sum(hist), out=self.cdf[2:-1]) + assert not numpy.isnan(self.cdf).any() + + def transform(self, values): + """ + Return percent ranks (range [0, 100]) for the given values. + """ + assert self.cdf is not None + assert self.bin_edges is not None + indices = numpy.searchsorted(self.bin_edges, values) + result = self.cdf[indices] + assert len(result) == len(values) + return result + + def to_series(self): + """ + Serialize the fit to a pandas.Series. + + The index on the series gives the bin edges and the valeus give the CDF. + + Returns + ------- + pandas.Series + + """ + return pandas.Series( + self.cdf, index=[numpy.nan] + list(self.bin_edges) + [numpy.nan]) + + @staticmethod + def from_series(series): + """ + Deseralize a PercentRankTransform the given pandas.Series, as returned + by `to_series()`. + + Parameters + ---------- + series : pandas.Series + + Returns + ------- + PercentRankTransform + + """ + result = PercentRankTransform() + result.cdf = series.values + result.bin_edges = series.index.values[1:-1] + return result + + + + + diff --git a/test/test_encodable_sequences.py b/test/test_amino_acid.py similarity index 84% rename from test/test_encodable_sequences.py rename to test/test_amino_acid.py index cc47290a08eb05e3aa8f282f7cefbe57af669b5a..26c50b7bfc6e47c91d42bcd7d0747cd2581d5011 100644 --- a/test/test_encodable_sequences.py +++ b/test/test_amino_acid.py @@ -1,4 +1,4 @@ -from mhcflurry import encodable_sequences +from mhcflurry import amino_acid from nose.tools import eq_ from numpy.testing import assert_equal import numpy @@ -11,7 +11,7 @@ letter_to_index_dict = { def test_index_and_one_hot_encoding(): - index_encoding = encodable_sequences.index_encoding( + index_encoding = amino_acid.index_encoding( ["AAAA", "ABCA"], letter_to_index_dict) assert_equal( index_encoding, @@ -19,7 +19,7 @@ def test_index_and_one_hot_encoding(): [0, 0, 0, 0], [0, 1, 2, 0], ]) - one_hot = encodable_sequences.fixed_vectors_encoding( + one_hot = amino_acid.fixed_vectors_encoding( index_encoding, { 0: numpy.array([1, 0, 0]), diff --git a/test/test_percent_rank_transform.py b/test/test_percent_rank_transform.py new file mode 100644 index 0000000000000000000000000000000000000000..e30aa7bb162b6611facfd9539043d9409dcd0b06 --- /dev/null +++ b/test/test_percent_rank_transform.py @@ -0,0 +1,24 @@ +import numpy + +from mhcflurry.percent_rank_transform import PercentRankTransform + +from numpy.testing import assert_allclose, assert_equal + + +def test_percent_rank_transform(): + model = PercentRankTransform() + model.fit(numpy.arange(1000), bins=100) + assert_allclose( + model.transform([-2, 0, 50, 100, 2000]), + [0.0, 0.0, 5.0, 10.0, 100.0], + err_msg=str(model.__dict__)) + + model2 = PercentRankTransform.from_series(model.to_series()) + assert_allclose( + model2.transform([-2, 0, 50, 100, 2000]), + [0.0, 0.0, 5.0, 10.0, 100.0], + err_msg=str(model.__dict__)) + + assert_equal(model.cdf, model2.cdf) + assert_equal(model.bin_edges, model2.bin_edges) + diff --git a/test/test_train_allele_specific_models_command.py b/test/test_train_allele_specific_models_command.py index 5c3fbcfc961d1e2690c6dc46ca79262b11fe3451..f08963a1ebf8eeffb1c904cb90d65fbd9ae70b8e 100644 --- a/test/test_train_allele_specific_models_command.py +++ b/test/test_train_allele_specific_models_command.py @@ -21,6 +21,7 @@ HYPERPARAMETERS = [ "random_negative_rate": 0.0, "random_negative_constant": 25, + "peptide_amino_acid_encoding": "BLOSUM62", "use_embedding": False, "kmer_size": 15, "batch_normalization": False, @@ -50,29 +51,33 @@ HYPERPARAMETERS = [ def test_run(): - try: - models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models") - hyperparameters_filename = os.path.join( - models_dir, "hyperparameters.yaml") - with open(hyperparameters_filename, "w") as fd: - json.dump(HYPERPARAMETERS, fd) + models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models") + hyperparameters_filename = os.path.join( + models_dir, "hyperparameters.yaml") + with open(hyperparameters_filename, "w") as fd: + json.dump(HYPERPARAMETERS, fd) - args = [ - "--data", get_path("data_curated", "curated_training_data.csv.bz2"), - "--hyperparameters", hyperparameters_filename, - "--min-measurements-per-allele", "9000", - "--out-models-dir", models_dir, - ] - print("Running with args: %s" % args) - train_allele_specific_models_command.run(args) + args = [ + "--data", get_path("data_curated", "curated_training_data.csv.bz2"), + "--hyperparameters", hyperparameters_filename, + "--min-measurements-per-allele", "9000", + "--out-models-dir", models_dir, + "--percent-rank-calibration-num-peptides-per-length", "1000", + ] + print("Running with args: %s" % args) + train_allele_specific_models_command.run(args) - result = Class1AffinityPredictor.load(models_dir) - predictions = result.predict( + result = Class1AffinityPredictor.load(models_dir) + predictions = result.predict( + peptides=["SLYNTVATL"], + alleles=["HLA-A*02:01"]) + assert_equal(predictions.shape, (1,)) + assert_array_less(predictions, 500) + df = result.predict_to_dataframe( peptides=["SLYNTVATL"], alleles=["HLA-A*02:01"]) - assert_equal(predictions.shape, (1,)) - assert_array_less(predictions, 500) + print(df) + assert "prediction_percentile" in df.columns - finally: - print("Deleting: %s" % models_dir) - shutil.rmtree(models_dir) + print("Deleting: %s" % models_dir) + shutil.rmtree(models_dir)