diff --git a/mhcflurry/predict.py b/mhcflurry/predict.py index 0f432cf082e3b48ed29df056720cc498b68859c6..8b886b4e3546d6285bd210efadbcc57bb19a558a 100644 --- a/mhcflurry/predict.py +++ b/mhcflurry/predict.py @@ -64,3 +64,4 @@ def predict(alleles, peptides, loaders=None): result_dict["Peptide"].append(peptides[i]) result_dict["Prediction"].append(ic50) return pd.DataFrame(result_dict) + diff --git a/mhcflurry/predict_command.py b/mhcflurry/predict_command.py new file mode 100644 index 0000000000000000000000000000000000000000..dec8be97fe312e24834fd556fa40eea0794ff0e3 --- /dev/null +++ b/mhcflurry/predict_command.py @@ -0,0 +1,160 @@ +# Copyright (c) 2016. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Run MHCflurry predictor on specified peptide/allele pairs. + +Examples: + +Write a CSV file containing the contents of INPUT.csv plus an +additional column giving MHCflurry binding affinity predictions: + + mhcflurry-predict INPUT.csv --out RESULT.csv + +The input CSV file is expected to contain columns 'allele' and 'peptide'. +The predictions are written to a column called 'mhcflurry_prediction'. +These default column names may be changed with the --allele-column, +--peptide-column, and --prediction-column options. + +If --out is not specified, results are writtent to standard out. 
+ +You can also run on alleles and peptides specified on the commandline, in +which case predictions are written for all combinations of alleles and +peptides: + + mhcflurry-predict --alleles HLA-A0201 H-2Kb --peptides SIINFEKL DENDREKLLL +''' +from __future__ import ( + print_function, + division, + absolute_import, +) +import sys +import argparse +import logging +import pandas +import itertools + +from .downloads import get_path +from . import class1_allele_specific + +parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + +parser.add_argument( + "input", + metavar="FILE.csv", + nargs="?", + help="Input CSV") + +parser.add_argument( + "--out", + metavar="FILE.csv", + help="Output CSV") + +parser.add_argument( + "--alleles", + metavar="ALLELE", + nargs="+", + help="Alleles to predict (exclusive with --input)") + +parser.add_argument( + "--peptides", + metavar="PEPTIDE", + nargs="+", + help="Peptides to predict (exclusive with --input)") + +parser.add_argument( + "--allele-column", + metavar="NAME", + default="allele", + help="Input column name for alleles. Default: '%(default)s'") + +parser.add_argument( + "--peptide-column", + metavar="NAME", + default="peptide", + help="Input column name for peptides. Default: '%(default)s'") + +parser.add_argument( + "--prediction-column", + metavar="NAME", + default="mhcflurry_prediction", + help="Output column name for predictions. Default: '%(default)s'") + +parser.add_argument( + "--models-class1-allele-specific-single", + metavar="DIR", + default=get_path("models_class1_allele_specific_single"), + help="Directory containing class1 allele specific single models. 
" + "Default: '%(default)s'") + + +def run(argv=sys.argv[1:]): + args = parser.parse_args(argv) + + if args.input: + if args.alleles or args.peptides: + parser.error( + "If an input file is specified, do not specify --alleles " + "or --peptides") + df = pandas.read_csv(args.input) + print("Read input CSV with %d rows, columns are: %s" % ( + len(df), ", ".join(df.columns))) + for col in [args.allele_column, args.peptide_column]: + if col not in df.columns: + raise ValueError( + "No such column '%s' in CSV. Columns are: %s" % ( + col, ", ".join(["'%s'" % c for c in df.columns]))) + else: + if not args.alleles or not args.peptides: + parser.error( + "Specify either an input CSV file or both the " + "--alleles and --peptides arguments") + + pairs = list(itertools.product(args.alleles, args.peptides)) + df = pandas.DataFrame({ + "allele": [p[0] for p in pairs], + "peptide": [p[1] for p in pairs], + }) + print("Predicting for %d alleles and %d peptides = %d predictions" % ( + len(args.alleles), len(args.peptides), len(df))) + + class1_allele_specific_loader = ( + class1_allele_specific.load.Class1AlleleSpecificPredictorLoader( + args.models_class1_allele_specific_single)) + + predictions = {} # allele -> peptide -> value + for (allele, sub_df) in df.groupby(args.allele_column): + print(sub_df) + logging.info("Running %d predictions for allele %s" % ( + len(sub_df), allele)) + model = class1_allele_specific_loader.from_allele_name(allele) + peptides = sub_df[args.peptide_column].values + predictions[allele] = dict( + (peptide, prediction) + for (peptide, prediction) + in zip(peptides, model.predict(peptides))) + + logging.info("Collecting result") + df[args.prediction_column] = [ + predictions[row[args.allele_column]][row[args.peptide_column]] + for (_, row) in df.iterrows() + ] + + if args.out: + df.to_csv(args.out, index=False) + print("Wrote: %s" % args.out) + else: + df.to_csv(sys.stdout, index=False) diff --git a/setup.py b/setup.py index 
"""
Tests for the ``mhcflurry-predict`` command line entry point
(``mhcflurry.predict_command:run``).
"""
import tempfile
import os

import pandas
from numpy.testing import assert_equal

from mhcflurry import predict_command

TEST_CSV = '''
Allele,Peptide,Experiment
HLA-A0201,SYNFEKKL,17
HLA-B4403,AAAAAAAAA,17
HLA-B4403,PPPPPPPP,18
'''.strip()


def _make_temp_csv(contents=""):
    """
    Create a closed temporary .csv file holding `contents`; return its path.

    The handle is closed before returning so the command under test (and
    pandas) can reopen the path on every platform. The caller is responsible
    for deleting the file.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as handle:
        handle.write(contents.encode())
    return handle.name


def test_csv():
    """CSV input with custom column names gains one prediction column."""
    args = ["--allele-column", "Allele", "--peptide-column", "Peptide"]
    deletes = []
    try:
        input_path = _make_temp_csv(TEST_CSV)
        deletes.append(input_path)
        output_path = _make_temp_csv()
        deletes.append(output_path)
        full_args = [input_path] + args + ["--out", output_path]
        print("Running with args: %s" % full_args)
        predict_command.run(full_args)
        result = pandas.read_csv(output_path)
        print(result)
    finally:
        for delete in deletes:
            os.unlink(delete)

    # 3 input rows; 3 input columns plus the prediction column.
    assert_equal(result.shape, (3, 4))


def test_no_csv():
    """--alleles/--peptides mode predicts every allele x peptide pair."""
    args = [
        "--alleles", "HLA-A0201", "H-2Kb",
        "--peptides", "SIINFEKL", "DENDREKLLL", "PICKLE",
        "--prediction-column", "prediction",
    ]

    deletes = []
    try:
        output_path = _make_temp_csv()
        deletes.append(output_path)
        full_args = args + ["--out", output_path]
        print("Running with args: %s" % full_args)
        predict_command.run(full_args)
        result = pandas.read_csv(output_path)
        print(result)
    finally:
        for delete in deletes:
            os.unlink(delete)

    # 2 alleles x 3 peptides; allele, peptide and prediction columns.
    assert_equal(result.shape, (6, 3))
    # .loc replaces the .ix indexer, which newer pandas no longer provides.
    sub_result1 = (
        result.loc[result.peptide == "SIINFEKL"].set_index("allele"))
    assert (
        sub_result1.loc["H-2Kb"].prediction <
        sub_result1.loc["HLA-A0201"].prediction)