From cb347e11e5e75425e1f2638bca3159c7e8567a7f Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Tue, 28 Jul 2015 15:01:07 -0400
Subject: [PATCH] web API should return error on missing alleles, made
 hardcoded constants from assay combining logic into commandline arguments

---
 scripts/create-combined-class1-dataset.py  | 72 +++++++++++++++----
 scripts/mhcflurry-class1-web-server.py     |  5 +-
 .../train-class1-allele-specific-models.py |  7 +-
 3 files changed, 69 insertions(+), 15 deletions(-)

diff --git a/scripts/create-combined-class1-dataset.py b/scripts/create-combined-class1-dataset.py
index cf8bbb62..0395b20c 100755
--- a/scripts/create-combined-class1-dataset.py
+++ b/scripts/create-combined-class1-dataset.py
@@ -15,6 +15,7 @@ from __future__ import (
 from os.path import join
 import pickle
 from collections import Counter
+import argparse
 
 import pandas as pd
 
@@ -29,14 +30,53 @@ PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME)
 OUTPUT_CSV_FILENAME = "combined_human_class1_dataset.csv"
 OUTPUT_CSV_PATH = join(CLASS1_DATA_DIRECTORY, OUTPUT_CSV_FILENAME)
 
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--ic50-fraction-tolerance",
+    default=0.01,
+    type=float,
+    help=(
+        "How much can the IEDB and NetMHCpan IC50 differ and still be"
+        " considered compatible (as a fraction of the NetMHCpan value)"))
+
+parser.add_argument("--min-assay-overlap-size",
+    type=int,
+    default=1,
+    help="Minimum number of entries overlapping between IEDB assay and NetMHCpan data")
+
+
+parser.add_argument("--min-assay-fraction-same",
+    type=float,
+    help="Minimum fraction of peptides whose IC50 values agree with the NetMHCpan data",
+    default=0.9)
+
+parser.add_argument("--iedb-pickle-path",
+    default=IEDB_PICKLE_PATH,
+    help="Path to .pickle file containing dictionary of IEDB assay datasets")
+
+parser.add_argument("--netmhcpan-csv-path",
+    default=PETERS_CSV_PATH,
+    help="Path to CSV with NetMHCpan dataset from 2013 Peters paper")
+
+parser.add_argument("--output-csv-path",
+    default=OUTPUT_CSV_PATH,
+    help="Path to CSV of combined assay results")
+
+parser.add_argument("--extra-dataset-csv-path",
+    default=[],
+    action="append",
+    help="Additional CSV data source with columns (species, mhc, peptide, meas)")
+
 if __name__ == "__main__":
-    print("Reading %s..." % IEDB_PICKLE_PATH)
-    with open(IEDB_PICKLE_PATH, "rb") as f:
+    args = parser.parse_args()
+
+    print("Reading %s..." % args.iedb_pickle_path)
+    with open(args.iedb_pickle_path, "rb") as f:
         iedb_datasets = pickle.load(f)
-    print("Reading %s..." % PETERS_CSV_PATH)
-    nielsen_data = pd.read_csv(PETERS_CSV_PATH, sep="\t")
-    print("Size of 2013 Peters dataset: %d" % len(nielsen_data))
+    print("Reading %s..." % args.netmhcpan_csv_path)
+    nielsen_data = pd.read_csv(args.netmhcpan_csv_path, sep="\t")
+    print("Size of 2013 NetMHCpan dataset: %d" % len(nielsen_data))
 
 
     new_allele_counts = Counter()
     combined_columns = {
@@ -47,7 +87,11 @@ if __name__ == "__main__":
         "meas": list(nielsen_data["meas"]),
     }
 
-    for assay, assay_dataset in sorted(iedb_datasets.items(), key=lambda x: len(x[1])):
+    all_datasets = {
+        path: pd.read_csv(path) for path in args.extra_dataset_csv_path
+    }
+    all_datasets.update(iedb_datasets)
+    for assay, assay_dataset in sorted(all_datasets.items(), key=lambda x: len(x[1])):
         joined = nielsen_data.merge(
             assay_dataset,
             left_on=["mhc", "sequence"],
@@ -56,25 +100,27 @@
 
         if len(joined) == 0:
             continue
-        # drop NaN binding values and entries without values in both datasets 
+        # drop NaN binding values and entries without values in both datasets
         left_missing = joined["meas"].isnull()
         right_missing = joined["value"].isnull()
         overlap_filter_mask = ~(left_missing | right_missing)
         filtered = joined[overlap_filter_mask]
         n_overlap = len(filtered)
-        if n_overlap == 0:
+
+        if n_overlap < args.min_assay_overlap_size:
             continue
 
         # let's count what fraction of this IEDB assay is within 1% of the values in the
         # Nielsen dataset
-        similar_values = (
-            (filtered["value"] - filtered["meas"]).abs() <= (filtered["meas"] / 100.0))
+        tolerance = filtered["meas"] * args.ic50_fraction_tolerance
+        abs_diff = (filtered["value"] - filtered["meas"]).abs()
+        similar_values = abs_diff <= tolerance
         fraction_similar = similar_values.mean()
         print("Assay=%s, count=%d" % (assay, len(assay_dataset)))
         print(" # entries w/ values in both data sets: %d" % n_overlap)
         print(" fraction similar binding values=%0.4f" % fraction_similar)
         new_peptides = joined[left_missing & ~right_missing]
-        if fraction_similar > 0.9:
+        if fraction_similar > args.min_assay_fraction_same:
             print("---")
             print("\t using assay: %s" % (assay,))
             print("---")
@@ -96,5 +142,5 @@ if __name__ == "__main__":
     print("Combined DataFrame size: %d (+%d)" % (
         len(combined_df),
         len(combined_df) - len(nielsen_data)))
-    print("Writing %s..." % OUTPUT_CSV_PATH)
-    combined_df.to_csv(OUTPUT_CSV_PATH, index=False)
+    print("Writing %s..." % args.output_csv_path)
+    combined_df.to_csv(args.output_csv_path, index=False)
diff --git a/scripts/mhcflurry-class1-web-server.py b/scripts/mhcflurry-class1-web-server.py
index 0f6dca94..4fe36f34 100755
--- a/scripts/mhcflurry-class1-web-server.py
+++ b/scripts/mhcflurry-class1-web-server.py
@@ -46,7 +46,10 @@ def get_binding_value():
     if alleles_string is None:
         return "ERROR: no allele given"
     alleles_list = split_allele_names(alleles_string)
-    result_df = predict(alleles=alleles_list, peptides=peptides_list)
+    try:
+        result_df = predict(alleles=alleles_list, peptides=peptides_list)
+    except ValueError as e:
+        return "ERROR: %s" % e.args[0]
     return result_df.to_csv(sep="\t", index=False, float_format="%0.4f")
 
 @get('/alleles')
diff --git a/scripts/train-class1-allele-specific-models.py b/scripts/train-class1-allele-specific-models.py
index a2f00280..a10e68b1 100755
--- a/scripts/train-class1-allele-specific-models.py
+++ b/scripts/train-class1-allele-specific-models.py
@@ -81,6 +81,11 @@ parser.add_argument(
     default=CSV_PATH,
     help="CSV file with 'mhc', 'peptide', 'peptide_length', 'meas' columns")
 
+parser.add_argument("--min-samples-per-allele",
+    default=5,
+    help="Don't train predictors for alleles with fewer samples than this",
+    type=int)
+
 if __name__ == "__main__":
     args = parser.parse_args()
 
@@ -124,7 +129,7 @@
         if exists(path) and not args.overwrite:
             print("-- already exists, skipping")
             continue
-        if n_allele < 10:
+        if n_allele < args.min_samples_per_allele:
            print("-- too few data points, skipping")
            continue
        model.set_weights(old_weights)
-- 
GitLab