Skip to content
Snippets Groups Projects
Commit cb347e11 authored by Alex Rubinsteyn's avatar Alex Rubinsteyn
Browse files

web API should return error on missing alleles, made hardcoded constants from...

web API should return error on missing alleles, made hardcoded constants from assay combining logic into commandline arguments
parent d0aa6351
No related branches found
No related tags found
No related merge requests found
...@@ -15,6 +15,7 @@ from __future__ import ( ...@@ -15,6 +15,7 @@ from __future__ import (
from os.path import join from os.path import join
import pickle import pickle
from collections import Counter from collections import Counter
import argparse
import pandas as pd import pandas as pd
...@@ -29,14 +30,53 @@ PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME) ...@@ -29,14 +30,53 @@ PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME)
OUTPUT_CSV_FILENAME = "combined_human_class1_dataset.csv" OUTPUT_CSV_FILENAME = "combined_human_class1_dataset.csv"
OUTPUT_CSV_PATH = join(CLASS1_DATA_DIRECTORY, OUTPUT_CSV_FILENAME) OUTPUT_CSV_PATH = join(CLASS1_DATA_DIRECTORY, OUTPUT_CSV_FILENAME)
parser = argparse.ArgumentParser()
parser.add_argument("--ic50-fraction-tolerance",
default=0.01,
type=float,
help=(
"How much can the IEDB and NetMHCpan IC50 differ and still be"
" considered compatible (as a fraction of the NetMHCpan value)"))
parser.add_argument("--min-assay-overlap-size",
type=int,
default=1,
help="Minimum number of entries overlapping between IEDB assay and NetMHCpan data")
parser.add_argument("--min-assay-fraction-same",
type=float,
help="Minimum fraction of peptides whose IC50 values agree with the NetMHCpan data",
default=0.9)
parser.add_argument("--iedb-pickle-path",
default=IEDB_PICKLE_PATH,
help="Path to .pickle file containing dictionary of IEDB assay datasets")
parser.add_argument("--netmhcpan-csv-path",
default=PETERS_CSV_PATH,
help="Path to CSV with NetMHCpan dataset from 2013 Peters paper")
parser.add_argument("--output-csv-path",
default=OUTPUT_CSV_PATH,
help="Path to CSV of combined assay results")
parser.add_argument("--extra-dataset-csv-path",
default=[],
action="append",
help="Additional CSV data source with columns (species, mhc, peptide, meas)")
if __name__ == "__main__": if __name__ == "__main__":
print("Reading %s..." % IEDB_PICKLE_PATH) args = parser.parse_args()
with open(IEDB_PICKLE_PATH, "r") as f:
print("Reading %s..." % args.iedb_pickle_path)
with open(args.iedb_pickle_path, "r") as f:
iedb_datasets = pickle.load(f) iedb_datasets = pickle.load(f)
print("Reading %s..." % PETERS_CSV_PATH) print("Reading %s..." % args.netmhcpan_csv_path)
nielsen_data = pd.read_csv(PETERS_CSV_PATH, sep="\t") nielsen_data = pd.read_csv(args.netmhcpan_csv_path, sep="\t")
print("Size of 2013 Peters dataset: %d" % len(nielsen_data)) print("Size of 2013 NetMHCpan dataset: %d" % len(nielsen_data))
new_allele_counts = Counter() new_allele_counts = Counter()
combined_columns = { combined_columns = {
...@@ -47,7 +87,11 @@ if __name__ == "__main__": ...@@ -47,7 +87,11 @@ if __name__ == "__main__":
"meas": list(nielsen_data["meas"]), "meas": list(nielsen_data["meas"]),
} }
for assay, assay_dataset in sorted(iedb_datasets.items(), key=lambda x: len(x[1])): all_datasets = {
path: pd.read_csv(path) for path in args.extra_dataset_csv_path
}
all_datasets.update(iedb_datasets)
for assay, assay_dataset in sorted(all_datasets.items(), key=lambda x: len(x[1])):
joined = nielsen_data.merge( joined = nielsen_data.merge(
assay_dataset, assay_dataset,
left_on=["mhc", "sequence"], left_on=["mhc", "sequence"],
...@@ -56,25 +100,27 @@ if __name__ == "__main__": ...@@ -56,25 +100,27 @@ if __name__ == "__main__":
if len(joined) == 0: if len(joined) == 0:
continue continue
# drop NaN binding values and entries without values in both datasets
# drop NaN binding values and entries without values in both datasets
left_missing = joined["meas"].isnull() left_missing = joined["meas"].isnull()
right_missing = joined["value"].isnull() right_missing = joined["value"].isnull()
overlap_filter_mask = ~(left_missing | right_missing) overlap_filter_mask = ~(left_missing | right_missing)
filtered = joined[overlap_filter_mask] filtered = joined[overlap_filter_mask]
n_overlap = len(filtered) n_overlap = len(filtered)
if n_overlap == 0:
if n_overlap < args.min_assay_overlap_size:
continue continue
# let's count what fraction of this IEDB assay is within 1% of the values in the # let's count what fraction of this IEDB assay is within 1% of the values in the
# Nielsen dataset # Nielsen dataset
similar_values = ( tolerance = filtered["meas"] * args.ic50_fraction_tolerance
(filtered["value"] - filtered["meas"]).abs() <= (filtered["meas"] / 100.0)) abs_diff = (filtered["value"] - filtered["meas"]).abs()
similar_values = abs_diff <= tolerance
fraction_similar = similar_values.mean() fraction_similar = similar_values.mean()
print("Assay=%s, count=%d" % (assay, len(assay_dataset))) print("Assay=%s, count=%d" % (assay, len(assay_dataset)))
print(" # entries w/ values in both data sets: %d" % n_overlap) print(" # entries w/ values in both data sets: %d" % n_overlap)
print(" fraction similar binding values=%0.4f" % fraction_similar) print(" fraction similar binding values=%0.4f" % fraction_similar)
new_peptides = joined[left_missing & ~right_missing] new_peptides = joined[left_missing & ~right_missing]
if fraction_similar > 0.9: if fraction_similar > args.min_assay_fraction_same:
print("---") print("---")
print("\t using assay: %s" % (assay,)) print("\t using assay: %s" % (assay,))
print("---") print("---")
...@@ -96,5 +142,5 @@ if __name__ == "__main__": ...@@ -96,5 +142,5 @@ if __name__ == "__main__":
print("Combined DataFrame size: %d (+%d)" % ( print("Combined DataFrame size: %d (+%d)" % (
len(combined_df), len(combined_df),
len(combined_df) - len(nielsen_data))) len(combined_df) - len(nielsen_data)))
print("Writing %s..." % OUTPUT_CSV_PATH) print("Writing %s..." % args.output_csv_path)
combined_df.to_csv(OUTPUT_CSV_PATH, index=False) combined_df.to_csv(args.output_csv_path, index=False)
...@@ -46,7 +46,10 @@ def get_binding_value(): ...@@ -46,7 +46,10 @@ def get_binding_value():
if alleles_string is None: if alleles_string is None:
return "ERROR: no allele given" return "ERROR: no allele given"
alleles_list = split_allele_names(alleles_string) alleles_list = split_allele_names(alleles_string)
result_df = predict(alleles=alleles_list, peptides=peptides_list) try:
result_df = predict(alleles=alleles_list, peptides=peptides_list)
except ValueError as e:
return "ERROR: %s" % e.args[0]
return result_df.to_csv(sep="\t", index=False, float_format="%0.4f") return result_df.to_csv(sep="\t", index=False, float_format="%0.4f")
@get('/alleles') @get('/alleles')
......
...@@ -81,6 +81,11 @@ parser.add_argument( ...@@ -81,6 +81,11 @@ parser.add_argument(
default=CSV_PATH, default=CSV_PATH,
help="CSV file with 'mhc', 'peptide', 'peptide_length', 'meas' columns") help="CSV file with 'mhc', 'peptide', 'peptide_length', 'meas' columns")
parser.add_argument("--min-samples-per-allele",
default=5,
help="Don't train predictors for alleles with fewer samples than this",
type=int)
if __name__ == "__main__": if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
...@@ -124,7 +129,7 @@ if __name__ == "__main__": ...@@ -124,7 +129,7 @@ if __name__ == "__main__":
if exists(path) and not args.overwrite: if exists(path) and not args.overwrite:
print("-- already exists, skipping") print("-- already exists, skipping")
continue continue
if n_allele < 10: if n_allele < args.min_samples_per_allele:
print("-- too few data points, skipping") print("-- too few data points, skipping")
continue continue
model.set_weights(old_weights) model.set_weights(old_weights)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment