diff --git a/downloads-generation/models_class1_pan_unselected/README.md b/downloads-generation/models_class1_pan_unselected/README.md index add1df33bd2c70557485846ba141668706043fc6..628d14d7ae833ead0f67856586d26b1f2aa015da 100644 --- a/downloads-generation/models_class1_pan_unselected/README.md +++ b/downloads-generation/models_class1_pan_unselected/README.md @@ -1,6 +1,6 @@ # Class I pan-allele models (ensemble) -This download contains trained MHC Class I MHCflurry models. +This download contains trained MHC Class I MHCflurry models before model selection. To generate this download run: diff --git a/downloads-generation/models_class1_pan_unselected/write_validation_data.py b/downloads-generation/models_class1_pan_unselected/write_validation_data.py deleted file mode 100644 index 28ad4a38a7c6953cbd57fbd6480093d06ffd7437..0000000000000000000000000000000000000000 --- a/downloads-generation/models_class1_pan_unselected/write_validation_data.py +++ /dev/null @@ -1,105 +0,0 @@ -""" -Write and summarize model validation data, which is obtained by taking a full -dataset and removing the data used for training. - -""" -import argparse -import sys -from os.path import abspath - -import pandas -import numpy -from sklearn.model_selection import StratifiedKFold - -parser = argparse.ArgumentParser(usage = __doc__) - -parser.add_argument( - "--include", - metavar="INPUT.csv", - nargs="+", - help="Input CSV to include") -parser.add_argument( - "--exclude", - metavar="INPUT.csv", - nargs="+", - help="Input CSV to exclude") -parser.add_argument( - "--out-data", - metavar="RESULT.csv", - help="Output dadta CSV") -parser.add_argument( - "--out-summary", - metavar="RESULT.csv", - help="Output summary CSV") -parser.add_argument( - "--mass-spec-regex", - metavar="REGEX", - default="mass[- ]spec", - help="Regular expression for mass-spec data. Runs on measurement_source col." - "Default: %(default)s.") -parser.add_argument( - "--only-alleles-present-in-exclude", - action="store_true", - default=False, - help="Filter to only alleles that are present in files given by --exclude. " - "Useful for filtering to only alleles supported by a predictor, where the " - "training data for the predictor is given by --exclude.") - - -def run(argv): - args = parser.parse_args(argv) - - dfs = [] - for input in args.include: - df = pandas.read_csv(input) - dfs.append(df) - df = pandas.concat(dfs, ignore_index=True) - print("Loaded data with shape: %s" % str(df.shape)) - del dfs - - df = df.ix[ - (df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15) - ] - print("Subselected to 8-15mers: %s" % (str(df.shape))) - - if args.exclude: - exclude_dfs = [] - for exclude in args.exclude: - exclude_df = pandas.read_csv(exclude) - exclude_dfs.append(exclude_df) - exclude_df = pandas.concat(exclude_dfs, ignore_index=True) - del exclude_dfs - - df["_key"] = df.allele + "__" + df.peptide - exclude_df["_key"] = exclude_df.allele + "__" + exclude_df.peptide - df["_excluded"] = df._key.isin(exclude_df._key.unique()) - print("Excluding measurements per allele (counts): ") - print(df.groupby("allele")._excluded.sum()) - - print("Excluding measurements per allele (fractions): ") - print(df.groupby("allele")._excluded.mean()) - - df = df.loc[~df._excluded] - del df["_excluded"] - del df["_key"] - - if args.only_alleles_present_in_exclude: - df = df.loc[df.allele.isin(exclude_df.allele.unique())] - - df["mass_spec"] = df.measurement_source.str.contains(args.mass_spec_regex) - df.loc[df.mass_spec , "measurement_inequality"] = "mass_spec" - - if args.out_summary: - summary_df = df.groupby( - ["allele", "measurement_inequality"] - )["measurement_value"].count().unstack().fillna(0).astype(int) - summary_df["total"] = summary_df.sum(1) - summary_df.to_csv(args.out_summary) - print("Wrote: %s" % args.out_summary) - - if args.out_data: - df.to_csv(args.out_data, index=False) - print("Wrote: %s" % args.out_data) - -if __name__ == '__main__': - run(sys.argv[1:])