Skip to content
Snippets Groups Projects
Commit 06b6ecc6 authored by Tim O'Donnell
Browse files

fixes

parent e65d7a79
No related branches found
No related tags found
No related merge requests found
# Class I pan-allele models (ensemble)
This download contains trained MHC Class I MHCflurry models.
This download contains trained MHC Class I MHCflurry models before model selection.
To generate this download run:
......
"""
Write and summarize model validation data, which is obtained by taking a full
dataset and removing the data used for training.
"""
import argparse
import sys
from os.path import abspath
import pandas
import numpy
from sklearn.model_selection import StratifiedKFold
# Command-line interface for this script. The module docstring is passed as
# ``usage``, so argparse prints it in place of the auto-generated usage line.
parser = argparse.ArgumentParser(usage=__doc__)

# Full dataset(s); rows from every file given here are concatenated.
parser.add_argument(
    "--include",
    metavar="INPUT.csv",
    nargs="+",
    help="Input CSV to include")

# Training dataset(s); (allele, peptide) pairs found here are removed.
parser.add_argument(
    "--exclude",
    metavar="INPUT.csv",
    nargs="+",
    help="Input CSV to exclude")

# Output locations. Both are optional; an output is skipped when not given.
parser.add_argument(
    "--out-data",
    metavar="RESULT.csv",
    help="Output data CSV")
parser.add_argument(
    "--out-summary",
    metavar="RESULT.csv",
    help="Output summary CSV")

# Adjacent string literals are concatenated without a separator, so the first
# fragment must end with a space to keep the sentences apart in --help output.
parser.add_argument(
    "--mass-spec-regex",
    metavar="REGEX",
    default="mass[- ]spec",
    help="Regular expression for mass-spec data. Runs on measurement_source "
    "col. Default: %(default)s.")

parser.add_argument(
    "--only-alleles-present-in-exclude",
    action="store_true",
    default=False,
    help="Filter to only alleles that are present in files given by --exclude. "
    "Useful for filtering to only alleles supported by a predictor, where the "
    "training data for the predictor is given by --exclude.")
def run(argv):
    """
    Write and summarize validation data for the given command-line arguments.

    Reads every ``--include`` CSV, keeps rows whose ``peptide`` is 8-15
    characters long, drops rows whose (allele, peptide) pair appears in any
    ``--exclude`` CSV, flags mass-spec rows, and writes the requested outputs.

    Columns read from the input data: ``peptide``, ``allele``,
    ``measurement_source``, ``measurement_value``. The ``mass_spec`` column
    is added, and ``measurement_inequality`` is set to ``"mass_spec"`` on
    rows whose ``measurement_source`` matches ``--mass-spec-regex``.

    Parameters
    ----------
    argv : list of str
        Command-line arguments, not including the program name.
    """
    args = parser.parse_args(argv)
    if not args.include:
        # Without this, iterating over None below fails with an opaque
        # TypeError; report it as a normal usage error instead.
        parser.error("at least one --include CSV is required")

    df = pandas.concat(
        [pandas.read_csv(filename) for filename in args.include],
        ignore_index=True)
    print("Loaded data with shape: %s" % str(df.shape))

    # DataFrame.ix was removed in pandas 1.0; a boolean mask works with .loc.
    peptide_lengths = df.peptide.str.len()
    df = df.loc[(peptide_lengths >= 8) & (peptide_lengths <= 15)]
    print("Subselected to 8-15mers: %s" % (str(df.shape)))

    if args.exclude:
        exclude_df = pandas.concat(
            [pandas.read_csv(filename) for filename in args.exclude],
            ignore_index=True)

        # A measurement is excluded when its (allele, peptide) pair occurs
        # anywhere in the excluded data. Temporary columns hold the join key
        # and the resulting flag, and are removed again afterwards.
        df["_key"] = df.allele + "__" + df.peptide
        exclude_df["_key"] = exclude_df.allele + "__" + exclude_df.peptide
        df["_excluded"] = df._key.isin(exclude_df._key.unique())
        print("Excluding measurements per allele (counts): ")
        print(df.groupby("allele")._excluded.sum())
        print("Excluding measurements per allele (fractions): ")
        print(df.groupby("allele")._excluded.mean())
        df = df.loc[~df._excluded]
        del df["_excluded"]
        del df["_key"]

        # Only meaningful together with --exclude, since the set of allowed
        # alleles is taken from the excluded data.
        if args.only_alleles_present_in_exclude:
            df = df.loc[df.allele.isin(exclude_df.allele.unique())]

    # na=False: rows with a missing measurement_source count as non-mass-spec.
    # Otherwise the NaN results cannot be used as a boolean mask just below.
    df["mass_spec"] = df.measurement_source.str.contains(
        args.mass_spec_regex, na=False)
    df.loc[df.mass_spec, "measurement_inequality"] = "mass_spec"

    if args.out_summary:
        # One row per allele, one column per measurement_inequality value,
        # holding counts of non-null measurement_value, plus a row total.
        summary_df = df.groupby(
            ["allele", "measurement_inequality"]
        )["measurement_value"].count().unstack().fillna(0).astype(int)
        summary_df["total"] = summary_df.sum(axis=1)
        summary_df.to_csv(args.out_summary)
        print("Wrote: %s" % args.out_summary)

    if args.out_data:
        df.to_csv(args.out_data, index=False)
        print("Wrote: %s" % args.out_data)


if __name__ == '__main__':
    run(sys.argv[1:])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment