From ba9360750a7481cf8345c663cdb7c22b944da7f6 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Sat, 18 Jan 2020 17:09:03 -0500 Subject: [PATCH] fix --- downloads-generation/models_class1_cleavage/GENERATE.sh | 2 +- .../models_class1_cleavage/make_train_data.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/downloads-generation/models_class1_cleavage/GENERATE.sh b/downloads-generation/models_class1_cleavage/GENERATE.sh index 730ed339..2a27b160 100755 --- a/downloads-generation/models_class1_cleavage/GENERATE.sh +++ b/downloads-generation/models_class1_cleavage/GENERATE.sh @@ -95,7 +95,7 @@ else --predictions "$(mhcflurry-downloads path data_mass_spec_benchmark)/predictions/all.mhcflurry.combined" \ --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \ --ppv-multiplier 100 \ - --hit-multiplier-to-take 1 \ + --hit-multiplier-to-take 2 \ --out "$(pwd)/train_data.csv" bzip2 -f train_data.csv fi diff --git a/downloads-generation/models_class1_cleavage/make_train_data.py b/downloads-generation/models_class1_cleavage/make_train_data.py index 0b3e361c..9200f824 100644 --- a/downloads-generation/models_class1_cleavage/make_train_data.py +++ b/downloads-generation/models_class1_cleavage/make_train_data.py @@ -80,9 +80,7 @@ def load_predictions(dirname, result_df=None, columns=None): manifest_df = manifest_df.loc[manifest_df.col.isin(result_df.columns)] - print("Will load", len(peptides), "peptides and", len(manifest_df), "cols") - - for _, row in tqdm.tqdm(manifest_df.iterrows(), total=len(manifest_df)): + for _, row in manifest_df.iterrows(): with open(os.path.join(dirname, row.path), "rb") as fd: value = numpy.load(fd)['arr_0'] if mask is not None: @@ -109,6 +107,9 @@ def run(): print("Subselected to %d monoallelic samples" % hit_df.sample_id.nunique()) hit_df["allele"] = hit_df.hla + hit_df = hit_df.loc[hit_df.allele.str.match("^HLA-[ABC]")] + print("Subselected to %d HLA-A/B/C samples" % hit_df.sample_id.nunique()) + if args.exclude_contig: new_hit_df = hit_df.loc[ hit_df.protein_primary_ensembl_contig.astype(str) != -- GitLab