From 138f917067fcca9a1ec8eb682b1e8c89fe83a02d Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Wed, 30 Oct 2019 14:49:37 -0400 Subject: [PATCH] update --- .../allele_sequences/GENERATE.sh | 11 +++-- .../allele_sequences/make_allele_sequences.py | 1 + .../models_class1_pan_variants/GENERATE.sh | 4 +- .../normalize_allele_names.py | 41 +++++++++++++++++++ mhcflurry/downloads.yml | 2 +- 5 files changed, 54 insertions(+), 5 deletions(-) create mode 100644 downloads-generation/models_class1_pan_variants/normalize_allele_names.py diff --git a/downloads-generation/allele_sequences/GENERATE.sh b/downloads-generation/allele_sequences/GENERATE.sh index 94271aa3..47791a8b 100755 --- a/downloads-generation/allele_sequences/GENERATE.sh +++ b/downloads-generation/allele_sequences/GENERATE.sh @@ -81,6 +81,11 @@ time python make_allele_sequences.py \ --differentiate-alleles training_data.alleles.txt \ --out-csv allele_sequences.csv +time python make_allele_sequences.py \ + class1.aligned.fasta \ + --recapitulate-sequences class1_pseudosequences.csv \ + --out-csv allele_sequences.no_differentiation.csv + # Cleanup gzip -f class1.fasta gzip -f class1.aligned.fasta @@ -88,6 +93,6 @@ rm *.fasta cp $SCRIPT_ABSOLUTE_PATH . bzip2 LOG.txt -tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * - -echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2" +RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2" +tar -cjf "$RESULT" * +echo "Created archive: $RESULT" \ No newline at end of file diff --git a/downloads-generation/allele_sequences/make_allele_sequences.py b/downloads-generation/allele_sequences/make_allele_sequences.py index d984dd34..cfd87efc 100644 --- a/downloads-generation/allele_sequences/make_allele_sequences.py +++ b/downloads-generation/allele_sequences/make_allele_sequences.py @@ -133,6 +133,7 @@ def run(): assert agreement > 0.9 # Add additional positions + additional_positions = [] if args.differentiate_alleles: differentiate_alleles = pandas.read_csv( args.differentiate_alleles).iloc[:,0].values diff --git a/downloads-generation/models_class1_pan_variants/GENERATE.sh b/downloads-generation/models_class1_pan_variants/GENERATE.sh index 48da9927..f9056fef 100755 --- a/downloads-generation/models_class1_pan_variants/GENERATE.sh +++ b/downloads-generation/models_class1_pan_variants/GENERATE.sh @@ -62,9 +62,11 @@ if [ "$2" != "continue-incomplete" ] then cp $SCRIPT_DIR/generate_hyperparameters.production.py . cp $SCRIPT_DIR/generate_hyperparameters.py . + cp $SCRIPT_DIR/normalize_allele_names.py . python generate_hyperparameters.production.py > hyperparameters.production.yaml python generate_hyperparameters.py hyperparameters.production.yaml no_pretrain > hyperparameters.no_pretrain.yaml python generate_hyperparameters.py hyperparameters.no_pretrain.yaml single_hidden > hyperparameters.single_hidden_no_pretrain.yaml + python normalize_allele_names.py "$(mhcflurry-downloads path allele_sequences)/class1_pseudosequences.csv" --out allele_sequences.34mer.csv fi for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence @@ -80,7 +82,7 @@ do HYPERPARAMETERS=hyperparameters.$kind.yaml if [ "$kind" == "34mer_sequence" ] then - ALLELE_SEQUENCES="$(mhcflurry-downloads path allele_sequences)/class1_pseudosequences.csv" + ALLELE_SEQUENCES=allele_sequences.34mer.csv HYPERPARAMETERS=hyperparameters.production.yaml fi diff --git a/downloads-generation/models_class1_pan_variants/normalize_allele_names.py b/downloads-generation/models_class1_pan_variants/normalize_allele_names.py new file mode 100644 index 00000000..0f0f3205 --- /dev/null +++ b/downloads-generation/models_class1_pan_variants/normalize_allele_names.py @@ -0,0 +1,41 @@ +""" +Normalize MHC allele names +""" + +from sys import argv +import os +import pandas +import mhcnames +import argparse + + +def normalize(s, disallowed=["MIC", "HFE"]): + if any(item in s for item in disallowed): + return None + try: + return mhcnames.normalize_allele_name(s) + except: + while s: + s = ":".join(s.split(":")[:-1]) + try: + return mhcnames.normalize_allele_name(s) + except: + pass + return None + + +parser = argparse.ArgumentParser(usage=__doc__) +parser.add_argument("input_csv") +parser.add_argument("--out", help="CSV output") + +args = parser.parse_args(argv[1:]) + +df = pandas.read_csv(args.input_csv) +print("Read df with shape", df.shape) +df["allele"] = df["allele"].map(normalize) +df = df.loc[~df.allele.isnull()] +print("Done normalizing. After removing unparseable names, shape is", df.shape) +df = df.drop_duplicates("allele") +print("After dropping duplicates", df.shape) +df.to_csv(args.out, index=False) +print("Wrote", os.path.abspath(args.out)) diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml index 29ca3745..0b756f71 100644 --- a/mhcflurry/downloads.yml +++ b/mhcflurry/downloads.yml @@ -66,7 +66,7 @@ releases: default: false - name: data_curated - url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_curated.20191011.tar.bz2 + url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_curated.20191030.tar.bz2 default: true # Older downloads -- GitLab