diff --git a/downloads-generation/models_class1_pan_variants/GENERATE.sh b/downloads-generation/models_class1_pan_variants/GENERATE.sh index 3e526c0597b18313ed98e79961bc4ece5e783f36..bbbdb94b4b793922cbad682a9c5ea50ef7455e9b 100755 --- a/downloads-generation/models_class1_pan_variants/GENERATE.sh +++ b/downloads-generation/models_class1_pan_variants/GENERATE.sh @@ -62,15 +62,13 @@ if [ "$2" != "continue-incomplete" ] then cp $SCRIPT_DIR/generate_hyperparameters.production.py . cp $SCRIPT_DIR/generate_hyperparameters.py . - cp $SCRIPT_DIR/normalize_allele_names.py . python generate_hyperparameters.production.py > hyperparameters.production.yaml python generate_hyperparameters.py hyperparameters.production.yaml no_pretrain > hyperparameters.no_pretrain.yaml python generate_hyperparameters.py hyperparameters.no_pretrain.yaml single_hidden > hyperparameters.single_hidden_no_pretrain.yaml python generate_hyperparameters.py hyperparameters.production.yaml compact_peptide > hyperparameters.compact_peptide.yaml - python normalize_allele_names.py "$(mhcflurry-downloads path allele_sequences)/class1_pseudosequences.csv" --out allele_sequences.34mer.csv fi -for kind in 34mer_sequence single_hidden_no_pretrain no_pretrain compact_peptide +for kind in 34mer_sequence single_hidden_no_pretrain no_pretrain compact_peptide no_additional_ms ms_only do CONTINUE_INCOMPLETE_ARGS="" if [ "$2" == "continue-incomplete" ] && [ -d "models.unselected.${kind}" ] @@ -83,12 +81,22 @@ do HYPERPARAMETERS=hyperparameters.$kind.yaml if [ "$kind" == "34mer_sequence" ] then - ALLELE_SEQUENCES=allele_sequences.34mer.csv + ALLELE_SEQUENCES="$(mhcflurry-downloads path allele_sequences)/allele_sequences.no_differentiation.csv" HYPERPARAMETERS=hyperparameters.production.yaml fi + TRAINING_DATA="$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" + if [ "$kind" == "no_additional_ms" ] + then + TRAINING_DATA="$(mhcflurry-downloads path data_curated)/curated_training_data.no_additional_ms.csv.bz2" + fi + if [ "$kind" == "ms_only" ] + then + TRAINING_DATA="$(mhcflurry-downloads path data_curated)/curated_training_data.mass_spec.csv.bz2" + fi + mhcflurry-class1-train-pan-allele-models \ - --data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \ + --data "$TRAINING_DATA" \ --allele-sequences "$ALLELE_SEQUENCES" \ --pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \ --held-out-measurements-per-allele-fraction-and-max 0.25 100 \ @@ -101,7 +109,7 @@ done echo "Done training. Beginning model selection." -for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence compact_peptide +for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence compact_peptide no_additional_ms ms_only do MODELS_DIR="models.unselected.${kind}" mhcflurry-class1-select-pan-allele-models \ diff --git a/downloads-generation/models_class1_pan_variants/normalize_allele_names.py b/downloads-generation/models_class1_pan_variants/normalize_allele_names.py deleted file mode 100644 index 0f0f320522c2eb8d99247b289ebcb90a7942238c..0000000000000000000000000000000000000000 --- a/downloads-generation/models_class1_pan_variants/normalize_allele_names.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Normalize MHC allele names -""" - -from sys import argv -import os -import pandas -import mhcnames -import argparse - - -def normalize(s, disallowed=["MIC", "HFE"]): - if any(item in s for item in disallowed): - return None - try: - return mhcnames.normalize_allele_name(s) - except: - while s: - s = ":".join(s.split(":")[:-1]) - try: - return mhcnames.normalize_allele_name(s) - except: - pass - return None - - -parser = argparse.ArgumentParser(usage=__doc__) -parser.add_argument("input_csv") -parser.add_argument("--out", help="CSV output") - -args = parser.parse_args(argv[1:]) - -df = pandas.read_csv(args.input_csv) -print("Read df with shape", df.shape) -df["allele"] = df["allele"].map(normalize) -df = df.loc[~df.allele.isnull()] -print("Done normalizing. After removing unparseable names, shape is", df.shape) -df = df.drop_duplicates("allele") -print("After dropping duplicates", df.shape) -df.to_csv(args.out, index=False) -print("Wrote", os.path.abspath(args.out)) diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml index a9eb33bdf2f044447002f4ed8c83533e8c4f9e97..d527f46e8fb6add5a82688473f6d32a724970d9b 100644 --- a/mhcflurry/downloads.yml +++ b/mhcflurry/downloads.yml @@ -86,7 +86,7 @@ releases: default: false - name: data_curated - url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_curated.20191226.tar.bz2 + url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_curated.20200101.tar.bz2 default: true # Older downloads