From 42ba21de05f4d200150c748b0a19ed067f824298 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Tue, 20 Feb 2018 15:45:51 -0500 Subject: [PATCH] remove obsolete downloads --- .../cross_validation_class1/GENERATE.sh | 90 ------------- .../cross_validation_class1/README.md | 7 - .../generate_hyperparameters.py | 1 - .../cross_validation_class1/score.py | 103 --------------- .../cross_validation_class1/split_folds.py | 121 ------------------ .../models_class1_experiments1/GENERATE.sh | 67 ---------- .../models_class1_experiments1/README.md | 5 - .../hyperparameters-0local.yaml | 45 ------- .../hyperparameters-0local_noL1.yaml | 45 ------- .../hyperparameters-2local.yaml | 55 -------- .../hyperparameters-dense32.yaml | 50 -------- .../hyperparameters-dense8.yaml | 50 -------- .../hyperparameters-embedding.yaml | 51 -------- .../hyperparameters-noL1.yaml | 50 -------- .../hyperparameters-onehot.yaml | 50 -------- .../hyperparameters-standard.yaml | 1 - .../hyperparameters-widelocal.yaml | 51 -------- .../train_allele_specific_models_command.py | 9 +- 18 files changed, 8 insertions(+), 843 deletions(-) delete mode 100755 downloads-generation/cross_validation_class1/GENERATE.sh delete mode 100644 downloads-generation/cross_validation_class1/README.md delete mode 120000 downloads-generation/cross_validation_class1/generate_hyperparameters.py delete mode 100644 downloads-generation/cross_validation_class1/score.py delete mode 100644 downloads-generation/cross_validation_class1/split_folds.py delete mode 100755 downloads-generation/models_class1_experiments1/GENERATE.sh delete mode 100644 downloads-generation/models_class1_experiments1/README.md delete mode 100644 downloads-generation/models_class1_experiments1/hyperparameters-0local.yaml delete mode 100644 downloads-generation/models_class1_experiments1/hyperparameters-0local_noL1.yaml delete mode 100644 downloads-generation/models_class1_experiments1/hyperparameters-2local.yaml delete mode 100644 downloads-generation/models_class1_experiments1/hyperparameters-dense32.yaml delete mode 100644 downloads-generation/models_class1_experiments1/hyperparameters-dense8.yaml delete mode 100644 downloads-generation/models_class1_experiments1/hyperparameters-embedding.yaml delete mode 100644 downloads-generation/models_class1_experiments1/hyperparameters-noL1.yaml delete mode 100644 downloads-generation/models_class1_experiments1/hyperparameters-onehot.yaml delete mode 120000 downloads-generation/models_class1_experiments1/hyperparameters-standard.yaml delete mode 100644 downloads-generation/models_class1_experiments1/hyperparameters-widelocal.yaml diff --git a/downloads-generation/cross_validation_class1/GENERATE.sh b/downloads-generation/cross_validation_class1/GENERATE.sh deleted file mode 100755 index e525c18d..00000000 --- a/downloads-generation/cross_validation_class1/GENERATE.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash -# -# Cross validation using the standard class I models. -# Splits training data into 5 folds (stratifying on allele), trains and tests a -# predictor on each (train, test) fold, and writes a summary CSV giving -# performance for each allele on each fold. -# -set -e -set -x - -DOWNLOAD_NAME=cross_validation_class1 -SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation -SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" -SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH") - -NFOLDS=5 - -mkdir -p "$SCRATCH_DIR" -rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" -mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" - -# Send stdout and stderr to a logfile included with the archive. -exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") -exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) - -# Log some environment info -date -pip freeze -git status - -cd $SCRATCH_DIR/$DOWNLOAD_NAME - -python $SCRIPT_DIR/generate_hyperparameters.py > hyperparameters.yaml - -cp $SCRIPT_DIR/split_folds.py . -cp $SCRIPT_DIR/score.py . - -time python split_folds.py \ - "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \ - --min-measurements-per-allele 75 \ - --folds $NFOLDS \ - --random-state 1 \ - --output-pattern-test "./test.fold_{}.csv" \ - --output-pattern-train "./train.fold_{}.csv" - -# Kill child processes if parent exits: -trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT - -for fold in $(seq 0 $(expr $NFOLDS - 1)) -do - mhcflurry-class1-train-allele-specific-models \ - --data train.fold_${fold}.csv \ - --hyperparameters hyperparameters.yaml \ - --out-models-dir models.fold_${fold} \ - --min-measurements-per-allele 0 \ - --num-jobs 8 \ - --percent-rank-calibration-num-peptides-per-length 0 \ - 2>&1 | tee -a LOG.train.fold_${fold}.txt & -done -wait - -echo "DONE TRAINING. NOW PREDICTING." - -for fold in $(seq 0 $(expr $NFOLDS - 1)) -do - mhcflurry-predict \ - test.fold_${fold}.csv \ - --models models.fold_${fold} \ - --no-throw \ - --include-individual-model-predictions \ - --out predictions.fold_${fold}.csv & -done -wait - -time python score.py \ - predictions.fold_*.csv \ - --out-combined predictions.combined.csv \ - --out-scores scores.csv \ - --out-summary summary.all.csv - -grep -v single summary.all.csv > summary.ensemble.csv - -cp $SCRIPT_ABSOLUTE_PATH . -for i in $(ls *.txt) -do - bzip2 $i -done -tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * - -echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2" diff --git a/downloads-generation/cross_validation_class1/README.md b/downloads-generation/cross_validation_class1/README.md deleted file mode 100644 index f7584e32..00000000 --- a/downloads-generation/cross_validation_class1/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# Cross validation of standard Class I models - -This download contains cross validation results and intermediate data for -class I allele-specific MHCflurry models. - -This exists to track the exact steps used to generate cross-validation results. -Users will probably not interact with this directly. \ No newline at end of file diff --git a/downloads-generation/cross_validation_class1/generate_hyperparameters.py b/downloads-generation/cross_validation_class1/generate_hyperparameters.py deleted file mode 120000 index 5f2599b4..00000000 --- a/downloads-generation/cross_validation_class1/generate_hyperparameters.py +++ /dev/null @@ -1 +0,0 @@ -../models_class1/generate_hyperparameters.py \ No newline at end of file diff --git a/downloads-generation/cross_validation_class1/score.py b/downloads-generation/cross_validation_class1/score.py deleted file mode 100644 index 7af791c4..00000000 --- a/downloads-generation/cross_validation_class1/score.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -Scoring script for cross-validation. -""" -import argparse -import sys -import collections - -import pandas -from mhcflurry.scoring import make_scores - - -parser = argparse.ArgumentParser(usage = __doc__) - -parser.add_argument( - "input", metavar="INPUT.csv", help="Input CSV", nargs="+") - -parser.add_argument( - "--out-scores", - metavar="RESULT.csv") - -parser.add_argument( - "--out-combined", - metavar="COMBINED.csv") - -parser.add_argument( - "--out-summary", - metavar="RESULT.csv") - -def run(argv): - args = parser.parse_args(argv) - - df = None - for (i, filename) in enumerate(args.input): - input_df = pandas.read_csv(filename) - assert not input_df.mhcflurry_prediction.isnull().any() - - cols_to_merge = [] - input_df["prediction_%d" % i] = input_df.mhcflurry_prediction - cols_to_merge.append(input_df.columns[-1]) - if 'mhcflurry_model_single_0' in input_df.columns: - input_df["prediction_single_%d" % i] = input_df.mhcflurry_model_single_0 - cols_to_merge.append(input_df.columns[-1]) - - if df is None: - df = input_df[ - ["allele", "peptide", "measurement_value"] + cols_to_merge - ].copy() - else: - df = pandas.merge( - df, - input_df[['allele', 'peptide'] + cols_to_merge], - on=['allele', 'peptide'], - how='outer') - - print("Loaded data:") - print(df.head(5)) - - if args.out_combined: - df.to_csv(args.out_combined, index=False) - print("Wrote: %s" % args.out_combined) - - prediction_cols = [ - c - for c in df.columns - if c.startswith("prediction_") - ] - - scores_rows = [] - for (allele, allele_df) in df.groupby("allele"): - for prediction_col in prediction_cols: - sub_df = allele_df.loc[~allele_df[prediction_col].isnull()] - scores = collections.OrderedDict() - scores['allele'] = allele - scores['fold'] = prediction_col.replace("prediction_", "").replace("single_", "") - scores['kind'] = "single" if "single" in prediction_col else "ensemble" - scores['train_size'] = allele_df[prediction_col].isnull().sum() - scores['test_size'] = len(sub_df) - - # make_scores returns a dict with entries "auc", "f1", "tau" - scores.update( - make_scores( - sub_df.measurement_value, sub_df[prediction_col])) - scores_rows.append(scores) - scores_df = pandas.DataFrame(scores_rows) - print(scores_df) - - if args.out_scores: - scores_df.to_csv(args.out_scores, index=False) - print("Wrote: %s" % args.out_scores) - - summary_df = scores_df.groupby(["allele", "kind"])[ - ["train_size", "test_size", "auc", "f1", "tau"] - ].mean().reset_index() - print("Summary:") - print(summary_df) - - if args.out_summary: - summary_df.to_csv(args.out_summary, index=False) - print("Wrote: %s" % args.out_summary) - -if __name__ == '__main__': - run(sys.argv[1:]) - diff --git a/downloads-generation/cross_validation_class1/split_folds.py b/downloads-generation/cross_validation_class1/split_folds.py deleted file mode 100644 index dd49085f..00000000 --- a/downloads-generation/cross_validation_class1/split_folds.py +++ /dev/null @@ -1,121 +0,0 @@ -""" -Split training data into CV folds. -""" -import argparse -import sys -from os.path import abspath - -import pandas -import numpy -from sklearn.model_selection import StratifiedKFold - -parser = argparse.ArgumentParser(usage = __doc__) - -parser.add_argument( - "input", metavar="INPUT.csv", help="Input CSV") - -parser.add_argument( - "--folds", metavar="N", type=int, default=5) - -parser.add_argument( - "--allele", - nargs="+", - help="Include only the specified allele(s)") - -parser.add_argument( - "--min-measurements-per-allele", - type=int, - metavar="N", - help="Use only alleles with >=N measurements.") - -parser.add_argument( - "--subsample", - type=int, - metavar="N", - help="Subsample to first N rows") - -parser.add_argument( - "--random-state", - metavar="N", - type=int, - help="Specify an int for deterministic splitting") - -parser.add_argument( - "--output-pattern-train", - default="./train.fold_{}.csv", - help="Pattern to use to generate output filename. Default: %(default)s") - -parser.add_argument( - "--output-pattern-test", - default="./test.fold_{}.csv", - help="Pattern to use to generate output filename. Default: %(default)s") - - -def run(argv): - args = parser.parse_args(argv) - - df = pandas.read_csv(args.input) - print("Loaded data with shape: %s" % str(df.shape)) - - df = df.ix[ - (df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15) - ] - print("Subselected to 8-15mers: %s" % (str(df.shape))) - - allele_counts = df.allele.value_counts() - - if args.allele: - alleles = args.allele - else: - alleles = list( - allele_counts.ix[ - allele_counts > args.min_measurements_per_allele - ].index) - - df = df.loc[df.allele.isin(alleles)].copy() - print("Potentially subselected by allele to: %s" % str(df.shape)) - - print("Data has %d alleles: %s" % ( - df.allele.nunique(), " ".join(df.allele.unique()))) - - print(df.head()) - - # Take log before taking median (in case of even number of samples). - df["measurement_value"] = numpy.log1p(df.measurement_value) - df = df.groupby(["allele", "peptide"]).measurement_value.median().reset_index() - df["measurement_value"] = numpy.expm1(df.measurement_value) - print("Took median for each duplicate peptide/allele pair: %s" % str(df.shape)) - - print(df.head()) - - if args.subsample: - df = df.head(args.subsample) - print("Subsampled to: %s" % str(df.shape)) - - kf = StratifiedKFold( - n_splits=args.folds, - shuffle=True, - random_state=args.random_state) - - # Stratify by both allele and binder vs. nonbinder. - df["key"] = [ - "%s_%s" % ( - row.allele, - "binder" if row.measurement_value < 500 else "nonbinder") - for (_, row) in df.iterrows() - ] - - for i, (train, test) in enumerate(kf.split(df, df.key)): - train_filename = args.output_pattern_train.format(i) - test_filename = args.output_pattern_test.format(i) - - df.iloc[train].to_csv(train_filename, index=False) - print("Wrote: %s" % abspath(train_filename)) - - df.iloc[test].to_csv(test_filename, index=False) - print("Wrote: %s" % abspath(test_filename)) - - -if __name__ == '__main__': - run(sys.argv[1:]) - diff --git a/downloads-generation/models_class1_experiments1/GENERATE.sh b/downloads-generation/models_class1_experiments1/GENERATE.sh deleted file mode 100755 index 89921d92..00000000 --- a/downloads-generation/models_class1_experiments1/GENERATE.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -# -# Train "experimental" models using various hyperparameter combinations. -# This trains models only for a small number of alleles for which we have good -# mass-spec validation data. -# -set -e -set -x - -DOWNLOAD_NAME=models_class1_experiments1 -SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation -SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" -SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH") - -# Terminate children on exit -trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT - -mkdir -p "$SCRATCH_DIR" -rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" -mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" - -# Send stdout and stderr to a logfile included with the archive. -exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") -exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) - -# Log some environment info -date -pip freeze -git status - -cd $SCRATCH_DIR/$DOWNLOAD_NAME - -ALLELES="HLA-A*01:01 HLA-A*02:01 HLA-A*02:03 HLA-A*02:07 HLA-A*03:01 HLA-A*11:01 HLA-A*24:02 HLA-A*29:02 HLA-A*31:01 HLA-A*68:02 HLA-B*07:02 HLA-B*15:01 HLA-B*35:01 HLA-B*44:02 HLA-B*44:03 HLA-B*51:01 HLA-B*54:01 HLA-B*57:01" - -# Standard architecture on quantitative only -cp $SCRIPT_DIR/hyperparameters-standard.yaml . -mkdir models-standard-quantitative -time mhcflurry-class1-train-allele-specific-models \ - --data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \ - --only-quantitative \ - --hyperparameters hyperparameters-standard.yaml \ - --out-models-dir models-standard-quantitative \ - --percent-rank-calibration-num-peptides-per-length 0 \ - --allele $ALLELES 2>&1 | tee -a LOG.standard.txt & - -# Model variations on qualitative + quantitative -for mod in 0local_noL1 0local 2local widelocal dense8 dense32 noL1 onehot embedding -do - cp $SCRIPT_DIR/hyperparameters-${mod}.yaml . - mkdir models-${mod} - time mhcflurry-class1-train-allele-specific-models \ - --data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \ - --hyperparameters hyperparameters-${mod}.yaml \ - --out-models-dir models-${mod} \ - --percent-rank-calibration-num-peptides-per-length 0 \ - --allele $ALLELES 2>&1 | tee -a LOG.${mod}.txt & -done -wait - -cp $SCRIPT_ABSOLUTE_PATH . -for i in $(ls *.txt) -do - bzip2 $i -done -tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * - -echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2" diff --git a/downloads-generation/models_class1_experiments1/README.md b/downloads-generation/models_class1_experiments1/README.md deleted file mode 100644 index 932411f0..00000000 --- a/downloads-generation/models_class1_experiments1/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Experimental class I allele-specific models (ensemble) - -This download contains trained MHC Class I allele-specific MHCflurry models -using a variety of experimental architectures. These were generated for a -publication and are not intended for production use. \ No newline at end of file diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-0local.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-0local.yaml deleted file mode 100644 index e39d96d6..00000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-0local.yaml +++ /dev/null @@ -1,45 +0,0 @@ -[{ -########################################## -# ENSEMBLE SIZE -########################################## -"n_models": 8, - -########################################## -# OPTIMIZATION -########################################## -"max_epochs": 500, -"patience": 10, -"early_stopping": true, -"validation_split": 0.2, -"minibatch_size": 128, - -########################################## -# RANDOM NEGATIVE PEPTIDES -########################################## -"random_negative_rate": 0.0, -"random_negative_constant": 25, -"random_negative_affinity_min": 20000.0, -"random_negative_affinity_max": 50000.0, - -########################################## -# PEPTIDE REPRESENTATION -########################################## -# One of "one-hot", "embedding", or "BLOSUM62". -"peptide_amino_acid_encoding": "BLOSUM62", -"use_embedding": false, # maintained for backward compatability -"kmer_size": 15, - -########################################## -# NEURAL NETWORK ARCHITECTURE -########################################## -"locally_connected_layers": [ -], -"activation": "relu", -"output_activation": "sigmoid", -"layer_sizes": [ - 16 -], -"dense_layer_l1_regularization": 0.001, -"batch_normalization": false, -"dropout_probability": 0.0, -}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-0local_noL1.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-0local_noL1.yaml deleted file mode 100644 index abe4d296..00000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-0local_noL1.yaml +++ /dev/null @@ -1,45 +0,0 @@ -[{ -########################################## -# ENSEMBLE SIZE -########################################## -"n_models": 8, - -########################################## -# OPTIMIZATION -########################################## -"max_epochs": 500, -"patience": 10, -"early_stopping": true, -"validation_split": 0.2, -"minibatch_size": 128, - -########################################## -# RANDOM NEGATIVE PEPTIDES -########################################## -"random_negative_rate": 0.0, -"random_negative_constant": 25, -"random_negative_affinity_min": 20000.0, -"random_negative_affinity_max": 50000.0, - -########################################## -# PEPTIDE REPRESENTATION -########################################## -# One of "one-hot", "embedding", or "BLOSUM62". -"peptide_amino_acid_encoding": "BLOSUM62", -"use_embedding": false, # maintained for backward compatability -"kmer_size": 15, - -########################################## -# NEURAL NETWORK ARCHITECTURE -########################################## -"locally_connected_layers": [ -], -"activation": "relu", -"output_activation": "sigmoid", -"layer_sizes": [ - 16 -], -"dense_layer_l1_regularization": 0.0, -"batch_normalization": false, -"dropout_probability": 0.0, -}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-2local.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-2local.yaml deleted file mode 100644 index 4ce0eea2..00000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-2local.yaml +++ /dev/null @@ -1,55 +0,0 @@ -[{ -########################################## -# ENSEMBLE SIZE -########################################## -"n_models": 8, - -########################################## -# OPTIMIZATION -########################################## -"max_epochs": 500, -"patience": 10, -"early_stopping": true, -"validation_split": 0.2, -"minibatch_size": 128, - -########################################## -# RANDOM NEGATIVE PEPTIDES -########################################## -"random_negative_rate": 0.0, -"random_negative_constant": 25, -"random_negative_affinity_min": 20000.0, -"random_negative_affinity_max": 50000.0, - -########################################## -# PEPTIDE REPRESENTATION -########################################## -# One of "one-hot", "embedding", or "BLOSUM62". -"peptide_amino_acid_encoding": "BLOSUM62", -"use_embedding": false, # maintained for backward compatability -"kmer_size": 15, - -########################################## -# NEURAL NETWORK ARCHITECTURE -########################################## -"locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - }, - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - } -], -"activation": "relu", -"output_activation": "sigmoid", -"layer_sizes": [ - 16 -], -"dense_layer_l1_regularization": 0.001, -"batch_normalization": false, -"dropout_probability": 0.0, -}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-dense32.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-dense32.yaml deleted file mode 100644 index ef6b334a..00000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-dense32.yaml +++ /dev/null @@ -1,50 +0,0 @@ -[{ -########################################## -# ENSEMBLE SIZE -########################################## -"n_models": 8, - -########################################## -# OPTIMIZATION -########################################## -"max_epochs": 500, -"patience": 10, -"early_stopping": true, -"validation_split": 0.2, -"minibatch_size": 128, - -########################################## -# RANDOM NEGATIVE PEPTIDES -########################################## -"random_negative_rate": 0.0, -"random_negative_constant": 25, -"random_negative_affinity_min": 20000.0, -"random_negative_affinity_max": 50000.0, - -########################################## -# PEPTIDE REPRESENTATION -########################################## -# One of "one-hot", "embedding", or "BLOSUM62". -"peptide_amino_acid_encoding": "BLOSUM62", -"use_embedding": false, # maintained for backward compatability -"kmer_size": 15, - -########################################## -# NEURAL NETWORK ARCHITECTURE -########################################## -"locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - } -], -"activation": "relu", -"output_activation": "sigmoid", -"layer_sizes": [ - 32 -], -"dense_layer_l1_regularization": 0.001, -"batch_normalization": false, -"dropout_probability": 0.0, -}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-dense8.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-dense8.yaml deleted file mode 100644 index b16983a8..00000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-dense8.yaml +++ /dev/null @@ -1,50 +0,0 @@ -[{ -########################################## -# ENSEMBLE SIZE -########################################## -"n_models": 8, - -########################################## -# OPTIMIZATION -########################################## -"max_epochs": 500, -"patience": 10, -"early_stopping": true, -"validation_split": 0.2, -"minibatch_size": 128, - -########################################## -# RANDOM NEGATIVE PEPTIDES -########################################## -"random_negative_rate": 0.0, -"random_negative_constant": 25, -"random_negative_affinity_min": 20000.0, -"random_negative_affinity_max": 50000.0, - -########################################## -# PEPTIDE REPRESENTATION -########################################## -# One of "one-hot", "embedding", or "BLOSUM62". -"peptide_amino_acid_encoding": "BLOSUM62", -"use_embedding": false, # maintained for backward compatability -"kmer_size": 15, - -########################################## -# NEURAL NETWORK ARCHITECTURE -########################################## -"locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - } -], -"activation": "relu", -"output_activation": "sigmoid", -"layer_sizes": [ - 8 -], -"dense_layer_l1_regularization": 0.001, -"batch_normalization": false, -"dropout_probability": 0.0, -}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-embedding.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-embedding.yaml deleted file mode 100644 index ea30d9eb..00000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-embedding.yaml +++ /dev/null @@ -1,51 +0,0 @@ -[{ -########################################## -# ENSEMBLE SIZE -########################################## -"n_models": 8, - -########################################## -# OPTIMIZATION -########################################## -"max_epochs": 500, -"patience": 10, -"early_stopping": true, -"validation_split": 0.2, -"minibatch_size": 128, - -########################################## -# RANDOM NEGATIVE PEPTIDES -########################################## -"random_negative_rate": 0.0, -"random_negative_constant": 25, -"random_negative_affinity_min": 20000.0, -"random_negative_affinity_max": 50000.0, - -########################################## -# PEPTIDE REPRESENTATION -########################################## -# One of "one-hot", "embedding", or "BLOSUM62". -"peptide_amino_acid_encoding": "embedding", -"use_embedding": true, # maintained for backward compatability -"embedding_output_dim": 8, # only used if using embedding -"kmer_size": 15, - -########################################## -# NEURAL NETWORK ARCHITECTURE -########################################## -"locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - } -], -"activation": "relu", -"output_activation": "sigmoid", -"layer_sizes": [ - 16 -], -"dense_layer_l1_regularization": 0.001, -"batch_normalization": false, -"dropout_probability": 0.0, -}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-noL1.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-noL1.yaml deleted file mode 100644 index 25c1942b..00000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-noL1.yaml +++ /dev/null @@ -1,50 +0,0 @@ -[{ -########################################## -# ENSEMBLE SIZE -########################################## -"n_models": 8, - -########################################## -# OPTIMIZATION -########################################## -"max_epochs": 500, -"patience": 10, -"early_stopping": true, -"validation_split": 0.2, -"minibatch_size": 128, - -########################################## -# RANDOM NEGATIVE PEPTIDES -########################################## -"random_negative_rate": 0.0, -"random_negative_constant": 25, -"random_negative_affinity_min": 20000.0, -"random_negative_affinity_max": 50000.0, - -########################################## -# PEPTIDE REPRESENTATION -########################################## -# One of "one-hot", "embedding", or "BLOSUM62". -"peptide_amino_acid_encoding": "BLOSUM62", -"use_embedding": false, # maintained for backward compatability -"kmer_size": 15, - -########################################## -# NEURAL NETWORK ARCHITECTURE -########################################## -"locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - } -], -"activation": "relu", -"output_activation": "sigmoid", -"layer_sizes": [ - 16 -], -"dense_layer_l1_regularization": 0.0, -"batch_normalization": false, -"dropout_probability": 0.0, -}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-onehot.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-onehot.yaml deleted file mode 100644 index d6c83f93..00000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-onehot.yaml +++ /dev/null @@ -1,50 +0,0 @@ -[{ -########################################## -# ENSEMBLE SIZE -########################################## -"n_models": 8, - -########################################## -# OPTIMIZATION -########################################## -"max_epochs": 500, -"patience": 10, -"early_stopping": true, -"validation_split": 0.2, -"minibatch_size": 128, - -########################################## -# RANDOM NEGATIVE PEPTIDES -########################################## -"random_negative_rate": 0.0, -"random_negative_constant": 25, -"random_negative_affinity_min": 20000.0, -"random_negative_affinity_max": 50000.0, - -########################################## -# PEPTIDE REPRESENTATION -########################################## -# One of "one-hot", "embedding", or "BLOSUM62". -"peptide_amino_acid_encoding": "one-hot", -"use_embedding": false, # maintained for backward compatability -"kmer_size": 15, - -########################################## -# NEURAL NETWORK ARCHITECTURE -########################################## -"locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - } -], -"activation": "relu", -"output_activation": "sigmoid", -"layer_sizes": [ - 16 -], -"dense_layer_l1_regularization": 0.001, -"batch_normalization": false, -"dropout_probability": 0.0, -}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-standard.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-standard.yaml deleted file mode 120000 index f32feef1..00000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-standard.yaml +++ /dev/null @@ -1 +0,0 @@ -../models_class1/hyperparameters.yaml \ No newline at end of file diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-widelocal.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-widelocal.yaml deleted file mode 100644 index 6123c79f..00000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-widelocal.yaml +++ /dev/null @@ -1,51 +0,0 @@ -[{ -########################################## -# ENSEMBLE SIZE -########################################## -"n_models": 8, - -########################################## -# OPTIMIZATION -########################################## -"max_epochs": 500, -"patience": 10, -"early_stopping": true, -"validation_split": 0.2, -"minibatch_size": 128, - -########################################## -# RANDOM NEGATIVE PEPTIDES -########################################## -"random_negative_rate": 0.0, -"random_negative_constant": 25, -"random_negative_affinity_min": 20000.0, -"random_negative_affinity_max": 50000.0, - -########################################## -# PEPTIDE REPRESENTATION -########################################## -# One of "one-hot", "embedding", or "BLOSUM62". -"peptide_amino_acid_encoding": "BLOSUM62", -"use_embedding": false, # maintained for backward compatability -"embedding_output_dim": 8, # only used if using embedding -"kmer_size": 15, - -########################################## -# NEURAL NETWORK ARCHITECTURE -########################################## -"locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 5 - } -], -"activation": "relu", -"output_activation": "sigmoid", -"layer_sizes": [ - 16 -], -"dense_layer_l1_regularization": 0.001, -"batch_normalization": false, -"dropout_probability": 0.0, -}] diff --git a/mhcflurry/train_allele_specific_models_command.py b/mhcflurry/train_allele_specific_models_command.py index 6084e874..c67eed70 100644 --- a/mhcflurry/train_allele_specific_models_command.py +++ b/mhcflurry/train_allele_specific_models_command.py @@ -10,6 +10,7 @@ import traceback import random from functools import partial +import numpy import pandas import yaml from sklearn.metrics.pairwise import cosine_similarity @@ -413,6 +414,8 @@ def train_model( def subselect_df_held_out(df, recriprocal_held_out_fraction=10, seed=0): + df["allele_peptide"] = df.allele + "_" + df.peptide + kf = StratifiedKFold( n_splits=recriprocal_held_out_fraction, shuffle=True, @@ -425,8 +428,12 @@ def subselect_df_held_out(df, recriprocal_held_out_fraction=10, seed=0): "binder" if row.measurement_value <= 500 else "nonbinder") for (_, row) in df.iterrows() ] + (train, test) = next(kf.split(df, df.key)) - return df.iloc[train] + selected_allele_peptides = df.iloc[train].allele_peptide.unique() + result_df = df.allele_peptide.isin(selected_allele_peptides) + del result_df["allele_peptide"] + return result_df if __name__ == '__main__': run() -- GitLab