From 0f9394e2571bec4da61efd16f6b8bd71583cc8b6 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Mon, 4 Dec 2017 13:39:32 -0500 Subject: [PATCH] Address code review comments --- README.md | 1 + .../cross_validation_class1/GENERATE.sh | 7 ++++++- .../cross_validation_class1/score.py | 2 ++ .../cross_validation_class1/split_folds.py | 12 ++++++++++-- downloads-generation/data_curated/GENERATE.sh | 6 +++++- downloads-generation/data_iedb/GENERATE.sh | 4 +++- downloads-generation/data_kim2014/GENERATE.sh | 5 ++++- downloads-generation/models_class1/GENERATE.sh | 6 +++++- .../models_class1_experiments1/GENERATE.sh | 6 +++++- mhcflurry/amino_acid.py | 6 +++--- .../class1_affinity_predictor.py | 13 +++++++++++++ mhcflurry/encodable_sequences.py | 2 +- 12 files changed, 58 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index a4373e27..e47650dd 100644 --- a/README.md +++ b/README.md @@ -138,6 +138,7 @@ mhctools \ --mhc-predictor mhcflurry \ --input-fasta-file INPUT.fasta \ --mhc-alleles A02:01,A03:01 \ + --mhc-peptide-lengths 8,9,10,11 \ --extract-subsequences \ --out RESULT.csv ``` diff --git a/downloads-generation/cross_validation_class1/GENERATE.sh b/downloads-generation/cross_validation_class1/GENERATE.sh index 658460f6..c35e0862 100755 --- a/downloads-generation/cross_validation_class1/GENERATE.sh +++ b/downloads-generation/cross_validation_class1/GENERATE.sh @@ -1,5 +1,10 @@ #!/bin/bash - +# +# Cross validation using the standard class I models. +# Splits training data into 5 folds (stratifying on allele), trains and tests a +# predictor on each (train, test) fold, and writes a summary CSV giving +# performance for each allele on each fold. +# set -e set -x diff --git a/downloads-generation/cross_validation_class1/score.py b/downloads-generation/cross_validation_class1/score.py index dcf366de..7af791c4 100644 --- a/downloads-generation/cross_validation_class1/score.py +++ b/downloads-generation/cross_validation_class1/score.py @@ -75,6 +75,8 @@ def run(argv): scores['kind'] = "single" if "single" in prediction_col else "ensemble" scores['train_size'] = allele_df[prediction_col].isnull().sum() scores['test_size'] = len(sub_df) + + # make_scores returns a dict with entries "auc", "f1", "tau" scores.update( make_scores( sub_df.measurement_value, sub_df[prediction_col])) diff --git a/downloads-generation/cross_validation_class1/split_folds.py b/downloads-generation/cross_validation_class1/split_folds.py index 3f95337f..dd49085f 100644 --- a/downloads-generation/cross_validation_class1/split_folds.py +++ b/downloads-generation/cross_validation_class1/split_folds.py @@ -6,6 +6,7 @@ import sys from os.path import abspath import pandas +import numpy from sklearn.model_selection import StratifiedKFold parser = argparse.ArgumentParser(usage = __doc__) @@ -68,18 +69,25 @@ def run(argv): else: alleles = list( allele_counts.ix[ - allele_counts > args.min_measurements_per_allele + allele_counts > args.min_measurements_per_allele ].index) - df = df.ix[df.allele.isin(alleles)] + df = df.loc[df.allele.isin(alleles)].copy() print("Potentially subselected by allele to: %s" % str(df.shape)) print("Data has %d alleles: %s" % ( df.allele.nunique(), " ".join(df.allele.unique()))) + print(df.head()) + + # Take log before taking median (in case of even number of samples). + df["measurement_value"] = numpy.log1p(df.measurement_value) df = df.groupby(["allele", "peptide"]).measurement_value.median().reset_index() + df["measurement_value"] = numpy.expm1(df.measurement_value) print("Took median for each duplicate peptide/allele pair: %s" % str(df.shape)) + print(df.head()) + if args.subsample: df = df.head(args.subsample) print("Subsampled to: %s" % str(df.shape)) diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh index 89982e6a..f0a3d54a 100755 --- a/downloads-generation/data_curated/GENERATE.sh +++ b/downloads-generation/data_curated/GENERATE.sh @@ -1,5 +1,9 @@ #!/bin/bash - +# +# Create "curated" training data, which combines an IEDB download with additional +# published data, removes unusable entries, normalizes allele name, and performs +# other filtering and standardization. +# set -e set -x diff --git a/downloads-generation/data_iedb/GENERATE.sh b/downloads-generation/data_iedb/GENERATE.sh index 55156647..a6067a36 100755 --- a/downloads-generation/data_iedb/GENERATE.sh +++ b/downloads-generation/data_iedb/GENERATE.sh @@ -1,5 +1,7 @@ #!/bin/bash - +# +# Download latest MHC I ligand data from IEDB. +# set -e set -x diff --git a/downloads-generation/data_kim2014/GENERATE.sh b/downloads-generation/data_kim2014/GENERATE.sh index baf8a3a4..dbda0fe8 100755 --- a/downloads-generation/data_kim2014/GENERATE.sh +++ b/downloads-generation/data_kim2014/GENERATE.sh @@ -1,5 +1,8 @@ #!/bin/bash - +# +# Download some published MHC I ligand data from a location on Dropbox. +# +# set -e set -x diff --git a/downloads-generation/models_class1/GENERATE.sh b/downloads-generation/models_class1/GENERATE.sh index 2e509d23..b72334b5 100755 --- a/downloads-generation/models_class1/GENERATE.sh +++ b/downloads-generation/models_class1/GENERATE.sh @@ -1,5 +1,9 @@ #!/bin/bash - +# +# Train standard MHCflurry Class I models. +# Calls mhcflurry-class1-train-allele-specific-models on curated training data +# using the hyperparameters in "hyperparameters.yaml". +# set -e set -x diff --git a/downloads-generation/models_class1_experiments1/GENERATE.sh b/downloads-generation/models_class1_experiments1/GENERATE.sh index 6edd4348..89921d92 100755 --- a/downloads-generation/models_class1_experiments1/GENERATE.sh +++ b/downloads-generation/models_class1_experiments1/GENERATE.sh @@ -1,5 +1,9 @@ #!/bin/bash - +# +# Train "experimental" models using various hyperparameter combinations. +# This trains models only for a small number of alleles for which we have good +# mass-spec validation data. +# set -e set -x diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py index 78b3a854..ffa5cffc 100644 --- a/mhcflurry/amino_acid.py +++ b/mhcflurry/amino_acid.py @@ -80,7 +80,7 @@ X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 """), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS] assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all() -ENCODING_DFS = { +ENCODING_DATA_FRAMES = { "BLOSUM62": BLOSUM62_MATRIX, "one-hot": pandas.DataFrame([ [1 if i == j else 0 for i in range(len(AMINO_ACIDS))] @@ -98,7 +98,7 @@ def available_vector_encodings(): list of string """ - return list(ENCODING_DFS) + return list(ENCODING_DATA_FRAMES) def vector_encoding_length(name): @@ -113,7 +113,7 @@ def vector_encoding_length(name): ------- int """ - return ENCODING_DFS[name].shape[1] + return ENCODING_DATA_FRAMES[name].shape[1] def index_encoding(sequences, letter_to_index_dict): diff --git a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py index ada2e9e5..af94cfa0 100644 --- a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py @@ -760,6 +760,19 @@ class Class1AffinityPredictor(object): 'peptide': peptides.sequences, 'allele': alleles, }) + if len(df) == 0: + # No predictions. + logging.warning("Predicting for 0 peptides.") + empty_result = pandas.DataFrame( + columns=[ + 'peptide', + 'allele', + 'prediction', + 'prediction_low', + 'prediction_high' + ]) + return empty_result + df["normalized_allele"] = df.allele.map( mhcnames.normalize_allele_name) diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py index b98fb862..8477938b 100644 --- a/mhcflurry/encodable_sequences.py +++ b/mhcflurry/encodable_sequences.py @@ -135,7 +135,7 @@ class EncodableSequences(object): max_length=max_length)) result = amino_acid.fixed_vectors_encoding( fixed_length_sequences, - amino_acid.ENCODING_DFS[vector_encoding_name]) + amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name]) assert result.shape[0] == len(self.sequences) self.encoding_cache[cache_key] = result return self.encoding_cache[cache_key] -- GitLab