diff --git a/README.md b/README.md
index a4373e2731df3d5308211f075d4dc782e24741fa..e47650dd0155e75e90d0a0f505843c89ec6dfc9e 100644
--- a/README.md
+++ b/README.md
@@ -138,6 +138,7 @@ mhctools \
     --mhc-predictor mhcflurry \
     --input-fasta-file INPUT.fasta \
     --mhc-alleles A02:01,A03:01 \
+    --mhc-peptide-lengths 8,9,10,11 \
     --extract-subsequences \
     --out RESULT.csv
 ```
diff --git a/downloads-generation/cross_validation_class1/GENERATE.sh b/downloads-generation/cross_validation_class1/GENERATE.sh
index 658460f6f43ba4cd0a132710f44c8fa240b6d450..c35e0862a0b97ddfa20e65e6117c8cdf8917d8da 100755
--- a/downloads-generation/cross_validation_class1/GENERATE.sh
+++ b/downloads-generation/cross_validation_class1/GENERATE.sh
@@ -1,5 +1,10 @@
 #!/bin/bash
-
+#
+# Cross validation using the standard class I models.
+# Splits training data into 5 folds (stratifying on allele), trains and tests a
+# predictor on each (train, test) fold, and writes a summary CSV giving
+# performance for each allele on each fold.
+#
 set -e
 set -x
 
diff --git a/downloads-generation/cross_validation_class1/score.py b/downloads-generation/cross_validation_class1/score.py
index dcf366de2f9e4faf2a8fe885d8baa60b85b730bb..7af791c4ccf2bc6e7d4c77cda4d30999d7d40a2a 100644
--- a/downloads-generation/cross_validation_class1/score.py
+++ b/downloads-generation/cross_validation_class1/score.py
@@ -75,6 +75,8 @@ def run(argv):
         scores['kind'] = "single" if "single" in prediction_col else "ensemble"
         scores['train_size'] = allele_df[prediction_col].isnull().sum()
         scores['test_size'] = len(sub_df)
+
+        # make_scores returns a dict with entries "auc", "f1", "tau"
         scores.update(
             make_scores(
                 sub_df.measurement_value, sub_df[prediction_col]))
diff --git a/downloads-generation/cross_validation_class1/split_folds.py b/downloads-generation/cross_validation_class1/split_folds.py
index 3f95337ffeb9dfeedcaa301fc9c8ecd58125b7c7..dd49085fd6818a9a751419edce1aec38cb2f2eaf 100644
--- a/downloads-generation/cross_validation_class1/split_folds.py
+++ b/downloads-generation/cross_validation_class1/split_folds.py
@@ -6,6 +6,7 @@ import sys
 from os.path import abspath
 
 import pandas
+import numpy
 from sklearn.model_selection import StratifiedKFold
 
 parser = argparse.ArgumentParser(usage = __doc__)
@@ -68,18 +69,25 @@ def run(argv):
     else:
         alleles = list(
             allele_counts.ix[
-            allele_counts > args.min_measurements_per_allele
+                allele_counts > args.min_measurements_per_allele
             ].index)
 
-    df = df.ix[df.allele.isin(alleles)]
+    df = df.loc[df.allele.isin(alleles)].copy()
     print("Potentially subselected by allele to: %s" % str(df.shape))
 
     print("Data has %d alleles: %s" % (
         df.allele.nunique(), " ".join(df.allele.unique())))
+    print(df.head())
+
+    # Take log before taking median (in case of even number of samples).
+    df["measurement_value"] = numpy.log1p(df.measurement_value)
 
     df = df.groupby(["allele", "peptide"]).measurement_value.median().reset_index()
+    df["measurement_value"] = numpy.expm1(df.measurement_value)
     print("Took median for each duplicate peptide/allele pair: %s" % str(df.shape))
+    print(df.head())
+
     if args.subsample:
         df = df.head(args.subsample)
         print("Subsampled to: %s" % str(df.shape))
 
diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh
index 89982e6a9c0b1f7ea598912270fd428f9a5019c6..f0a3d54aece76755ab92f1a6764243f014f112b4 100755
--- a/downloads-generation/data_curated/GENERATE.sh
+++ b/downloads-generation/data_curated/GENERATE.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
-
+#
+# Create "curated" training data, which combines an IEDB download with additional
+# published data, removes unusable entries, normalizes allele names, and performs
+# other filtering and standardization.
+#
 set -e
 set -x
 
diff --git a/downloads-generation/data_iedb/GENERATE.sh b/downloads-generation/data_iedb/GENERATE.sh
index 55156647d17eb21d90e3b136519f151bb3bd5cd9..a6067a36eb10c18ad1b26a17edb38f656dd106ae 100755
--- a/downloads-generation/data_iedb/GENERATE.sh
+++ b/downloads-generation/data_iedb/GENERATE.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
-
+#
+# Download latest MHC I ligand data from IEDB.
+#
 set -e
 set -x
 
diff --git a/downloads-generation/data_kim2014/GENERATE.sh b/downloads-generation/data_kim2014/GENERATE.sh
index baf8a3a453804e47296fd0dcc7aff85b9118ae7e..dbda0fe8c12df076e5e8f3b96728eefda51f23c6 100755
--- a/downloads-generation/data_kim2014/GENERATE.sh
+++ b/downloads-generation/data_kim2014/GENERATE.sh
@@ -1,5 +1,8 @@
 #!/bin/bash
-
+#
+# Download some published MHC I ligand data from a location on Dropbox.
+#
+#
 set -e
 set -x
 
diff --git a/downloads-generation/models_class1/GENERATE.sh b/downloads-generation/models_class1/GENERATE.sh
index 2e509d2385196e8b95ff831425d3b52d3aa09814..b72334b536cca453300b52257eb8e9c65c7e0dd3 100755
--- a/downloads-generation/models_class1/GENERATE.sh
+++ b/downloads-generation/models_class1/GENERATE.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
-
+#
+# Train standard MHCflurry Class I models.
+# Calls mhcflurry-class1-train-allele-specific-models on curated training data
+# using the hyperparameters in "hyperparameters.yaml".
+#
 set -e
 set -x
 
diff --git a/downloads-generation/models_class1_experiments1/GENERATE.sh b/downloads-generation/models_class1_experiments1/GENERATE.sh
index 6edd43488d2331a8b7d3b3a875b35942d2fe856c..89921d924e819b8ac41c6e274730182664013e22 100755
--- a/downloads-generation/models_class1_experiments1/GENERATE.sh
+++ b/downloads-generation/models_class1_experiments1/GENERATE.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
-
+#
+# Train "experimental" models using various hyperparameter combinations.
+# This trains models only for a small number of alleles for which we have good
+# mass-spec validation data.
+#
 set -e
 set -x
 
diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py
index 78b3a854d8ae08624a5ecd47b28ad0d766eb9a32..ffa5cffc2ae71f1e15f3069244a577399e6bfa20 100644
--- a/mhcflurry/amino_acid.py
+++ b/mhcflurry/amino_acid.py
@@ -80,7 +80,7 @@ X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 """), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS]
 assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all()
 
-ENCODING_DFS = {
+ENCODING_DATA_FRAMES = {
     "BLOSUM62": BLOSUM62_MATRIX,
     "one-hot": pandas.DataFrame([
         [1 if i == j else 0 for i in range(len(AMINO_ACIDS))]
@@ -98,7 +98,7 @@ def available_vector_encodings():
     list of string
 
     """
-    return list(ENCODING_DFS)
+    return list(ENCODING_DATA_FRAMES)
 
 
 def vector_encoding_length(name):
@@ -113,7 +113,7 @@ def vector_encoding_length(name):
     -------
     int
     """
-    return ENCODING_DFS[name].shape[1]
+    return ENCODING_DATA_FRAMES[name].shape[1]
 
 
 def index_encoding(sequences, letter_to_index_dict):
diff --git a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py
index ada2e9e54cbf875a019db00f56c92cc2506e78bf..af94cfa05e167d49327a6f9cb5170952a909895d 100644
--- a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py
@@ -760,6 +760,19 @@ class Class1AffinityPredictor(object):
             'peptide': peptides.sequences,
             'allele': alleles,
         })
+        if len(df) == 0:
+            # No predictions.
+            logging.warning("Predicting for 0 peptides.")
+            empty_result = pandas.DataFrame(
+                columns=[
+                    'peptide',
+                    'allele',
+                    'prediction',
+                    'prediction_low',
+                    'prediction_high'
+                ])
+            return empty_result
+
         df["normalized_allele"] = df.allele.map(
             mhcnames.normalize_allele_name)
 
diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py
index b98fb862ff4a4c3ffed2dac6d3e4ffef4d8471db..8477938b12ffa19b175b89e05de60b2806fafc24 100644
--- a/mhcflurry/encodable_sequences.py
+++ b/mhcflurry/encodable_sequences.py
@@ -135,7 +135,7 @@ class EncodableSequences(object):
                     max_length=max_length))
             result = amino_acid.fixed_vectors_encoding(
                 fixed_length_sequences,
-                amino_acid.ENCODING_DFS[vector_encoding_name])
+                amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name])
             assert result.shape[0] == len(self.sequences)
             self.encoding_cache[cache_key] = result
         return self.encoding_cache[cache_key]
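A note on the cross-validation setup described in `downloads-generation/cross_validation_class1/GENERATE.sh`: the 5-fold, allele-stratified split is what the `StratifiedKFold` import in `split_folds.py` supports. The sketch below shows that kind of split on a made-up stand-in for the curated training data; the column names follow the script, but the allele names and values are illustrative only.

```python
import pandas
from sklearn.model_selection import StratifiedKFold

# Toy stand-in for the curated training data: one row per measurement.
# Allele names, peptides, and values here are made up for illustration.
df = pandas.DataFrame({
    "allele": ["HLA-A0201"] * 10 + ["HLA-B0702"] * 10,
    "peptide": ["PEPTIDE%02d" % i for i in range(20)],
    "measurement_value": [100.0 + i for i in range(20)],
})

# Stratifying on allele keeps every allele represented in each (train, test) fold.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold_num, (train_idx, test_idx) in enumerate(kfold.split(df, df.allele)):
    train, test = df.iloc[train_idx], df.iloc[test_idx]
    print(fold_num, train.allele.value_counts().to_dict(), len(test))
```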
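The `# Take log before taking median` comment in the `split_folds.py` hunk matters only for groups with an even number of measurements, where the median is the mean of the two middle values; taking that mean in log space gives a geometric-mean-style summary rather than an arithmetic one. A minimal self-contained sketch with hypothetical toy affinities:

```python
import numpy
import pandas

# Two hypothetical duplicate measurements (nM) for one allele/peptide pair.
df = pandas.DataFrame({
    "allele": ["HLA-A0201", "HLA-A0201"],
    "peptide": ["SIINFEKL", "SIINFEKL"],
    "measurement_value": [10.0, 1000.0],
})

# Median in linear space averages the two middle values: (10 + 1000) / 2 = 505.
linear_median = df.groupby(["allele", "peptide"]).measurement_value.median()

# Median in log space, inverted afterwards, behaves like a geometric mean:
# expm1((log1p(10) + log1p(1000)) / 2) is roughly 103.9.
df["measurement_value"] = numpy.log1p(df.measurement_value)
log_median = numpy.expm1(
    df.groupby(["allele", "peptide"]).measurement_value.median())

print(linear_median.iloc[0])  # 505.0
print(log_median.iloc[0])     # ~103.9
```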
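The `Class1AffinityPredictor` hunk returns early with an explicitly-columned empty DataFrame when zero peptides are passed. Listing the columns means callers can keep selecting and concatenating without special-casing the empty result; the sketch below illustrates this with plain pandas (only the column names are taken from the diff, everything else is illustrative).

```python
import pandas

RESULT_COLUMNS = [
    "peptide", "allele", "prediction", "prediction_low", "prediction_high"]

# An empty frame built without a column list has no schema; the guarded
# return in the diff instead carries the full set of expected columns.
bare = pandas.DataFrame()
empty_result = pandas.DataFrame(columns=RESULT_COLUMNS)

print(list(bare.columns))          # []
print(list(empty_result.columns))  # the five expected columns

# Downstream code can still select and concatenate without special cases.
print(empty_result[["peptide", "prediction"]].shape)      # (0, 2)
print(pandas.concat([empty_result, empty_result]).shape)  # (0, 5)
```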