diff --git a/.travis.yml b/.travis.yml index 079d7e82b72b642a597589aa7c7aa589b2d8a24a..159fd2d88ca6c63d76aae7ffcc9e0f1ef603f73b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -37,7 +37,8 @@ install: - pip install coveralls env: global: - - PYTHONHASHSEED=0 + - PYTHONHASHSEED=0 + - MKL_THREADING_LAYER=GNU # for theano matrix: - KERAS_BACKEND=theano - KERAS_BACKEND=tensorflow diff --git a/README.md b/README.md index ae8b7355a0203fc302ff71e11adef6b051cc3bc6..cc0c93af37fac147f6a30cb630d712703906a778 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[](https://travis-ci.org/hammerlab/mhcflurry) [](https://coveralls.io/github/hammerlab/mhcflurry?branch=master) +[](https://travis-ci.org/openvax/mhcflurry) # mhcflurry [MHC I](https://en.wikipedia.org/wiki/MHC_class_I) ligand @@ -34,9 +34,9 @@ You can now generate predictions: $ mhcflurry-predict \ --alleles HLA-A0201 HLA-A0301 \ --peptides SIINFEKL SIINFEKD SIINFEKQ \ - --out /tmp/predictions.csv \ + --out /tmp/predictions.csv Wrote: /tmp/predictions.csv ``` -See the [documentation](http://openvax.github.io/mhcflurry/) for more details. +See the [documentation](http://openvax.github.io/mhcflurry/) for more details. \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile index 3dfdf85d46949236d6a6fcd33a897ee201828804..8c1bf9d5c429ffa198fed21a652998f6f5f528e5 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -56,7 +56,6 @@ generate: mhcflurry-downloads fetch models_class1 cross_validation_class1 python generate.py \ --out-models-cv-rst _build/_models_cv.rst \ - --out-models-architecture-png _build/_models_architecture.png \ --out-models-info-rst _build/_models_info.rst \ --out-models-supported-alleles-rst _build/_models_supported_alleles.rst diff --git a/docs/commandline_tutorial.rst b/docs/commandline_tutorial.rst index 6dc803c81b40657c2be83f8d5106a46654fb1c73..ade306d6f4a781df3080cad53547a5e6eee37f29 100644 --- a/docs/commandline_tutorial.rst +++ b/docs/commandline_tutorial.rst @@ -111,7 +111,7 @@ training data. The data we use for our released predictors can be downloaded wit It looks like this: .. command-output:: - bzcat "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" | head -n 3 + bzcat "$(mhcflurry-downloads path data_curated)/curated_training_data.no_mass_spec.csv.bz2" | head -n 3 :shell: :nostderr: diff --git a/docs/conf.py b/docs/conf.py index be1d9ae6aa14dab0664790112c8df8957fdd7682..98df4244ca721115c27b62f72aea8c2bf9f6613a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -86,7 +86,7 @@ author = 'Timothy O\'Donnell' # The short X.Y version. # Added by Tim: reading version from mhcflurry __init__.py as in setup.py -with open('../mhcflurry/__init__.py', 'r') as f: +with open('../mhcflurry/version.py', 'r') as f: version = re.search( r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', f.read(), diff --git a/docs/generate.py b/docs/generate.py index 848d04ce5a15c0d8391a4c1bd81e90b4a5229939..fb39ade21ba133c8775cf6a6d5c06bbdd9db3b60 100644 --- a/docs/generate.py +++ b/docs/generate.py @@ -4,11 +4,14 @@ Generate certain RST files used in documentation. 
import sys import argparse +import json from textwrap import wrap +from collections import OrderedDict import pypandoc import pandas from keras.utils.vis_utils import plot_model +from tabulate import tabulate from mhcflurry import __version__ from mhcflurry.downloads import get_path @@ -89,18 +92,66 @@ def go(argv): # Architecture information rst if predictor is None: predictor = Class1AffinityPredictor.load(args.class1_models_dir) - network = predictor.neural_networks[0].network() - lines = [] - network.summary(print_fn=lines.append) + + representative_networks = OrderedDict() + for network in predictor.neural_networks: + config = json.dumps(network.hyperparameters) + if config not in representative_networks: + representative_networks[config] = network + + all_hyperparameters = [ + network.hyperparameters for network in representative_networks.values() + ] + hyperparameter_keys = all_hyperparameters[0].keys() + assert all( + hyperparameters.keys() == hyperparameter_keys + for hyperparameters in all_hyperparameters) + + constant_hyperparameter_keys = [ + k for k in hyperparameter_keys + if all([ + hyperparameters[k] == all_hyperparameters[0][k] + for hyperparameters in all_hyperparameters + ]) + ] + constant_hypeparameters = dict( + (key, all_hyperparameters[0][key]) + for key in sorted(constant_hyperparameter_keys) + ) + + def write_hyperparameters(fd, hyperparameters): + rows = [] + for key in sorted(hyperparameters.keys()): + rows.append((key, json.dumps(hyperparameters[key]))) + fd.write("\n") + fd.write( + tabulate(rows, ["Hyperparameter", "Value"], tablefmt="grid")) with open(args.out_models_info_rst, "w") as fd: - fd.write("Layers and parameters summary: ") - fd.write("\n\n::\n\n") - for line in lines: - fd.write(" ") - fd.write(line) + fd.write("Hyperparameters shared by all %d architectures:\n" % + len(representative_networks)) + write_hyperparameters(fd, constant_hypeparameters) + fd.write("\n") + for (i, network) in enumerate(representative_networks.values()): + lines = [] + network.network().summary(print_fn=lines.append) + + fd.write("Architecture %d / %d:\n" % ( + (i + 1, len(representative_networks)))) + fd.write("+" * 40) fd.write("\n") - print("Wrote: %s" % args.out_models_info_rst) + write_hyperparameters( + fd, + dict( + (key, value) + for (key, value) in network.hyperparameters.items() + if key not in constant_hypeparameters)) + fd.write("\n\n::\n\n") + for line in lines: + fd.write(" ") + fd.write(line) + fd.write("\n") + print("Wrote: %s" % args.out_models_info_rst) if args.out_models_cv_rst: # Models cv output diff --git a/docs/models.rst b/docs/models.rst index 4f7dee9edb93dcf686b830795aad79bf93c74bd7..96f2d1243e8975a3d6a3b31d7a88bbdf903680f8 100644 --- a/docs/models.rst +++ b/docs/models.rst @@ -1,35 +1,28 @@ Details on the released models =============================== -The released MHCflurry predictor consists of an ensemble of eight models for each -supported allele. Each model in the ensemble was trained on a random 80% sample -of the data for the allele, and the remaining 20% was used for early stopping. -All models use the same architecture. The predictions are taken to be the geometric -mean of the nM binding affinity predictions of the individual models. The script -we run to train these models is in "downloads-generation/models_class1/GENERATE.sh" -in the repository. - -Neural network architecture +The released MHCflurry predictor consists of an ensemble of models for each +supported allele. 
Each model in the ensemble was trained on a random 90% sample +of the data for the allele, and the remaining data was used for early stopping. +The predictions are taken to be the geometric mean of the nM binding affinity +predictions of the individual models whose predictions fall in the middle 50% of +values for a given prediction. The script we run to train these models is in +"downloads-generation/models_class1/GENERATE.sh" in the repository. + +Neural network architectures ------------------------------------------------------------- -The neural network architecture is quite simple, consisting of a locally -connected layer, a dense layer, and a sigmoid output. - .. include:: /_build/_models_info.rst -Architecture diagram: - -.. image:: /_build/_models_architecture.png - -Cross validation performance -------------------------------------------------------------- +.. Cross validation performance +.. ------------------------------------------------------------- -The accuracy of the MHCflurry downloadable models was estimated using 5-fold cross -validation on the training data. The values shown here are the mean cross validation -scores across folds. +.. The accuracy of the MHCflurry downloadable models was estimated using 5-fold cross +.. validation on the training data. The values shown here are the mean cross validation +.. scores across folds. -The AUC and F1 estimates use a 500 nM cutoff for distinguishing strong-binders -from weak- or non-binders. The Kendall Tau score gives the rank correlation -between the predicted and measured affinities; it uses no cutoff. +.. The AUC and F1 estimates use a 500 nM cutoff for distinguishing strong-binders +.. from weak- or non-binders. The Kendall Tau score gives the rank correlation +.. between the predicted and measured affinities; it uses no cutoff. -.. include:: /_build/_models_cv.rst +.. .. include:: /_build/_models_cv.rst diff --git a/docs/python_tutorial.rst b/docs/python_tutorial.rst index f8a9379984b9fec2ce7a31a41ac60b29bb8c8881..5840db46484f64b430a96fb9ec86694af0328a01 100644 --- a/docs/python_tutorial.rst +++ b/docs/python_tutorial.rst @@ -65,7 +65,7 @@ We can get the path to this data from Python using `mhcflurry.downloads.get_path .. runblock:: pycon >>> from mhcflurry.downloads import get_path - >>> data_path = get_path("data_curated", "curated_training_data.csv.bz2") + >>> data_path = get_path("data_curated", "curated_training_data.no_mass_spec.csv.bz2") >>> data_path Now let's load it with pandas and filter to reasonably-sized peptides: @@ -87,15 +87,16 @@ some models. >>> single_allele_train_data = df.loc[df.allele == "HLA-B*57:01"].sample(100) >>> new_predictor.fit_allele_specific_predictors( ... n_models=1, - ... architecture_hyperparameters={ + ... architecture_hyperparameters_list=[{ ... "layer_sizes": [16], ... "max_epochs": 5, ... "random_negative_constant": 5, - ... }, + ... }], ... peptides=single_allele_train_data.peptide.values, ... affinities=single_allele_train_data.measurement_value.values, ... allele="HLA-B*57:01") + The `~mhcflurry.Class1AffinityPredictor.fit_allele_specific_predictors` method can be called any number of times on the same instance to build up ensembles of models across alleles. 
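For instance, a second call on the same predictor builds up the ensemble for another allele. This is a sketch reusing the `df` and `new_predictor` objects from the tutorial above; the allele chosen here is only illustrative:

```python
>>> more_train_data = df.loc[df.allele == "HLA-A*02:01"].sample(100)
>>> new_predictor.fit_allele_specific_predictors(
...     n_models=1,
...     architecture_hyperparameters_list=[{
...         "layer_sizes": [16],
...         "max_epochs": 5,
...         "random_negative_constant": 5,
...     }],
...     peptides=more_train_data.peptide.values,
...     affinities=more_train_data.measurement_value.values,
...     allele="HLA-A*02:01")
```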
The `architecture_hyperparameters` we specified are diff --git a/docs/requirements.txt b/docs/requirements.txt index a88547504c1d7ede3af6ca6d67010fc0614ade96..af205bbbf1b5613450cffece92b58334dd06ea34 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -7,3 +7,4 @@ numpydoc pypandoc mhctools pydot +tabulate diff --git a/downloads-generation/cross_validation_class1/GENERATE.sh b/downloads-generation/cross_validation_class1/GENERATE.sh index c35e0862a0b97ddfa20e65e6117c8cdf8917d8da..e525c18d3ef0969e3f159bb752df2b0b03f71e84 100755 --- a/downloads-generation/cross_validation_class1/GENERATE.sh +++ b/downloads-generation/cross_validation_class1/GENERATE.sh @@ -30,12 +30,13 @@ git status cd $SCRATCH_DIR/$DOWNLOAD_NAME -cp $SCRIPT_DIR/hyperparameters.yaml . +python $SCRIPT_DIR/generate_hyperparameters.py > hyperparameters.yaml + cp $SCRIPT_DIR/split_folds.py . cp $SCRIPT_DIR/score.py . time python split_folds.py \ - "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \ + "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \ --min-measurements-per-allele 75 \ --folds $NFOLDS \ --random-state 1 \ @@ -52,6 +53,7 @@ do --hyperparameters hyperparameters.yaml \ --out-models-dir models.fold_${fold} \ --min-measurements-per-allele 0 \ + --num-jobs 8 \ --percent-rank-calibration-num-peptides-per-length 0 \ 2>&1 | tee -a LOG.train.fold_${fold}.txt & done diff --git a/downloads-generation/cross_validation_class1/generate_hyperparameters.py b/downloads-generation/cross_validation_class1/generate_hyperparameters.py new file mode 120000 index 0000000000000000000000000000000000000000..5f2599b4b2b4c418d1c296b43c55be87bb4d85ce --- /dev/null +++ b/downloads-generation/cross_validation_class1/generate_hyperparameters.py @@ -0,0 +1 @@ +../models_class1/generate_hyperparameters.py \ No newline at end of file diff --git a/downloads-generation/cross_validation_class1/hyperparameters.yaml b/downloads-generation/cross_validation_class1/hyperparameters.yaml deleted file mode 120000 index f32feef1682d757437fe1f0cee2a31c030d7508f..0000000000000000000000000000000000000000 --- a/downloads-generation/cross_validation_class1/hyperparameters.yaml +++ /dev/null @@ -1 +0,0 @@ -../models_class1/hyperparameters.yaml \ No newline at end of file diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh index f0a3d54aece76755ab92f1a6764243f014f112b4..11eb52ce42dc8a26e34e2b659e76f70559c35eca 100755 --- a/downloads-generation/data_curated/GENERATE.sh +++ b/downloads-generation/data_curated/GENERATE.sh @@ -30,14 +30,31 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME cp $SCRIPT_DIR/curate.py . +# No mass-spec data time python curate.py \ --data-iedb \ "$(mhcflurry-downloads path data_iedb)/mhc_ligand_full.csv.bz2" \ --data-kim2014 \ - "$(mhcflurry-downloads path data_kim2014)/bdata.20130222.mhci.public.1.txt" \ - --out-csv curated_training_data.csv + "$(mhcflurry-downloads path data_published)/bdata.20130222.mhci.public.1.txt" \ + --out-csv curated_training_data.no_mass_spec.csv + +# With mass-spec data +# Note that we STILL drop mass-spec data from IEDB here, since this data seems +# low-quality. 
+time python curate.py \ + --data-iedb \ + "$(mhcflurry-downloads path data_iedb)/mhc_ligand_full.csv.bz2" \ + --data-kim2014 \ + "$(mhcflurry-downloads path data_published)/bdata.20130222.mhci.public.1.txt" \ + --data-systemhc-atlas \ + "$(mhcflurry-downloads path data_systemhcatlas)/data.csv.bz2" \ + --data-abelin-mass-spec \ + "$(mhcflurry-downloads path data_published)/abelin2017.hits.csv.bz2" \ + --out-csv curated_training_data.with_mass_spec.csv + +bzip2 curated_training_data.no_mass_spec.csv +bzip2 curated_training_data.with_mass_spec.csv -bzip2 curated_training_data.csv cp $SCRIPT_ABSOLUTE_PATH . bzip2 LOG.txt tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * diff --git a/downloads-generation/data_curated/curate.py b/downloads-generation/data_curated/curate.py index 49eb75c50cd1827c40ad49c2d38d960101c6c43c..32f4a8540ad3ab39843a97f5f1d9374a7c004786 100755 --- a/downloads-generation/data_curated/curate.py +++ b/downloads-generation/data_curated/curate.py @@ -4,9 +4,6 @@ Train single allele models """ import sys import argparse -import json -import os -import pickle import pandas @@ -32,18 +29,41 @@ parser.add_argument( action="append", default=[], help="Path to IEDB-style affinity data (e.g. mhc_ligand_full.csv)") +parser.add_argument( + "--data-systemhc-atlas", + action="append", + default=[], + help="Path to systemhc-atlas-style mass-spec data") +parser.add_argument( + "--data-abelin-mass-spec", + action="append", + default=[], + help="Path to Abelin Immunity 2017 mass-spec hits") +parser.add_argument( + "--include-iedb-mass-spec", + action="store_true", + default=False, + help="Include mass-spec observations in IEDB") + parser.add_argument( "--out-csv", required=True, help="Result file") -QUALITATIVE_TO_AFFINITY = { - "Negative": 50000.0, - "Positive": 100.0, - "Positive-High": 50.0, - "Positive-Intermediate": 500.0, - "Positive-Low": 5000.0, +QUALITATIVE_TO_AFFINITY_AND_INEQUALITY = { + "Negative": (5000.0, ">"), + "Positive": (500.0, "<"), # used for mass-spec hits + "Positive-High": (100.0, "<"), + "Positive-Intermediate": (1000.0, "<"), + "Positive-Low": (5000.0, "<"), } +QUALITATIVE_TO_AFFINITY = dict( + (key, value[0]) for (key, value) + in QUALITATIVE_TO_AFFINITY_AND_INEQUALITY.items()) +QUALITATIVE_TO_INEQUALITY = dict( + (key, value[1]) for (key, value) + in QUALITATIVE_TO_AFFINITY_AND_INEQUALITY.items()) + EXCLUDE_IEDB_ALLELES = [ "HLA class I", @@ -60,6 +80,7 @@ def load_data_kim2014(filename): True: "quantitative", False: "qualitative", }) + df["measurement_inequality"] = df.inequality df["original_allele"] = df.mhc df["peptide"] = df.sequence df["allele"] = df.mhc.map(normalize_allele_name) @@ -71,7 +92,58 @@ def load_data_kim2014(filename): return df -def load_data_iedb(iedb_csv, include_qualitative=True): +def load_data_systemhc_atlas(filename, min_probability=0.99): + df = pandas.read_csv(filename) + print("Loaded systemhc atlas data: %s" % str(df.shape)) + + df["measurement_source"] = "systemhc-atlas" + df["measurement_value"] = QUALITATIVE_TO_AFFINITY["Positive"] + df["measurement_inequality"] = "<" + df["measurement_type"] = "qualitative" + df["original_allele"] = df.top_allele + df["peptide"] = df.search_hit + df["allele"] = df.top_allele.map(normalize_allele_name) + + print("Dropping un-parseable alleles: %s" % ", ".join( + str(x) for x in df.ix[df.allele == "UNKNOWN"]["top_allele"].unique())) + df = df.loc[df.allele != "UNKNOWN"] + print("Systemhc atlas data now: %s" % str(df.shape)) + + print("Dropping data points with probability < %f" % min_probability) + df = 
df.loc[df.prob >= min_probability] + print("Systemhc atlas data now: %s" % str(df.shape)) + + print("Removing duplicates") + df = df.drop_duplicates(["allele", "peptide"]) + print("Systemhc atlas data now: %s" % str(df.shape)) + + return df + + +def load_data_abelin_mass_spec(filename): + df = pandas.read_csv(filename) + print("Loaded Abelin mass-spec data: %s" % str(df.shape)) + + df["measurement_source"] = "abelin-mass-spec" + df["measurement_value"] = QUALITATIVE_TO_AFFINITY["Positive"] + df["measurement_inequality"] = "<" + df["measurement_type"] = "qualitative" + df["original_allele"] = df.allele + df["allele"] = df.original_allele.map(normalize_allele_name) + + print("Dropping un-parseable alleles: %s" % ", ".join( + str(x) for x in df.ix[df.allele == "UNKNOWN"]["allele"].unique())) + df = df.loc[df.allele != "UNKNOWN"] + print("Abelin mass-spec data now: %s" % str(df.shape)) + + print("Removing duplicates") + df = df.drop_duplicates(["allele", "peptide"]) + print("Abelin mass-spec data now: %s" % str(df.shape)) + + return df + + +def load_data_iedb(iedb_csv, include_qualitative=True, include_mass_spec=False): iedb_df = pandas.read_csv(iedb_csv, skiprows=1, low_memory=False) print("Loaded iedb data: %s" % str(iedb_df.shape)) @@ -99,24 +171,29 @@ def load_data_iedb(iedb_csv, include_qualitative=True): quantitative = iedb_df.ix[iedb_df["Units"] == "nM"].copy() quantitative["measurement_type"] = "quantitative" + quantitative["measurement_inequality"] = "=" print("Quantitative measurements: %d" % len(quantitative)) qualitative = iedb_df.ix[iedb_df["Units"] != "nM"].copy() qualitative["measurement_type"] = "qualitative" print("Qualitative measurements: %d" % len(qualitative)) - non_mass_spec_qualitative = qualitative.ix[ - (~qualitative["Method/Technique"].str.contains("mass spec")) - ].copy() - non_mass_spec_qualitative["Quantitative measurement"] = ( - non_mass_spec_qualitative["Qualitative Measure"].map( - QUALITATIVE_TO_AFFINITY)) - print("Qualitative measurements after dropping MS: %d" % ( - len(non_mass_spec_qualitative))) + if not include_mass_spec: + qualitative = qualitative.ix[ + (~qualitative["Method/Technique"].str.contains("mass spec")) + ].copy() + + qualitative["Quantitative measurement"] = ( + qualitative["Qualitative Measure"].map(QUALITATIVE_TO_AFFINITY)) + qualitative["measurement_inequality"] = ( + qualitative["Qualitative Measure"].map(QUALITATIVE_TO_INEQUALITY)) + + print("Qualitative measurements (possibly after dropping MS): %d" % ( + len(qualitative))) iedb_df = pandas.concat( ( ([quantitative]) + - ([non_mass_spec_qualitative] if include_qualitative else [])), + ([qualitative] if include_qualitative else [])), ignore_index=True) print("IEDB measurements per allele:\n%s" % iedb_df.allele.value_counts()) @@ -145,6 +222,7 @@ def load_data_iedb(iedb_csv, include_qualitative=True): "Quantitative measurement" ].values train_data["measurement_source"] = iedb_df.category.values + train_data["measurement_inequality"] = iedb_df.measurement_inequality.values train_data["allele"] = iedb_df["allele"].values train_data["original_allele"] = iedb_df["Allele Name"].values @@ -159,7 +237,7 @@ def run(): dfs = [] for filename in args.data_iedb: - df = load_data_iedb(filename) + df = load_data_iedb(filename, include_mass_spec=args.include_iedb_mass_spec) dfs.append(df) for filename in args.data_kim2014: df = load_data_kim2014(filename) @@ -175,18 +253,31 @@ def run(): ] print("Kim2014 data now: %s" % str(df.shape)) dfs.append(df) + for filename in args.data_systemhc_atlas: + df = 
load_data_systemhc_atlas(filename) + dfs.append(df) + for filename in args.data_abelin_mass_spec: + df = load_data_abelin_mass_spec(filename) + dfs.append(df) df = pandas.concat(dfs, ignore_index=True) + print("Combined df: %s" % (str(df.shape))) + + print("Removing combined duplicates") + df = df.drop_duplicates(["allele", "peptide", "measurement_value"]) + print("New combined df: %s" % (str(df.shape))) + df = df[[ "allele", "peptide", "measurement_value", + "measurement_inequality", "measurement_type", "measurement_source", "original_allele", ]].sort_values(["allele", "peptide"]).dropna() - print("Combined df: %s" % (str(df.shape))) + print("Final combined df: %s" % (str(df.shape))) df.to_csv(args.out_csv, index=False) print("Wrote: %s" % args.out_csv) diff --git a/downloads-generation/data_kim2014/README.md b/downloads-generation/data_kim2014/README.md deleted file mode 100644 index bf42e01ccade69b63172d342c92a41d9e0497dcb..0000000000000000000000000000000000000000 --- a/downloads-generation/data_kim2014/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# Kim 2014 Data - -This download contains the BD2009, BD2013, and BLIND datasets from [Dataset size and composition impact the reliability of performance benchmarks for peptide-MHC binding predictions](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241). BD2013 (augmented with more recent data from IEDB) are used to train the production MHCflurry models. BD2009 and BLIND are useful for performing validation on held-out data. - -These files are available on dropbox here: - - * https://dl.dropboxusercontent.com/u/3967524/bdata.2009.mhci.public.1.txt - * https://dl.dropboxusercontent.com/u/3967524/bdata.20130222.mhci.public.1.txt - * https://dl.dropboxusercontent.com/u/3967524/bdata.2013.mhci.public.blind.1.txt - -To generate this download run: - -``` -./GENERATE.sh -``` \ No newline at end of file diff --git a/downloads-generation/data_published/GENERATE.sh b/downloads-generation/data_published/GENERATE.sh new file mode 100755 index 0000000000000000000000000000000000000000..916a901e160340f03f2039a640c44bf2460bff55 --- /dev/null +++ b/downloads-generation/data_published/GENERATE.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# +# Download some published MHC I ligand data +# +# +set -e +set -x + +DOWNLOAD_NAME=data_published +SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation +SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" + +mkdir -p "$SCRATCH_DIR" +rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" +mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" + +# Send stdout and stderr to a logfile included with the archive. +exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") +exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) + +# Log some environment info +date +pip freeze +# git rev-parse HEAD +git status + +cd $SCRATCH_DIR/$DOWNLOAD_NAME + +# Download kim2014 data +wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2009.mhci.public.1.txt +wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.20130222.mhci.public.1.txt +wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.2013.mhci.public.blind.1.txt + +# Download abelin et al 2017 data +wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/abelin2017.hits.csv.bz2 + +cp $SCRIPT_ABSOLUTE_PATH . 
+bzip2 LOG.txt +tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * + +echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2" diff --git a/downloads-generation/data_published/README.md b/downloads-generation/data_published/README.md new file mode 100644 index 0000000000000000000000000000000000000000..807adbef7bc6a293c4bd71f5fe1af13f33c47e5d --- /dev/null +++ b/downloads-generation/data_published/README.md @@ -0,0 +1,24 @@ +# Published datasets + +These datasets are derived from publications and do not change. + +To generate this download run: + +``` +./GENERATE.sh +``` + +## Kim 2014 + +This download contains the BD2009, BD2013, and BLIND datasets from +[Dataset size and composition impact the reliability of performance benchmarks for peptide-MHC binding predictions](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241). + +BD2013 (augmented with more recent data from IEDB) are used to train the production +MHCflurry models. BD2009 and BLIND are useful for performing validation on held-out data. + + +## Abelin et al. Immunity 2017 + +This download contains the peptides identified in +[Mass Spectrometry Profiling of HLA-Associated Peptidomes in Mono-allelic Cells Enables More Accurate Epitope Prediction](https://www.ncbi.nlm.nih.gov/pubmed/28228285). + diff --git a/downloads-generation/data_kim2014/GENERATE.sh b/downloads-generation/data_systemhcatlas/GENERATE.sh similarity index 66% rename from downloads-generation/data_kim2014/GENERATE.sh rename to downloads-generation/data_systemhcatlas/GENERATE.sh index dbda0fe8c12df076e5e8f3b96728eefda51f23c6..1558409c2b981591681ae726a641e51283935ad9 100755 --- a/downloads-generation/data_kim2014/GENERATE.sh +++ b/downloads-generation/data_systemhcatlas/GENERATE.sh @@ -1,12 +1,12 @@ #!/bin/bash # -# Download some published MHC I ligand data from a location on Dropbox. +# Download some published MHC I ligands identified by mass-spec # # set -e set -x -DOWNLOAD_NAME=data_kim2014 +DOWNLOAD_NAME=data_systemhcatlas SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" @@ -26,9 +26,9 @@ git status cd $SCRATCH_DIR/$DOWNLOAD_NAME -wget --quiet https://dl.dropboxusercontent.com/u/3967524/bdata.2009.mhci.public.1.txt -wget --quiet https://dl.dropboxusercontent.com/u/3967524/bdata.20130222.mhci.public.1.txt -wget --quiet https://dl.dropboxusercontent.com/u/3967524/bdata.2013.mhci.public.blind.1.txt +wget --quiet https://github.com/openvax/mhcflurry/releases/download/pre-1.1/systemhc.20171121.combined.csv.bz2 + +mv systemhc.20171121.combined.csv.bz2 data.csv.bz2 cp $SCRIPT_ABSOLUTE_PATH . bzip2 LOG.txt diff --git a/downloads-generation/data_systemhcatlas/README.md b/downloads-generation/data_systemhcatlas/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5e66d9af8c400d8cceeaa4762bbb43bbea0493f8 --- /dev/null +++ b/downloads-generation/data_systemhcatlas/README.md @@ -0,0 +1,10 @@ +# SysteMHC database dump + +This is a data dump of the [SysteMHC Atlas](https://systemhcatlas.org/) provided +by personal communication. It is distributed under the ODC Open Database License. 
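For reference, the curation script shown earlier in this diff (`curate.py --data-systemhc-atlas ...`) treats each hit in this dump as a qualitative "< 500 nM" measurement and keeps only confident identifications. A simplified sketch of that filtering, using the column names the script assumes (`top_allele`, `search_hit`, `prob`; the real script also normalizes allele names before de-duplicating):

```python
import pandas

df = pandas.read_csv("data.csv.bz2")        # the SysteMHC Atlas dump
df["peptide"] = df.search_hit               # mass-spec hit sequence
df["measurement_value"] = 500.0             # affinity assigned to "Positive" hits
df["measurement_inequality"] = "<"          # hits are treated as upper bounds
df = df.loc[df.prob >= 0.99]                # default min_probability in curate.py
df = df.drop_duplicates(["top_allele", "peptide"])
```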
+ +To generate this download run: + +``` +./GENERATE.sh +``` \ No newline at end of file diff --git a/downloads-generation/models_class1/GENERATE.sh b/downloads-generation/models_class1/GENERATE.sh index b72334b536cca453300b52257eb8e9c65c7e0dd3..275e5c1b2efa49142bea2c690c718789890f7c7d 100755 --- a/downloads-generation/models_class1/GENERATE.sh +++ b/downloads-generation/models_class1/GENERATE.sh @@ -29,14 +29,15 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME mkdir models -cp $SCRIPT_DIR/hyperparameters.yaml . +python $SCRIPT_DIR/generate_hyperparameters.py > hyperparameters.yaml time mhcflurry-class1-train-allele-specific-models \ - --data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \ + --data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \ --hyperparameters hyperparameters.yaml \ --out-models-dir models \ - --percent-rank-calibration-num-peptides-per-length 1000000 \ - --min-measurements-per-allele 75 + --percent-rank-calibration-num-peptides-per-length 100000 \ + --min-measurements-per-allele 75 \ + --num-jobs 32 16 cp $SCRIPT_ABSOLUTE_PATH . bzip2 LOG.txt diff --git a/downloads-generation/models_class1/generate_hyperparameters.py b/downloads-generation/models_class1/generate_hyperparameters.py new file mode 100644 index 0000000000000000000000000000000000000000..dcd48aa8cd3a4f8f9628f0260012de1adc656a5a --- /dev/null +++ b/downloads-generation/models_class1/generate_hyperparameters.py @@ -0,0 +1,79 @@ +""" +Generate grid of hyperparameters +""" + +from sys import stdout +from copy import deepcopy +from yaml import dump + +base_hyperparameters = { + ########################################## + # ENSEMBLE SIZE + ########################################## + "n_models": 1, + + ########################################## + # OPTIMIZATION + ########################################## + "max_epochs": 500, + "patience": 20, + "early_stopping": True, + "validation_split": 0.1, + "minibatch_size": 128, + "loss": "custom:mse_with_inequalities", + + ########################################## + # RANDOM NEGATIVE PEPTIDES + ########################################## + "random_negative_rate": 0.2, + "random_negative_constant": 25, + "random_negative_affinity_min": 20000.0, + "random_negative_affinity_max": 50000.0, + + ########################################## + # PEPTIDE REPRESENTATION + ########################################## + # One of "one-hot", "embedding", or "BLOSUM62". 
+ "peptide_amino_acid_encoding": "BLOSUM62", + "use_embedding": False, # maintained for backward compatability + "embedding_output_dim": 8, # only used if using embedding + "kmer_size": 15, + + ########################################## + # NEURAL NETWORK ARCHITECTURE + ########################################## + "locally_connected_layers": [ + { + "filters": 8, + "activation": "tanh", + "kernel_size": 3 + } + ], + "activation": "tanh", + "output_activation": "sigmoid", + "layer_sizes": [16], + "dense_layer_l1_regularization": 0.001, + "batch_normalization": False, + "dropout_probability": 0.0, +} + +grid = [] +for dense_layer_size in [64, 32, 16]: + for l1 in [0.001, 0.01, 0.0]: + for num_lc in [0, 1, 2]: + for lc_kernel_size in [3, 5]: + new = deepcopy(base_hyperparameters) + new["layer_sizes"] = [dense_layer_size] + new["dense_layer_l1_regularization"] = l1 + (lc_layer,) = new["locally_connected_layers"] + lc_layer['kernel_size'] = lc_kernel_size + if num_lc == 0: + new["locally_connected_layers"] = [] + elif num_lc == 1: + new["locally_connected_layers"] = [lc_layer] + elif num_lc == 2: + new["locally_connected_layers"] = [lc_layer, deepcopy(lc_layer)] + if not grid or new not in grid: + grid.append(new) + +dump(grid, stdout) diff --git a/downloads-generation/models_class1/hyperparameters.test.json b/downloads-generation/models_class1/hyperparameters.test.json deleted file mode 100644 index 609fa583539ba82bae8179756204dfaa91e156df..0000000000000000000000000000000000000000 --- a/downloads-generation/models_class1/hyperparameters.test.json +++ /dev/null @@ -1,37 +0,0 @@ -[ - { - "n_models": 8, - "max_epochs": 2, - "patience": 10, - "early_stopping": true, - "validation_split": 0.2, - - "random_negative_rate": 0.0, - "random_negative_constant": 25, - - "use_embedding": false, - "kmer_size": 15, - "batch_normalization": false, - "locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - }, - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - } - ], - "activation": "relu", - "output_activation": "sigmoid", - "layer_sizes": [ - 32 - ], - "random_negative_affinity_min": 20000.0, - "random_negative_affinity_max": 50000.0, - "dense_layer_l1_regularization": 0.001, - "dropout_probability": 0.0 - } -] diff --git a/downloads-generation/models_class1/hyperparameters.yaml b/downloads-generation/models_class1/hyperparameters.yaml deleted file mode 100644 index 9d38ad1af3f7c39b576e60e218fd9fb85eb4cd5f..0000000000000000000000000000000000000000 --- a/downloads-generation/models_class1/hyperparameters.yaml +++ /dev/null @@ -1,49 +0,0 @@ -[{ -########################################## -# ENSEMBLE SIZE -########################################## -"n_models": 8, - -########################################## -# OPTIMIZATION -########################################## -"max_epochs": 500, -"patience": 20, -"early_stopping": true, -"validation_split": 0.2, -"minibatch_size": 128, - -########################################## -# RANDOM NEGATIVE PEPTIDES -########################################## -"random_negative_rate": 0.0, -"random_negative_constant": 25, -"random_negative_affinity_min": 20000.0, -"random_negative_affinity_max": 50000.0, - -########################################## -# PEPTIDE REPRESENTATION -########################################## -# One of "one-hot", "embedding", or "BLOSUM62". 
-"peptide_amino_acid_encoding": "BLOSUM62", -"use_embedding": false, # maintained for backward compatability -"embedding_output_dim": 8, # only used if using embedding -"kmer_size": 15, - -########################################## -# NEURAL NETWORK ARCHITECTURE -########################################## -"locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - } -], -"activation": "relu", -"output_activation": "sigmoid", -"layer_sizes": [16], -"dense_layer_l1_regularization": 0.001, -"batch_normalization": false, -"dropout_probability": 0.0, -}] diff --git a/downloads-generation/models_class1_no_mass_spec/GENERATE.sh b/downloads-generation/models_class1_no_mass_spec/GENERATE.sh new file mode 100755 index 0000000000000000000000000000000000000000..20b749784ac1d2936d16fbe6504173cf9cac7fec --- /dev/null +++ b/downloads-generation/models_class1_no_mass_spec/GENERATE.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# +# Train standard MHCflurry Class I models. +# Calls mhcflurry-class1-train-allele-specific-models on curated training data +# using the hyperparameters in "hyperparameters.yaml". +# +set -e +set -x + +DOWNLOAD_NAME=models_class1_no_mass_spec +SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation +SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" +SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH") + +mkdir -p "$SCRATCH_DIR" +rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" +mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" + +# Send stdout and stderr to a logfile included with the archive. +exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") +exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) + +# Log some environment info +date +pip freeze +git status + +cd $SCRATCH_DIR/$DOWNLOAD_NAME + +mkdir models + +python $SCRIPT_DIR/generate_hyperparameters.py > hyperparameters.yaml + +time mhcflurry-class1-train-allele-specific-models \ + --data "$(mhcflurry-downloads path data_curated)/curated_training_data.no_mass_spec.csv.bz2" \ + --hyperparameters hyperparameters.yaml \ + --out-models-dir models \ + --percent-rank-calibration-num-peptides-per-length 100000 \ + --min-measurements-per-allele 75 \ + --num-jobs 32 16 + +cp $SCRIPT_ABSOLUTE_PATH . +bzip2 LOG.txt +tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * + +echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2" diff --git a/downloads-generation/models_class1_no_mass_spec/README.md b/downloads-generation/models_class1_no_mass_spec/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ce784b40a1531cf4bcdec29cfcb9f8d32fb6039a --- /dev/null +++ b/downloads-generation/models_class1_no_mass_spec/README.md @@ -0,0 +1,9 @@ +# Class I allele-specific models (ensemble) + +This download contains trained MHC Class I MHCflurry models. 
+ +To generate this download run: + +``` +./GENERATE.sh +``` \ No newline at end of file diff --git a/downloads-generation/models_class1_no_mass_spec/generate_hyperparameters.py b/downloads-generation/models_class1_no_mass_spec/generate_hyperparameters.py new file mode 120000 index 0000000000000000000000000000000000000000..5f2599b4b2b4c418d1c296b43c55be87bb4d85ce --- /dev/null +++ b/downloads-generation/models_class1_no_mass_spec/generate_hyperparameters.py @@ -0,0 +1 @@ +../models_class1/generate_hyperparameters.py \ No newline at end of file diff --git a/lint.sh b/lint.sh index 32fbffb20ce4f5ea07b8743b0c7416bd7f14bfbe..1679b2381979a9aca4934362f9a3f43c2d597fb3 100755 --- a/lint.sh +++ b/lint.sh @@ -7,7 +7,7 @@ set -o errexit # - https://bitbucket.org/logilab/pylint/issues/701/false-positives-with-not-an-iterable-and # - https://bitbucket.org/logilab/pylint/issues/58 -find . -name '*.py' -not -name 'versioneer.py' -not -name _version.py \ +find . -name '*.py' -not -path "./docs/*" \ | xargs pylint \ --errors-only \ --disable=print-statement,unsubscriptable-object,not-an-iterable,no-member diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py index 600882fd9693f56f8cc89f45c91f8906ee899d3d..8538c4d3a9afb4f3b3b07b4de7c38c85f765be0a 100644 --- a/mhcflurry/__init__.py +++ b/mhcflurry/__init__.py @@ -1,7 +1,6 @@ -from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor -from mhcflurry.class1_neural_network import Class1NeuralNetwork - -__version__ = "1.0.0" +from .class1_affinity_predictor import Class1AffinityPredictor +from .class1_neural_network import Class1NeuralNetwork +from .version import __version__ __all__ = [ "__version__", diff --git a/mhcflurry/allele_encoding.py b/mhcflurry/allele_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..1fb7b1319f226b90a8c32bef513f049585bec46b --- /dev/null +++ b/mhcflurry/allele_encoding.py @@ -0,0 +1,70 @@ +import numpy +import pandas + +from .encodable_sequences import EncodableSequences +from . import amino_acid + +class AlleleEncoding(object): + def __init__( + self, + alleles, + allele_to_fixed_length_sequence=None): + """ + A place to cache encodings for a (potentially large) sequence of alleles. + + Parameters + ---------- + alleles : list of string + Allele names + + allele_to_fixed_length_sequence : dict of str -> str + Allele name to fixed lengths sequence ("pseudosequence") + """ + + alleles = pandas.Series(alleles) + + all_alleles = list(sorted(alleles.unique())) + + self.allele_to_index = dict( + (allele, i) + for (i, allele) in enumerate(all_alleles)) + + self.indices = alleles.map(self.allele_to_index) + + self.fixed_length_sequences = pandas.Series( + [allele_to_fixed_length_sequence[a] for a in all_alleles], + index=all_alleles) + + self.encoding_cache = {} + + def fixed_length_vector_encoded_sequences(self, vector_encoding_name): + """ + Encode alleles. + + Parameters + ---------- + vector_encoding_name : string + How to represent amino acids. + One of "BLOSUM62", "one-hot", etc. Full list of supported vector + encodings is given by available_vector_encodings() in amino_acid. 
+ + Returns + ------- + numpy.array with shape (num sequences, sequence length, m) where m is + vector_encoding_length(vector_encoding_name) + """ + cache_key = ( + "fixed_length_vector_encoding", + vector_encoding_name) + if cache_key not in self.encoding_cache: + index_encoded_matrix = amino_acid.index_encoding( + self.fixed_length_sequences.values, + amino_acid.AMINO_ACID_INDEX) + vector_encoded = amino_acid.fixed_vectors_encoding( + index_encoded_matrix, + amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name]) + result = vector_encoded[self.indices] + self.encoding_cache[cache_key] = result + return self.encoding_cache[cache_key] + + diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py index 1077c64443c505ecb6961df50250e8ea89e586ee..6095c58fb1bc02d017bbd0fee1b1a62d074770f6 100644 --- a/mhcflurry/amino_acid.py +++ b/mhcflurry/amino_acid.py @@ -153,7 +153,7 @@ def fixed_vectors_encoding(index_encoded_sequences, letter_to_vector_df): target_shape = ( num_sequences, sequence_length, letter_to_vector_df.shape[0]) result = letter_to_vector_df.iloc[ - index_encoded_sequences.flat + index_encoded_sequences.flatten() ].values.reshape(target_shape) return result diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py index 1c523e35be470a8eecdf5ae218a9a110b29ec075..0b90d8c3b1a565ad1285adb576316150eee1cc47 100644 --- a/mhcflurry/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_predictor.py @@ -2,11 +2,13 @@ import collections import hashlib import json import logging -import sys import time import warnings -from os.path import join, exists +from os.path import join, exists, abspath from os import mkdir +from socket import gethostname +from getpass import getuser +from functools import partial import mhcnames import numpy @@ -14,12 +16,15 @@ import pandas from numpy.testing import assert_equal from six import string_types -from mhcflurry.class1_neural_network import Class1NeuralNetwork -from mhcflurry.common import random_peptides -from mhcflurry.downloads import get_path -from mhcflurry.encodable_sequences import EncodableSequences -from mhcflurry.percent_rank_transform import PercentRankTransform -from mhcflurry.regression_target import to_ic50 +from .class1_neural_network import Class1NeuralNetwork +from .common import random_peptides +from .downloads import get_default_class1_models_dir +from .encodable_sequences import EncodableSequences +from .percent_rank_transform import PercentRankTransform +from .regression_target import to_ic50 +from .version import __version__ +from .ensemble_centrality import CENTRALITY_MEASURES +from .allele_encoding import AlleleEncoding class Class1AffinityPredictor(object): @@ -35,7 +40,7 @@ class Class1AffinityPredictor(object): self, allele_to_allele_specific_models=None, class1_pan_allele_models=None, - allele_to_pseudosequence=None, + allele_to_fixed_length_sequence=None, manifest_df=None, allele_to_percent_rank_transform=None): """ @@ -47,7 +52,7 @@ class Class1AffinityPredictor(object): class1_pan_allele_models : list of `Class1NeuralNetwork` Ensemble of pan-allele models. - allele_to_pseudosequence : dict of string -> string + allele_to_fixed_length_sequence : dict of string -> string Required only if class1_pan_allele_models is specified. 
manifest_df : `pandas.DataFrame`, optional @@ -66,11 +71,11 @@ class Class1AffinityPredictor(object): class1_pan_allele_models = [] if class1_pan_allele_models: - assert allele_to_pseudosequence, "Pseudosequences required" + assert allele_to_fixed_length_sequence, "Allele sequences required" self.allele_to_allele_specific_models = allele_to_allele_specific_models self.class1_pan_allele_models = class1_pan_allele_models - self.allele_to_pseudosequence = allele_to_pseudosequence + self.allele_to_fixed_length_sequence = allele_to_fixed_length_sequence if manifest_df is None: rows = [] @@ -136,7 +141,7 @@ class Class1AffinityPredictor(object): allele_to_allele_specific_models = collections.defaultdict(list) class1_pan_allele_models = [] - allele_to_pseudosequence = predictors[0].allele_to_pseudosequence + allele_to_fixed_length_sequence = predictors[0].allele_to_fixed_length_sequence for predictor in predictors: for (allele, networks) in ( @@ -148,7 +153,7 @@ class Class1AffinityPredictor(object): return Class1AffinityPredictor( allele_to_allele_specific_models=allele_to_allele_specific_models, class1_pan_allele_models=class1_pan_allele_models, - allele_to_pseudosequence=allele_to_pseudosequence + allele_to_fixed_length_sequence=allele_to_fixed_length_sequence ) @property @@ -161,8 +166,8 @@ class Class1AffinityPredictor(object): list of string """ result = set(self.allele_to_allele_specific_models) - if self.allele_to_pseudosequence: - result = result.union(self.allele_to_pseudosequence) + if self.allele_to_fixed_length_sequence: + result = result.union(self.allele_to_fixed_length_sequence) return sorted(result) @property @@ -192,8 +197,9 @@ class Class1AffinityPredictor(object): The serialization format consists of a file called "manifest.csv" with the configurations of each Class1NeuralNetwork, along with per-network files giving the model weights. If there are pan-allele predictors in - the ensemble, the allele pseudosequences are also stored in the - directory. + the ensemble, the allele sequences are also stored in the + directory. There is also a small file "index.txt" with basic metadata: + when the models were trained, by whom, on what host. 
Parameters ---------- @@ -234,6 +240,26 @@ class Class1AffinityPredictor(object): write_manifest_df.to_csv(manifest_path, index=False) logging.info("Wrote: %s" % manifest_path) + # Write "info.txt" + info_path = join(models_dir, "info.txt") + rows = [ + ("trained on", time.asctime()), + ("package ", "mhcflurry %s" % __version__), + ("hostname ", gethostname()), + ("user ", getuser()), + ] + pandas.DataFrame(rows).to_csv( + info_path, sep="\t", header=False, index=False) + + if self.allele_to_fixed_length_sequence is not None: + allele_to_sequence_df = pandas.DataFrame( + list(self.allele_to_fixed_length_sequence.items()), + columns=['allele', 'sequence'] + ) + allele_to_sequence_df.to_csv( + join(models_dir, "allele_sequences.csv"), index=False) + logging.info("Wrote: %s" % join(models_dir, "allele_sequences.csv")) + if self.allele_to_percent_rank_transform: percent_ranks_df = None for (allele, transform) in self.allele_to_percent_rank_transform.items(): @@ -267,7 +293,7 @@ class Class1AffinityPredictor(object): `Class1AffinityPredictor` instance """ if models_dir is None: - models_dir = get_path("models_class1", "models") + models_dir = get_default_class1_models_dir() manifest_path = join(models_dir, "manifest.csv") manifest_df = pandas.read_csv(manifest_path, nrows=max_models) @@ -278,9 +304,14 @@ class Class1AffinityPredictor(object): for (_, row) in manifest_df.iterrows(): weights_filename = Class1AffinityPredictor.weights_path( models_dir, row.model_name) - weights = Class1AffinityPredictor.load_weights(weights_filename) config = json.loads(row.config_json) - model = Class1NeuralNetwork.from_config(config, weights=weights) + + # We will lazy-load weights when the network is used. + model = Class1NeuralNetwork.from_config( + config, + weights_loader=partial( + Class1AffinityPredictor.load_weights, + abspath(weights_filename))) if row.allele == "pan-class1": class1_pan_allele_models.append(model) else: @@ -289,10 +320,10 @@ class Class1AffinityPredictor(object): manifest_df["model"] = all_models - pseudosequences = None - if exists(join(models_dir, "pseudosequences.csv")): - pseudosequences = pandas.read_csv( - join(models_dir, "pseudosequences.csv"), + allele_to_fixed_length_sequence = None + if exists(join(models_dir, "allele_sequences.csv")): + allele_to_fixed_length_sequence = pandas.read_csv( + join(models_dir, "allele_sequences.csv"), index_col="allele").to_dict() allele_to_percent_rank_transform = {} @@ -304,10 +335,10 @@ class Class1AffinityPredictor(object): PercentRankTransform.from_series(percent_ranks_df[allele])) logging.info( - "Loaded %d class1 pan allele predictors, %d pseudosequences, " + "Loaded %d class1 pan allele predictors, %d allele sequences, " "%d percent rank distributions, and %d allele specific models: %s" % ( len(class1_pan_allele_models), - len(pseudosequences) if pseudosequences else 0, + len(allele_to_fixed_length_sequence) if allele_to_fixed_length_sequence else 0, len(allele_to_percent_rank_transform), sum(len(v) for v in allele_to_allele_specific_models.values()), ", ".join( @@ -318,7 +349,7 @@ class Class1AffinityPredictor(object): result = Class1AffinityPredictor( allele_to_allele_specific_models=allele_to_allele_specific_models, class1_pan_allele_models=class1_pan_allele_models, - allele_to_pseudosequence=pseudosequences, + allele_to_fixed_length_sequence=allele_to_fixed_length_sequence, manifest_df=manifest_df, allele_to_percent_rank_transform=allele_to_percent_rank_transform, ) @@ -362,12 +393,13 @@ class Class1AffinityPredictor(object): def 
fit_allele_specific_predictors( self, n_models, - architecture_hyperparameters, + architecture_hyperparameters_list, allele, peptides, affinities, + inequalities=None, models_dir_for_save=None, - verbose=1, + verbose=0, progress_preamble=""): """ Fit one or more allele specific predictors for a single allele using a @@ -381,7 +413,10 @@ class Class1AffinityPredictor(object): n_models : int Number of neural networks to fit - architecture_hyperparameters : dict + architecture_hyperparameters_list : list of dict + List of hyperparameters. If more than one set of hyperparameters + are specified, model selection over them is performed and model + with the lowest validation loss is selected for each fold. allele : string @@ -389,6 +424,9 @@ class Class1AffinityPredictor(object): affinities : list of float nM affinities + + inequalities : list of string, each element one of ">", "<", or "=" + See Class1NeuralNetwork.fit for details. models_dir_for_save : string, optional If specified, the Class1AffinityPredictor is (incrementally) written @@ -406,35 +444,83 @@ class Class1AffinityPredictor(object): """ allele = mhcnames.normalize_allele_name(allele) - models = self._fit_predictors( - n_models=n_models, - architecture_hyperparameters=architecture_hyperparameters, - peptides=peptides, - affinities=affinities, - allele_pseudosequences=None, - verbose=verbose, - progress_preamble=progress_preamble) - if allele not in self.allele_to_allele_specific_models: self.allele_to_allele_specific_models[allele] = [] - models_list = [] - for (i, model) in enumerate(models): - model_name = self.model_name(allele, i) - models_list.append(model) # models is a generator + encodable_peptides = EncodableSequences.create(peptides) + + n_architectures = len(architecture_hyperparameters_list) + if n_models > 1 or n_architectures > 1: + # Adjust progress info to indicate number of models and + # architectures. + pieces = [] + if n_models > 1: + pieces.append("Model {model_num:2d} / {n_models:2d}") + if n_architectures > 1: + pieces.append( + "Architecture {architecture_num:2d} / {n_architectures:2d}" + " (best so far: {best_num})") + progress_preamble_template = "[ %s ] {user_progress_preamble}" % ( + ", ".join(pieces)) + else: + # Just use the user-provided progress message. + progress_preamble_template = "{user_progress_preamble}" + + models = [] + for model_num in range(n_models): + shuffle_permutation = numpy.random.permutation(len(affinities)) + + best_num = None + best_loss = None + best_model = None + for (architecture_num, architecture_hyperparameters) in enumerate( + architecture_hyperparameters_list): + model = Class1NeuralNetwork(**architecture_hyperparameters) + model.fit( + encodable_peptides, + affinities, + shuffle_permutation=shuffle_permutation, + inequalities=inequalities, + verbose=verbose, + progress_preamble=progress_preamble_template.format( + user_progress_preamble=progress_preamble, + best_num="n/a" if best_num is None else best_num + 1, + model_num=model_num + 1, + n_models=n_models, + architecture_num=architecture_num + 1, + n_architectures=n_architectures)) + + if n_architectures > 1: + # We require val_loss (i.e. a validation set) if we have + # multiple architectures. + loss = model.loss_history['val_loss'][-1] + else: + loss = None + if loss is None or best_loss is None or best_loss > loss: + best_loss = loss + best_num = architecture_num + best_model = model + del model + + if n_architectures > 1: + print("Selected architecture %d." 
% (best_num + 1)) + + model_name = self.model_name(allele, model_num) row = pandas.Series(collections.OrderedDict([ ("model_name", model_name), ("allele", allele), - ("config_json", json.dumps(model.get_config())), - ("model", model), + ("config_json", json.dumps(best_model.get_config())), + ("model", best_model), ])).to_frame().T self.manifest_df = pandas.concat( [self.manifest_df, row], ignore_index=True) - self.allele_to_allele_specific_models[allele].append(model) + self.allele_to_allele_specific_models[allele].append(best_model) if models_dir_for_save: self.save( models_dir_for_save, model_names_to_write=[model_name]) - return models_list + models.append(best_model) + + return models def fit_class1_pan_allele_models( self, @@ -443,6 +529,7 @@ class Class1AffinityPredictor(object): alleles, peptides, affinities, + inequalities, models_dir_for_save=None, verbose=1, progress_preamble=""): @@ -461,12 +548,15 @@ class Class1AffinityPredictor(object): architecture_hyperparameters : dict alleles : list of string - Allele names (not pseudosequences) corresponding to each peptide + Allele names (not sequences) corresponding to each peptide peptides : `EncodableSequences` or list of string affinities : list of float nM affinities + + inequalities : list of string, each element one of ">", "<", or "=" + See Class1NeuralNetwork.fit for details. models_dir_for_save : string, optional If specified, the Class1AffinityPredictor is (incrementally) written @@ -484,21 +574,24 @@ class Class1AffinityPredictor(object): """ alleles = pandas.Series(alleles).map(mhcnames.normalize_allele_name) - allele_pseudosequences = alleles.map(self.allele_to_pseudosequence) + allele_encoding = AlleleEncoding( + alleles, + allele_to_fixed_length_sequence=self.allele_to_fixed_length_sequence) - models = self._fit_predictors( - n_models=n_models, - architecture_hyperparameters=architecture_hyperparameters, - peptides=peptides, - affinities=affinities, - allele_pseudosequences=allele_pseudosequences, - verbose=verbose, - progress_preamble=progress_preamble) + encodable_peptides = EncodableSequences.create(peptides) + models = [] + for i in range(n_models): + logging.info("Training model %d / %d" % (i + 1, n_models)) + model = Class1NeuralNetwork(**architecture_hyperparameters) + model.fit( + encodable_peptides, + affinities, + inequalities=inequalities, + allele_encoding=allele_encoding, + verbose=verbose, + progress_preamble=progress_preamble) - models_list = [] - for (i, model) in enumerate(models): model_name = self.model_name("pan-class1", i) - models_list.append(model) # models is a generator self.class1_pan_allele_models.append(model) row = pandas.Series(collections.OrderedDict([ ("model_name", model_name), @@ -511,113 +604,9 @@ class Class1AffinityPredictor(object): if models_dir_for_save: self.save( models_dir_for_save, model_names_to_write=[model_name]) - return models_list + models.append(model) - def _fit_predictors( - self, - n_models, - architecture_hyperparameters, - peptides, - affinities, - allele_pseudosequences, - verbose=1, - progress_preamble = ""): - """ - Private helper method - - Parameters - ---------- - n_models : int - architecture_hyperparameters : dict - peptides : EncodableSequences or list of string - affinities : list of float - allele_pseudosequences : EncodableSequences or list of string - verbose : int - progress_preamble : string - Optional string of information to include in each progress update - - Returns - ------- - generator of `Class1NeuralNetwork` - """ - encodable_peptides = 
EncodableSequences.create(peptides) - for i in range(n_models): - logging.info("Training model %d / %d" % (i + 1, n_models)) - model = Class1NeuralNetwork(**architecture_hyperparameters) - model.fit( - encodable_peptides, - affinities, - allele_pseudosequences=allele_pseudosequences, - verbose=verbose, - progress_preamble=progress_preamble) - yield model - - def calibrate_percentile_ranks( - self, - peptides=None, - num_peptides_per_length=int(1e5), - alleles=None, - bins=None, - quiet=False): - """ - Compute the cumulative distribution of ic50 values for a set of alleles - over a large universe of random peptides, to enable computing quantiles in - this distribution later. - - Parameters - ---------- - peptides : sequence of string, optional - Peptides to use - num_peptides_per_length : int, optional - If peptides argument is not specified, then num_peptides_per_length - peptides are randomly sampled from a uniform distribution for each - supported length - alleles : sequence of string, optional - Alleles to perform calibration for. If not specified all supported - alleles will be calibrated. - bins : object - Anything that can be passed to numpy.histogram's "bins" argument - can be used here, i.e. either an integer or a sequence giving bin - edges. This is in ic50 space. - quiet : boolean - If False (default), status updates will be printed to stdout. - """ - if bins is None: - bins = to_ic50(numpy.linspace(1, 0, 1000)) - - if alleles is None: - alleles = self.supported_alleles - - if peptides is None: - peptides = [] - lengths = range( - self.supported_peptide_lengths[0], - self.supported_peptide_lengths[1] + 1) - for length in lengths: - peptides.extend( - random_peptides(num_peptides_per_length, length)) - - if quiet: - def msg(s): - pass - else: - def msg(s): - print(s) - sys.stdout.flush() - - encoded_peptides = EncodableSequences.create(peptides) - for (i, allele) in enumerate(alleles): - msg("Calibrating percentile ranks for allele %03d/%03d: %s" % ( - i + 1, len(alleles), allele)) - start = time.time() - predictions = self.predict(encoded_peptides, allele=allele) - msg("Generated %d predictions in %0.2f sec." % ( - len(predictions), time.time() - start)) - transform = PercentRankTransform() - transform.fit(predictions, bins=bins) - self.allele_to_percent_rank_transform[allele] = transform - msg("Done calibrating allele %s in %0.2f sec." % ( - allele, time.time() - start)) + return models def percentile_ranks(self, affinities, allele=None, alleles=None, throw=True): """ @@ -665,7 +654,13 @@ class Class1AffinityPredictor(object): sub_df.affinity, allele=allele, throw=throw) return df.result.values - def predict(self, peptides, alleles=None, allele=None, throw=True): + def predict( + self, + peptides, + alleles=None, + allele=None, + throw=True, + centrality_measure="robust_mean"): """ Predict nM binding affinities. @@ -686,6 +681,9 @@ class Class1AffinityPredictor(object): If True, a ValueError will be raised in the case of unsupported alleles or peptide lengths. If False, a warning will be logged and the predictions for the unsupported alleles or peptides will be NaN. + centrality_measure : string or callable + Measure of central tendency to use to combine predictions in the + ensemble. Options include: mean, median, robust_mean. 
Returns ------- @@ -697,6 +695,7 @@ class Class1AffinityPredictor(object): allele=allele, throw=throw, include_percentile_ranks=False, + centrality_measure=centrality_measure, ) return df.prediction.values @@ -707,7 +706,8 @@ class Class1AffinityPredictor(object): allele=None, throw=True, include_individual_model_predictions=False, - include_percentile_ranks=True): + include_percentile_ranks=True, + centrality_measure="robust_mean"): """ Predict nM binding affinities. Gives more detailed output than `predict` method, including 5-95% prediction intervals. @@ -736,6 +736,9 @@ class Class1AffinityPredictor(object): If True, a "prediction_percentile" column will be included giving the percentile ranks. If no percentile rank information is available, this will be ignored with a warning. + centrality_measure : string or callable + Measure of central tendency to use to combine predictions in the + ensemble. Options include: mean, median, robust_mean. Returns ------- @@ -794,27 +797,27 @@ class Class1AffinityPredictor(object): unsupported_alleles = [ allele for allele in df.normalized_allele.unique() - if allele not in self.allele_to_pseudosequence + if allele not in self.allele_to_fixed_length_sequence ] if unsupported_alleles: msg = ( - "No pseudosequences for allele(s): %s.\n" + "No sequences for allele(s): %s.\n" "Supported alleles: %s" % ( " ".join(unsupported_alleles), - " ".join(sorted(self.allele_to_pseudosequence)))) + " ".join(sorted(self.allele_to_fixed_length_sequence)))) logging.warning(msg) if throw: raise ValueError(msg) mask = df.supported_peptide_length if mask.sum() > 0: - masked_allele_pseudosequences = ( - df.ix[mask].normalized_allele.map( - self.allele_to_pseudosequence)) + masked_allele_encoding = AlleleEncoding( + df.loc[mask].normalized_allele, + allele_to_fixed_length_sequence=self.allele_to_fixed_length_sequence) masked_peptides = peptides.sequences[mask] for (i, model) in enumerate(self.class1_pan_allele_models): df.loc[mask, "model_pan_%d" % i] = model.predict( masked_peptides, - allele_pseudosequences=masked_allele_pseudosequences) + allele_encoding=masked_allele_encoding) if self.allele_to_allele_specific_models: query_alleles = df.normalized_allele.unique() @@ -852,9 +855,15 @@ class Class1AffinityPredictor(object): df_predictions = df[ [c for c in df.columns if c.startswith("model_")] ] + + if callable(centrality_measure): + centrality_function = centrality_measure + else: + centrality_function = CENTRALITY_MEASURES[centrality_measure] + logs = numpy.log(df_predictions) - log_means = logs.mean(1) - df["prediction"] = numpy.exp(log_means) + log_centers = centrality_function(logs.values) + df["prediction"] = numpy.exp(log_centers) df["prediction_low"] = numpy.exp(logs.quantile(0.05, axis=1)) df["prediction_high"] = numpy.exp(logs.quantile(0.95, axis=1)) @@ -918,3 +927,61 @@ class Class1AffinityPredictor(object): ] loaded.close() return weights + + def calibrate_percentile_ranks( + self, + peptides=None, + num_peptides_per_length=int(1e5), + alleles=None, + bins=None): + """ + Compute the cumulative distribution of ic50 values for a set of alleles + over a large universe of random peptides, to enable computing quantiles in + this distribution later. 
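# --- Editor's note: illustrative sketch, not part of the patch ----------
# The idea behind percentile-rank calibration: predict affinities for a large
# universe of random peptides, keep the empirical distribution, then report
# where a new prediction falls within it. PercentRankTransform implements
# this properly; the version below is a simplified stand-in using a sorted
# background sample of fake predictions.
import numpy

rng = numpy.random.RandomState(0)
background_nm = numpy.sort(rng.lognormal(mean=8.0, sigma=2.0, size=100000))

def percentile_rank(value_nm, background):
    # Percent of background predictions with stronger (lower nM) affinity.
    return 100.0 * numpy.searchsorted(background, value_nm) / len(background)

percentile_rank(50.0, background_nm)    # strong binder -> small percentile rank
# -------------------------------------------------------------------------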
+ + Parameters + ---------- + peptides : sequence of string or EncodableSequences, optional + Peptides to use + num_peptides_per_length : int, optional + If peptides argument is not specified, then num_peptides_per_length + peptides are randomly sampled from a uniform distribution for each + supported length + alleles : sequence of string, optional + Alleles to perform calibration for. If not specified all supported + alleles will be calibrated. + bins : object + Anything that can be passed to numpy.histogram's "bins" argument + can be used here, i.e. either an integer or a sequence giving bin + edges. This is in ic50 space. + + Returns + ---------- + EncodableSequences : peptides used for calibration + """ + if bins is None: + bins = to_ic50(numpy.linspace(1, 0, 1000)) + + if alleles is None: + alleles = self.supported_alleles + + if peptides is None: + peptides = [] + lengths = range( + self.supported_peptide_lengths[0], + self.supported_peptide_lengths[1] + 1) + for length in lengths: + peptides.extend( + random_peptides(num_peptides_per_length, length)) + + encoded_peptides = EncodableSequences.create(peptides) + + for (i, allele) in enumerate(alleles): + predictions = self.predict(encoded_peptides, allele=allele) + transform = PercentRankTransform() + transform.fit(predictions, bins=bins) + self.allele_to_percent_rank_transform[allele] = transform + + return encoded_peptides + + diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py index eafbbf96398e58eb0028a8bfaadab62835377a67..936e75119f6974fb6b619ad4f57461a9a7e50339 100644 --- a/mhcflurry/class1_neural_network.py +++ b/mhcflurry/class1_neural_network.py @@ -5,12 +5,13 @@ import logging import numpy import pandas -from mhcflurry.hyperparameters import HyperparameterDefaults +from .hyperparameters import HyperparameterDefaults -from mhcflurry.encodable_sequences import EncodableSequences -from mhcflurry.amino_acid import available_vector_encodings, vector_encoding_length -from mhcflurry.regression_target import to_ic50, from_ic50 -from mhcflurry.common import random_peptides, amino_acid_distribution +from .encodable_sequences import EncodableSequences +from .amino_acid import available_vector_encodings, vector_encoding_length +from .regression_target import to_ic50, from_ic50 +from .common import random_peptides, amino_acid_distribution +from .custom_loss import CUSTOM_LOSSES class Class1NeuralNetwork(object): @@ -26,11 +27,13 @@ class Class1NeuralNetwork(object): network_hyperparameter_defaults = HyperparameterDefaults( kmer_size=15, - use_embedding=False, peptide_amino_acid_encoding="one-hot", embedding_input_dim=21, embedding_output_dim=8, - pseudosequence_use_embedding=False, + allele_dense_layer_sizes=[], + peptide_dense_layer_sizes=[], + peptide_allele_merge_method="multiply", + peptide_allele_merge_activation="", layer_sizes=[32], dense_layer_l1_regularization=0.001, dense_layer_l2_regularization=0.0, @@ -106,13 +109,46 @@ class Class1NeuralNetwork(object): Combined set of all supported hyperparameters and their default values. """ + # Hyperparameter renames. + # These are updated from time to time as new versions are developed. It + # provides a primitive way to allow new code to work with models trained + # using older code. + # None indicates the hyperparameter has been dropped. + hyperparameter_renames = { + "use_embedding": None, + "pseudosequence_use_embedding": None, + } + + @classmethod + def apply_hyperparameter_renames(cls, hyperparameters): + """ + Handle hyperparameter renames. 
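# --- Editor's note: illustrative sketch, not part of the patch ----------
# What apply_hyperparameter_renames() does to a config saved by an older
# code version: keys mapped to None are dropped, and keys mapped to a new
# name would be carried over under that name. Values here are made up.
old_config = {"use_embedding": False, "layer_sizes": [32]}
renames = {"use_embedding": None}        # None means the hyperparameter was dropped
for from_name, to_name in renames.items():
    if from_name in old_config:
        value = old_config.pop(from_name)
        if to_name:
            old_config[to_name] = value
# old_config is now {"layer_sizes": [32]}, so a model trained with older
# code can still be instantiated by newer code without unknown-key errors.
# -------------------------------------------------------------------------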
+ + Parameters + ---------- + hyperparameters : dict + + Returns + ------- + dict : updated hyperparameters + + """ + for (from_name, to_name) in cls.hyperparameter_renames.items(): + if from_name in hyperparameters: + value = hyperparameters.pop(from_name) + if to_name: + hyperparameters[to_name] = value + return hyperparameters + + def __init__(self, **hyperparameters): self.hyperparameters = self.hyperparameter_defaults.with_defaults( - hyperparameters) + self.apply_hyperparameter_renames(hyperparameters)) self._network = None self.network_json = None self.network_weights = None + self.network_weights_loader = None self.loss_history = None self.fit_seconds = None @@ -124,6 +160,13 @@ class Class1NeuralNetwork(object): (Keras model, existing network weights) """ + @classmethod + def clear_model_cache(klass): + """ + Clear the Keras model cache. + """ + klass.KERAS_MODELS_CACHE.clear() + @classmethod def borrow_cached_network(klass, network_json, network_weights): """ @@ -175,6 +218,7 @@ class Class1NeuralNetwork(object): keras.models.Model """ if self._network is None and self.network_json is not None: + self.load_weights() if borrow: return self.borrow_cached_network( self.network_json, @@ -205,10 +249,11 @@ class Class1NeuralNetwork(object): result = dict(self.__dict__) result['_network'] = None result['network_weights'] = None + result['network_weights_loader'] = None return result @classmethod - def from_config(cls, config, weights=None): + def from_config(cls, config, weights=None, weights_loader=None): """ deserialize from a dict returned by get_config(). @@ -217,6 +262,8 @@ class Class1NeuralNetwork(object): config : dict weights : list of array, optional Network weights to restore + weights_loader : callable, optional + Function to call (no arguments) to load weights when needed Returns ------- @@ -227,8 +274,20 @@ class Class1NeuralNetwork(object): assert all(hasattr(instance, key) for key in config), config.keys() instance.__dict__.update(config) instance.network_weights = weights + instance.network_weights_loader = weights_loader return instance + def load_weights(self): + """ + Load weights by evaluating self.network_weights_loader, if needed. + + After calling this, self.network_weights_loader will be None and + self.network_weights will be the weights list, if available. 
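# --- Editor's note: illustrative sketch, not part of the patch ----------
# The network_weights_loader pattern defers reading weights from disk until
# they are needed. make_loader() and the .npz path below are hypothetical;
# from_config(), get_weights() and load_weights() are the methods added in
# this patch.
import numpy

def make_loader(path):
    def load():
        with numpy.load(path) as archive:    # assumes weights stored via numpy.savez
            return [archive[name] for name in archive.files]
    return load

# network = Class1NeuralNetwork.from_config(config, weights_loader=make_loader("weights_0.npz"))
# Nothing is read yet; the first call that needs weights (get_weights(),
# building the Keras model, pickling) triggers load_weights(), which runs
# the loader once and then sets network_weights_loader back to None.
# -------------------------------------------------------------------------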
+ """ + if self.network_weights_loader: + self.network_weights = self.network_weights_loader() + self.network_weights_loader = None + def get_weights(self): """ Get the network weights @@ -239,6 +298,7 @@ class Class1NeuralNetwork(object): or None if there is no network """ self.update_network_description() + self.load_weights() return self.network_weights def __getstate__(self): @@ -251,7 +311,7 @@ class Class1NeuralNetwork(object): """ self.update_network_description() - self.update_network_description() + self.load_weights() result = dict(self.__dict__) result['_network'] = None return result @@ -270,8 +330,7 @@ class Class1NeuralNetwork(object): numpy.array """ encoder = EncodableSequences.create(peptides) - if (self.hyperparameters['use_embedding'] or - self.hyperparameters['peptide_amino_acid_encoding'] == "embedding"): + if (self.hyperparameters['peptide_amino_acid_encoding'] == "embedding"): encoded = encoder.variable_length_to_fixed_length_categorical( max_length=self.hyperparameters['kmer_size'], **self.input_encoding_hyperparameter_defaults.subselect( @@ -305,34 +364,29 @@ class Class1NeuralNetwork(object): self.hyperparameters['right_edge'], self.hyperparameters['kmer_size']) - def pseudosequence_to_network_input(self, pseudosequences): + def allele_encoding_to_network_input(self, allele_encoding): """ - Encode pseudosequences to the fixed-length encoding expected by the neural + Encode alleles to the fixed-length encoding expected by the neural network (which depends on the architecture). Parameters ---------- - pseudosequences : EncodableSequences or list of string + allele_encoding : AlleleEncoding Returns ------- numpy.array """ - encoder = EncodableSequences.create(pseudosequences) - if self.hyperparameters['pseudosequence_use_embedding']: - encoded = encoder.fixed_length_categorical() - else: - raise NotImplementedError - # encoded = encoder.fixed_length_one_hot() - assert len(encoded) == len(pseudosequences) - return encoded + return allele_encoding.fixed_length_vector_encoded_sequences("BLOSUM62") def fit( self, peptides, affinities, - allele_pseudosequences=None, + allele_encoding=None, + inequalities=None, sample_weights=None, + shuffle_permutation=None, verbose=1, progress_preamble=""): """ @@ -345,14 +399,26 @@ class Class1NeuralNetwork(object): affinities : list of float nM affinities. Must be same length of as peptides. - allele_pseudosequences : EncodableSequences or list of string, optional + allele_encoding : AlleleEncoding, optional If not specified, the model will be a single-allele predictor. + + inequalities : list of string, each element one of ">", "<", or "=". + Inequalities to use for fitting. Same length as affinities. + Each element must be one of ">", "<", or "=". For example, a ">" + will train on y_pred > y_true for that element in the training set. + Requires using a custom losses that support inequalities (e.g. + mse_with_ineqalities). + If None all inequalities are taken to be "=". sample_weights : list of float, optional If not specified, all samples (including random negatives added during training) will have equal weight. If specified, the random negatives will be assigned weight=1.0. - + + shuffle_permutation : list of int, optional + Permutation (integer list) of same length as peptides and affinities + If None, then a random permutation will be generated. 
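# --- Editor's note: illustrative sketch, not part of the patch ----------
# Why one shuffle_permutation is applied to every training array: peptides,
# affinities, inequalities and sample weights must stay aligned row-for-row,
# and because Keras takes validation_split from the end of the data, a
# different permutation per model gives each ensemble member a different
# early-stopping set. Toy example:
import numpy

affinities = numpy.array([50.0, 100.0, 5000.0, 20000.0])
inequalities = numpy.array(["=", "<", "=", ">"])
permutation = numpy.random.permutation(len(affinities))

affinities = affinities[permutation]
inequalities = inequalities[permutation]   # same reordering keeps rows aligned
# -------------------------------------------------------------------------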
+ verbose : int Keras verbosity level @@ -391,37 +457,110 @@ class Class1NeuralNetwork(object): y_values = from_ic50(affinities) assert numpy.isnan(y_values).sum() == 0, numpy.isnan(y_values).sum() + if inequalities is not None: + # Reverse inequalities because from_ic50() flips the direction + # (i.e. lower affinity results in higher y values). + adjusted_inequalities = pandas.Series(inequalities).map({ + "=": "=", + ">": "<", + "<": ">", + }).values + else: + adjusted_inequalities = numpy.tile("=", len(y_values)) + if len(adjusted_inequalities) != len(y_values): + raise ValueError("Inequalities and y_values must have same length") x_dict_without_random_negatives = { 'peptide': peptide_encoding, } - pseudosequence_length = None - if allele_pseudosequences is not None: - pseudosequences_input = self.pseudosequence_to_network_input( - allele_pseudosequences) - pseudosequence_length = len(pseudosequences_input[0]) - x_dict_without_random_negatives['pseudosequence'] = ( - pseudosequences_input) + allele_encoding_dims = None + if allele_encoding is not None: + allele_encoding_input = self.allele_encoding_to_network_input( + allele_encoding) + allele_encoding_dims = allele_encoding_input.shape[1:] + x_dict_without_random_negatives['allele'] = allele_encoding_input + + # Shuffle y_values and the contents of x_dict_without_random_negatives + # This ensures different data is used for the test set for early stopping + # when multiple models are trained. + if shuffle_permutation is None: + shuffle_permutation = numpy.random.permutation(len(y_values)) + y_values = y_values[shuffle_permutation] + peptide_encoding = peptide_encoding[shuffle_permutation] + adjusted_inequalities = adjusted_inequalities[shuffle_permutation] + for key in x_dict_without_random_negatives: + x_dict_without_random_negatives[key] = ( + x_dict_without_random_negatives[key][shuffle_permutation]) + if sample_weights is not None: + sample_weights = sample_weights[shuffle_permutation] + + if self.hyperparameters['loss'].startswith("custom:"): + # Using a custom loss that supports inequalities + try: + custom_loss = CUSTOM_LOSSES[ + self.hyperparameters['loss'].replace("custom:", "") + ] + except KeyError: + raise ValueError( + "No such custom loss function: %s. Supported losses are: %s" % ( + self.hyperparameters['loss'], + ", ".join([ + "custom:" + loss_name for loss_name in CUSTOM_LOSSES + ]))) + loss_name_or_function = custom_loss.loss + loss_supports_inequalities = custom_loss.supports_inequalities + loss_encode_y_function = custom_loss.encode_y + else: + # Using a regular keras loss. No inequalities supported. 
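# --- Editor's note: illustrative sketch, not part of the patch ----------
# The loss hyperparameter follows a naming convention: a plain string such
# as "mse" is handed to Keras unchanged, while a "custom:" prefix selects an
# entry from CUSTOM_LOSSES (added in mhcflurry/custom_loss.py below), e.g.:
#   hyperparameters = {"loss": "custom:mse_with_inequalities", ...}
#   custom_loss = CUSTOM_LOSSES["custom:mse_with_inequalities".replace("custom:", "")]
#   # custom_loss.loss is the Keras-compatible loss function;
#   # custom_loss.supports_inequalities and custom_loss.encode_y drive the
#   # inequality handling in fit().
# -------------------------------------------------------------------------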
+ loss_name_or_function = self.hyperparameters['loss'] + loss_supports_inequalities = False + loss_encode_y_function = None + + if not loss_supports_inequalities and ( + any(inequality != "=" for inequality in adjusted_inequalities)): + raise ValueError("Loss %s does not support inequalities" % ( + loss_name_or_function)) if self.network() is None: self._network = self.make_network( - pseudosequence_length=pseudosequence_length, + allele_encoding_dims=allele_encoding_dims, **self.network_hyperparameter_defaults.subselect( self.hyperparameters)) - self.compile() - - y_dict_with_random_negatives = { - "output": numpy.concatenate([ - from_ic50( - numpy.random.uniform( - self.hyperparameters[ - 'random_negative_affinity_min'], - self.hyperparameters[ - 'random_negative_affinity_max'], - int(num_random_negative.sum()))), - y_values, - ]), - } + self.network().compile( + loss=loss_name_or_function, + optimizer=self.hyperparameters['optimizer']) + + if loss_supports_inequalities: + # Do not sample negative affinities: just use an inequality. + random_negative_ic50 = self.hyperparameters['random_negative_affinity_min'] + random_negative_target = from_ic50(random_negative_ic50) + + y_dict_with_random_negatives = { + "output": numpy.concatenate([ + numpy.tile( + random_negative_target, int(num_random_negative.sum())), + y_values, + ]), + } + # Note: we are using "<" here not ">" because the inequalities are + # now in target-space (0-1) not affinity-space. + adjusted_inequalities_with_random_negatives = ( + ["<"] * int(num_random_negative.sum()) + + list(adjusted_inequalities)) + else: + # Randomly sample random negative affinities + y_dict_with_random_negatives = { + "output": numpy.concatenate([ + from_ic50( + numpy.random.uniform( + self.hyperparameters[ + 'random_negative_affinity_min'], + self.hyperparameters[ + 'random_negative_affinity_max'], + int(num_random_negative.sum()))), + y_values, + ]), + } if sample_weights is not None: sample_weights_with_random_negatives = numpy.concatenate([ numpy.ones(int(num_random_negative.sum())), @@ -429,6 +568,11 @@ class Class1NeuralNetwork(object): else: sample_weights_with_random_negatives = None + if loss_encode_y_function is not None: + y_dict_with_random_negatives['output'] = loss_encode_y_function( + y_dict_with_random_negatives['output'], + adjusted_inequalities_with_random_negatives) + val_losses = [] min_val_loss_iteration = None min_val_loss = None @@ -436,6 +580,7 @@ class Class1NeuralNetwork(object): self.loss_history = collections.defaultdict(list) start = time.time() last_progress_print = None + x_dict_with_random_negatives = {} for i in range(self.hyperparameters['max_epochs']): random_negative_peptides_list = [] for (length, count) in num_random_negative.iteritems(): @@ -444,21 +589,45 @@ class Class1NeuralNetwork(object): count, length=length, distribution=aa_distribution)) + random_negative_peptides = EncodableSequences.create( + random_negative_peptides_list) random_negative_peptides_encoding = ( - self.peptides_to_network_input( - random_negative_peptides_list)) - - x_dict_with_random_negatives = { - "peptide": numpy.concatenate([ - random_negative_peptides_encoding, - peptide_encoding, - ]) if len(random_negative_peptides_encoding) > 0 - else peptide_encoding - } - if pseudosequence_length: - # TODO: add random pseudosequences for random negative peptides - raise NotImplementedError( - "Allele pseudosequences unsupported with random negatives") + self.peptides_to_network_input(random_negative_peptides)) + + if not 
x_dict_with_random_negatives: + if len(random_negative_peptides) > 0: + x_dict_with_random_negatives["peptide"] = numpy.concatenate([ + random_negative_peptides_encoding, + peptide_encoding, + ]) + if 'allele' in x_dict_without_random_negatives: + x_dict_with_random_negatives['allele'] = numpy.concatenate([ + x_dict_without_random_negatives['allele'][ + numpy.random.choice( + x_dict_without_random_negatives[ + 'allele'].shape[0], + size=len(random_negative_peptides_list))], + x_dict_without_random_negatives['allele'] + ]) + else: + x_dict_with_random_negatives = ( + x_dict_without_random_negatives) + else: + # Update x_dict_with_random_negatives in place. + # This is more memory efficient than recreating it as above. + if len(random_negative_peptides) > 0: + x_dict_with_random_negatives["peptide"][:len(random_negative_peptides)] = ( + random_negative_peptides_encoding + ) + if 'allele' in x_dict_with_random_negatives: + x_dict_with_random_negatives['allele'][:len(random_negative_peptides)] = ( + x_dict_with_random_negatives['allele'][ + len(random_negative_peptides) + numpy.random.choice( + x_dict_with_random_negatives['allele'].shape[0] - + len(random_negative_peptides), + size=len(random_negative_peptides)) + ] + ) fit_history = self.network().fit( x_dict_with_random_negatives, @@ -499,7 +668,7 @@ class Class1NeuralNetwork(object): self.hyperparameters['patience']) if i > threshold: print((progress_preamble + " " + - "Early stopping at epoch %3d / %3d: loss=%g. " + "Stopping at epoch %3d / %3d: loss=%g. " "Min val loss (%s) at epoch %s" % ( i, self.hyperparameters['max_epochs'], @@ -509,7 +678,7 @@ class Class1NeuralNetwork(object): break self.fit_seconds = time.time() - start - def predict(self, peptides, allele_pseudosequences=None, batch_size=4096): + def predict(self, peptides, allele_encoding=None, batch_size=4096): """ Predict affinities @@ -517,7 +686,7 @@ class Class1NeuralNetwork(object): ---------- peptides : EncodableSequences or list of string - allele_pseudosequences : EncodableSequences or list of string, optional + allele_pseudosequences : AlleleEncoding, optional Only required when this model is a pan-allele model batch_size : int @@ -530,33 +699,26 @@ class Class1NeuralNetwork(object): x_dict = { 'peptide': self.peptides_to_network_input(peptides) } - if allele_pseudosequences is not None: - pseudosequences_input = self.pseudosequence_to_network_input( - allele_pseudosequences) - x_dict['pseudosequence'] = pseudosequences_input + if allele_encoding is not None: + allele_input = self.allele_encoding_to_network_input(allele_encoding) + x_dict['allele'] = allele_input network = self.network(borrow=True) raw_predictions = network.predict(x_dict, batch_size=batch_size) predictions = numpy.array(raw_predictions, dtype = "float64")[:,0] return to_ic50(predictions) - def compile(self): - """ - Compile the keras model. Used internally. 
- """ - self.network().compile( - **self.compile_hyperparameter_defaults.subselect( - self.hyperparameters)) - @staticmethod def make_network( - pseudosequence_length, + allele_encoding_dims, kmer_size, peptide_amino_acid_encoding, - use_embedding, embedding_input_dim, embedding_output_dim, - pseudosequence_use_embedding, + allele_dense_layer_sizes, + peptide_dense_layer_sizes, + peptide_allele_merge_method, + peptide_allele_merge_activation, layer_sizes, dense_layer_l1_regularization, dense_layer_l2_regularization, @@ -580,7 +742,7 @@ class Class1NeuralNetwork(object): from keras.layers.embeddings import Embedding from keras.layers.normalization import BatchNormalization - if use_embedding or peptide_amino_acid_encoding == "embedding": + if peptide_amino_acid_encoding == "embedding": peptide_input = Input( shape=(kmer_size,), dtype='int32', name='peptide') current_layer = Embedding( @@ -600,6 +762,12 @@ class Class1NeuralNetwork(object): inputs = [peptide_input] + kernel_regularizer = None + l1 = dense_layer_l1_regularization + l2 = dense_layer_l2_regularization + if l1 > 0 or l2 > 0: + kernel_regularizer = keras.regularizers.l1_l2(l1, l2) + for (i, locally_connected_params) in enumerate(locally_connected_layers): current_layer = keras.layers.LocallyConnected1D( name="lc_%d" % i, @@ -607,6 +775,13 @@ class Class1NeuralNetwork(object): current_layer = Flatten(name="flattened_0")(current_layer) + for (i, layer_size) in enumerate(peptide_dense_layer_sizes): + current_layer = Dense( + layer_size, + name="peptide_dense_%d" % i, + kernel_regularizer=kernel_regularizer, + activation=activation)(current_layer) + if batch_normalization: current_layer = BatchNormalization(name="batch_norm_early")( current_layer) @@ -615,37 +790,41 @@ class Class1NeuralNetwork(object): current_layer = Dropout(dropout_probability, name="dropout_early")( current_layer) - if pseudosequence_length: - if pseudosequence_use_embedding: - pseudosequence_input = Input( - shape=(pseudosequence_length,), - dtype='int32', - name='pseudosequence') - pseudo_embedding_layer = Embedding( - input_dim=embedding_input_dim, - output_dim=embedding_output_dim, - input_length=pseudosequence_length, - embeddings_initializer=embedding_init_method)( - pseudosequence_input) + if allele_encoding_dims: + allele_input = Input( + shape=allele_encoding_dims, + dtype='float32', + name='allele') + inputs.append(allele_input) + allele_embedding_layer = Flatten(name="allele_flat")(allele_input) + + for (i, layer_size) in enumerate(allele_dense_layer_sizes): + allele_embedding_layer = Dense( + layer_size, + name="allele_dense_%d" % i, + kernel_regularizer=kernel_regularizer, + activation=activation)(allele_embedding_layer) + + if peptide_allele_merge_method == 'concatenate': + current_layer = keras.layers.concatenate([ + current_layer, allele_embedding_layer + ], name="allele_peptide_merged") + elif peptide_allele_merge_method == 'multiply': + current_layer = keras.layers.multiply([ + current_layer, allele_embedding_layer + ], name="allele_peptide_merged") else: - pseudosequence_input = Input( - shape=(pseudosequence_length, 21), - dtype='float32', name='peptide') - pseudo_embedding_layer = pseudosequence_input - inputs.append(pseudosequence_input) - pseudo_embedding_layer = Flatten(name="flattened_1")( - pseudo_embedding_layer) - - current_layer = keras.layers.concatenate([ - current_layer, pseudo_embedding_layer], name="concatenated_0") + raise ValueError( + "Unsupported peptide_allele_encoding_merge_method: %s" + % peptide_allele_merge_method) + + 
if peptide_allele_merge_activation: + current_layer = keras.layers.Activation( + peptide_allele_merge_activation, + name="alelle_peptide_merged_%s" % + peptide_allele_merge_activation)(current_layer) for (i, layer_size) in enumerate(layer_sizes): - kernel_regularizer = None - l1 = dense_layer_l1_regularization - l2 = dense_layer_l2_regularization - if l1 > 0 or l2 > 0: - kernel_regularizer = keras.regularizers.l1_l2(l1, l2) - current_layer = Dense( layer_size, activation=activation, diff --git a/mhcflurry/common.py b/mhcflurry/common.py index abc2a72c7b7efc1893f2f3630c48b3166169d5ef..942e9feac4064f6076524826bc504211e2d70130 100644 --- a/mhcflurry/common.py +++ b/mhcflurry/common.py @@ -2,6 +2,7 @@ from __future__ import print_function, division, absolute_import import collections import logging import sys +import os import numpy import pandas @@ -9,6 +10,34 @@ import pandas from . import amino_acid +def set_keras_backend(backend): + """ + Configure Keras backend to use GPU or CPU. Only tensorflow is supported. + + Must be called before Keras has been imported. + + backend must be 'tensorflow-cpu' or 'tensorflow-gpu'. + """ + os.environ["KERAS_BACKEND"] = "tensorflow" + + if backend == "tensorflow-cpu": + print("Forcing tensorflow/CPU backend.") + os.environ["CUDA_VISIBLE_DEVICES"] = "" + device_count = {'CPU': 1, 'GPU': 0} + elif backend == "tensorflow-gpu": + print("Forcing tensorflow/GPU backend.") + device_count = {'CPU': 0, 'GPU': 1} + else: + raise ValueError("Unsupported backend: %s" % backend) + + import tensorflow + from keras import backend as K + config = tensorflow.ConfigProto( + device_count=device_count) + session = tensorflow.Session(config=config) + K.set_session(session) + + def configure_logging(verbose=False): """ Configure logging module using defaults. diff --git a/mhcflurry/custom_loss.py b/mhcflurry/custom_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..cc885e003af6ec5105db900efaf3709df429f88e --- /dev/null +++ b/mhcflurry/custom_loss.py @@ -0,0 +1,96 @@ +""" +Custom loss functions. + +For losses supporting inequalities, each training data point is associated with +one of (=), (<), or (>). For e.g. (>) inequalities, penalization is applied only +if the prediction is less than the given value. +""" + +import pandas +from numpy import isnan, array + +CUSTOM_LOSSES = {} + + +class MSEWithInequalities(object): + """ + Supports training a regressor on data that includes inequalities + (e.g. x < 100). Mean square error is used as the loss for elements with + an (=) inequality. For elements with e.g. a (> 0.5) inequality, then the loss + for that element is (y - 0.5)^2 (standard MSE) if y < 500 and 0 otherwise. + + This loss assumes that the normal range for y_true and y_pred is 0 - 1. As a + hack, the implementation uses other intervals for y_pred to encode the + inequality information. + + y_true is interpreted as follows: + + between 0 - 1 + Regular MSE loss is used. Penality (y_pred - y_true)**2 is applied if + y_pred is greater or less than y_true. + + between 2 - 3: + Treated as a "<" inequality. Penality (y_pred - (y_true - 2))**2 is + applied only if y_pred is greater than y_true - 2. + + between 4 - 5: + Treated as a ">" inequality. Penality (y_pred - (y_true - 4))**2 is + applied only if y_pred is less than y_true - 4. 
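# --- Editor's note: worked example, not part of the patch ---------------
# The y_true encoding as actually implemented by encode_y() below uses the
# offsets "=" -> +0, ">" -> +2, "<" -> +4:
#   MSEWithInequalities.encode_y([0.9, 0.1, 0.5], inequalities=["=", ">", "<"])
#   # -> approximately [0.9, 2.1, 4.5]
# The loss then recovers the target by subtracting the offset and applies the
# squared error only on the side of the target the inequality forbids. Note
# that Class1NeuralNetwork.fit() flips ">" and "<" before encoding, because
# from_ic50() inverts the direction (a low nM affinity maps to a y near 1).
# -------------------------------------------------------------------------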
+ """ + name = "mse_with_inequalities" + supports_inequalities = True + + @staticmethod + def encode_y(y, inequalities=None): + y = array(y, dtype="float32") + if isnan(y).any(): + raise ValueError("y contains NaN") + if (y > 1.0).any(): + raise ValueError("y contains values > 1.0") + if (y < 0.0).any(): + raise ValueError("y contains values < 0.0") + + if inequalities is None: + encoded = y + else: + offsets = pandas.Series(inequalities).map({ + '=': 0, + '>': 2, + '<': 4, + }).values + if isnan(offsets).any(): + raise ValueError("Invalid inequality. Must be =, <, or >") + encoded = y + offsets + assert not isnan(encoded).any() + return encoded + + @staticmethod + def loss(y_true, y_pred): + # We always delay import of Keras so that mhcflurry can be imported initially + # without tensorflow debug output, etc. + from keras import backend as K + + # Handle (=) inequalities + diff1 = y_pred - y_true + diff1 *= K.cast(y_true >= 0.0, "float32") + diff1 *= K.cast(y_true <= 1.0, "float32") + + # Handle (>) inequalities + diff2 = y_pred - (y_true - 2.0) + diff2 *= K.cast(y_true >= 2.0, "float32") + diff2 *= K.cast(y_true <= 3.0, "float32") + diff2 *= K.cast(diff2 < 0.0, "float32") + + # Handle (<) inequalities + diff3 = y_pred - (y_true - 4.0) + diff3 *= K.cast(y_true >= 4.0, "float32") + diff3 *= K.cast(diff3 > 0.0, "float32") + + return ( + K.sum(K.square(diff1), axis=-1) + + K.sum(K.square(diff2), axis=-1) + + K.sum(K.square(diff3), axis=-1)) + +# Register custom losses. +for cls in [MSEWithInequalities]: + CUSTOM_LOSSES[cls.name] = cls() \ No newline at end of file diff --git a/mhcflurry/downloads.py b/mhcflurry/downloads.py index e88675ed00b78718f0687e28ed4911473593e13c..b3b7e9d3b78d421af685841f370b5a99885f595c 100644 --- a/mhcflurry/downloads.py +++ b/mhcflurry/downloads.py @@ -9,7 +9,7 @@ from __future__ import ( ) import logging import yaml -from os.path import join, exists +from os.path import join, exists, relpath from pipes import quote from os import environ from collections import OrderedDict @@ -20,11 +20,14 @@ ENVIRONMENT_VARIABLES = [ "MHCFLURRY_DATA_DIR", "MHCFLURRY_DOWNLOADS_CURRENT_RELEASE", "MHCFLURRY_DOWNLOADS_DIR", + "MHCFLURRY_DEFAULT_CLASS1_MODELS" ] _DOWNLOADS_DIR = None _CURRENT_RELEASE = None _METADATA = None +_MHCFLURRY_DEFAULT_CLASS1_MODELS_DIR = environ.get( + "MHCFLURRY_DEFAULT_CLASS1_MODELS_DIR") def get_downloads_dir(): @@ -51,6 +54,37 @@ def get_downloads_metadata(): return _METADATA +def get_default_class1_models_dir(test_exists=True): + """ + Return the absolute path to the default class1 models dir. + + If environment variable MHCFLURRY_DEFAULT_CLASS1_MODELS_DIR is set to an + absolute path, return that path. If it's set to a relative path (i.e. does + not start with /) then return that path taken to be relative to the mhcflurry + downloads dir. + + If environment variable MHCFLURRY_DEFAULT_CLASS1_MODELS_DIR is NOT set, + then return the path to downloaded models in the "models_class1" download. 
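# --- Editor's note: illustrative sketch, not part of the patch ----------
# How the environment variable is resolved. os.path.join() already covers
# the absolute-path case, since joining onto an absolute second argument
# returns that argument unchanged:
#   os.path.join("/downloads/dir", "models_class1/models")   # -> "/downloads/dir/models_class1/models"
#   os.path.join("/downloads/dir", "/custom/models")          # -> "/custom/models"
# So, for example:
#   MHCFLURRY_DEFAULT_CLASS1_MODELS_DIR=/custom/models mhcflurry-predict ...
# points the predict command at a custom model directory.
# -------------------------------------------------------------------------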
+ + Parameters + ---------- + + test_exists : boolean, optional + Whether to raise an exception of the path does not exist + + Returns + ------- + string : absolute path + """ + if _MHCFLURRY_DEFAULT_CLASS1_MODELS_DIR: + result = join(get_downloads_dir(), _MHCFLURRY_DEFAULT_CLASS1_MODELS_DIR) + if test_exists and not exists(result): + raise IOError("No such directory: %s" % result) + return result + else: + return get_path("models_class1", "models", test_exists=test_exists) + + def get_current_release_downloads(): """ Return a dict of all available downloads in the current release. diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml index cf7709b9edf7fb46c1f2ae5e7b23ebb681099380..010aaa17b696fa5dfa026762e5b45b79f11198e5 100644 --- a/mhcflurry/downloads.yml +++ b/mhcflurry/downloads.yml @@ -8,7 +8,7 @@ # by name, the downloads with "default=true" are downloaded. # This should usually be the latest release. -current-release: 1.0.0 +current-release: 1.1.0 # An integer indicating what models the current MHCflurry code base is compatible # with. Increment this integer when changes are made to MHCflurry that would break @@ -17,54 +17,89 @@ current-compatibility-version: 2 # Add new releases here as they are made. releases: + 1.1.0: + compatibility-version: 2 + downloads: + - name: models_class1 + url: http://github.com/openvax/mhcflurry/releases/download/pre-1.1/models_class1.20180205.tar.bz2 + default: true + + - name: models_class1_no_mass_spec + url: http://github.com/openvax/mhcflurry/releases/download/pre-1.1/models_class1_no_mass_spec.20180205.tar.bz2 + default: true + + - name: models_class1_experiments1 + url: http://github.com/openvax/mhcflurry/releases/download/pre-1.1/models_class1_experiments1.tar.bz2 + default: false + + - name: cross_validation_class1 + url: http://github.com/openvax/mhcflurry/releases/download/pre-1.1/cross_validation_class1.tar.bz2 + default: false + + - name: data_iedb + url: https://github.com/openvax/mhcflurry/releases/download/pre-1.0/data_iedb.tar.bz2 + default: false + + - name: data_published + url: http://github.com/openvax/mhcflurry/releases/download/pre-1.1/data_published.tar.bz2 + default: false + + - name: data_systemhcatlas + url: http://github.com/openvax/mhcflurry/releases/download/pre-1.1/data_systemhcatlas.tar.bz2 + default: false + + - name: data_curated + url: https://github.com/openvax/mhcflurry/releases/download/pre-1.1/data_curated.tar.bz2 + default: true + 1.0.0: compatibility-version: 2 downloads: - name: models_class1 - url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/models_class1.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/pre-1.0/models_class1.tar.bz2 default: true - name: models_class1_experiments1 - url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/models_class1_experiments1.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/pre-1.0/models_class1_experiments1.tar.bz2 default: false - name: cross_validation_class1 - url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/cross_validation_class1.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/pre-1.0/cross_validation_class1.tar.bz2 default: false - name: data_iedb - url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_iedb.tar.bz2 + url: https://github.com/openvax/mhcflurry/releases/download/pre-1.0/data_iedb.tar.bz2 default: false - name: data_kim2014 - url: 
http://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_kim2014.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/0.9.1/data_kim2014.tar.bz2 default: false - name: data_curated - url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_curated.tar.bz2 + url: https://github.com/openvax/mhcflurry/releases/download/pre-1.0/data_curated.tar.bz2 default: true 0.9.2: compatibility-version: 2 downloads: - name: models_class1 - url: http://github.com/hammerlab/mhcflurry/releases/download/0.9.2/models_class1.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/0.9.2/models_class1.tar.bz2 default: true - name: data_curated - url: https://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_curated.tar.bz2 + url: https://github.com/openvax/mhcflurry/releases/download/0.9.1/data_curated.tar.bz2 default: true - name: data_kim2014 - url: http://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_kim2014.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/0.9.1/data_kim2014.tar.bz2 default: false - name: data_iedb - url: https://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_iedb.tar.bz2 + url: https://github.com/openvax/mhcflurry/releases/download/0.9.1/data_iedb.tar.bz2 default: false - name: models_class1_experiments1 - url: http://github.com/hammerlab/mhcflurry/releases/download/0.9.2/models_class1_experiments1.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/0.9.2/models_class1_experiments1.tar.bz2 default: false @@ -72,55 +107,55 @@ releases: compatibility-version: 2 downloads: - name: models_class1 - url: http://github.com/hammerlab/mhcflurry/releases/download/0.9.1/models_class1.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/0.9.1/models_class1.tar.bz2 default: true - name: data_curated - url: https://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_curated.tar.bz2 + url: https://github.com/openvax/mhcflurry/releases/download/0.9.1/data_curated.tar.bz2 default: true - name: data_kim2014 - url: http://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_kim2014.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/0.9.1/data_kim2014.tar.bz2 default: false - name: data_iedb - url: https://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_iedb.tar.bz2 + url: https://github.com/openvax/mhcflurry/releases/download/0.9.1/data_iedb.tar.bz2 default: false - name: models_class1_experiments1 - url: http://github.com/hammerlab/mhcflurry/releases/download/0.9.1/models_class1_experiments1.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/0.9.1/models_class1_experiments1.tar.bz2 default: false 0.2.0: compatibility-version: 1 downloads: - name: models_class1_allele_specific_ensemble - url: http://github.com/hammerlab/mhcflurry/releases/download/0.2.0/models_class1_allele_specific_ensemble.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/0.2.0/models_class1_allele_specific_ensemble.tar.bz2 default: true - name: models_class1_allele_specific_single - url: http://github.com/hammerlab/mhcflurry/releases/download/0.2.0/models_class1_allele_specific_single.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/0.2.0/models_class1_allele_specific_single.tar.bz2 default: false - name: data_kim2014 - url: http://github.com/hammerlab/mhcflurry/releases/download/0.0.8/data_kim2014.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/0.0.8/data_kim2014.tar.bz2 
default: true - name: data_combined_iedb_kim2014 - url: http://github.com/hammerlab/mhcflurry/releases/download/0.0.8/data_combined_iedb_kim2014.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/0.0.8/data_combined_iedb_kim2014.tar.bz2 default: true 0.0.8: compatibility-version: 1 downloads: - name: models_class1_allele_specific_single - url: http://github.com/hammerlab/mhcflurry/releases/download/0.0.8/models_class1_allele_specific_single.no_impute.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/0.0.8/models_class1_allele_specific_single.no_impute.tar.bz2 default: true - name: data_kim2014 - url: http://github.com/hammerlab/mhcflurry/releases/download/0.0.8/data_kim2014.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/0.0.8/data_kim2014.tar.bz2 default: true - name: data_combined_iedb_kim2014 - url: http://github.com/hammerlab/mhcflurry/releases/download/0.0.8/data_combined_iedb_kim2014.tar.bz2 + url: http://github.com/openvax/mhcflurry/releases/download/0.0.8/data_combined_iedb_kim2014.tar.bz2 default: true diff --git a/mhcflurry/downloads_command.py b/mhcflurry/downloads_command.py index 08648a99e8eb6d2c03967cfd3ebb0bd12bbdaab8..8dfc5fb667953459e51fcc0ba91659d4c378568f 100644 --- a/mhcflurry/downloads_command.py +++ b/mhcflurry/downloads_command.py @@ -28,6 +28,7 @@ from pipes import quote import errno import tarfile from tempfile import mkstemp +from tqdm import tqdm try: from urllib.request import urlretrieve except ImportError: @@ -120,6 +121,23 @@ def yes_no(boolean): return "YES" if boolean else "NO" +# For progress bar on download. See https://pypi.python.org/pypi/tqdm +class TqdmUpTo(tqdm): + """Provides `update_to(n)` which uses `tqdm.update(delta_n)`.""" + def update_to(self, b=1, bsize=1, tsize=None): + """ + b : int, optional + Number of blocks transferred so far [default: 1]. + bsize : int, optional + Size of each block (in tqdm units) [default: 1]. + tsize : int, optional + Total size (in tqdm units). If [default: None] remains unchanged. + """ + if tsize is not None: + self.total = tsize + self.update(b * bsize - self.n) # will also set self.n = b * bsize + + def fetch_subcommand(args): def qprint(msg): if not args.quiet: @@ -167,16 +185,18 @@ def fetch_subcommand(args): yes_no(item in items_to_fetch), info['metadata']["url"])) - # TODO: Add a download progress bar? - # See http://stackoverflow.com/questions/51212/how-to-write-a-download-progress-indicator-in-python - # Also may want to extract into somewhere temporary and then rename to - # making an incomplete extract if the process is killed. + # TODO: may want to extract into somewhere temporary and then rename to + # avoid making an incomplete extract if the process is killed. 
for item in items_to_fetch: metadata = downloads[item]['metadata'] (temp_fd, target_path) = mkstemp() try: qprint("Downloading: %s" % metadata['url']) - urlretrieve(metadata['url'], target_path) + urlretrieve( + metadata['url'], + target_path, + reporthook=TqdmUpTo( + unit='B', unit_scale=True, miniters=1).update_to) qprint("Downloaded to: %s" % quote(target_path)) tar = tarfile.open(target_path, 'r:bz2') @@ -191,7 +211,9 @@ def fetch_subcommand(args): "Archive has suspicious names: %s" % bad_names) result_dir = get_path(item, test_exists=False) os.mkdir(result_dir) - tar.extractall(path=result_dir) + + for member in tqdm(tar.getmembers(), desc='Extracting'): + tar.extractall(path=result_dir, members=[member]) tar.close() qprint("Extracted %d files to: %s" % ( len(names), quote(result_dir))) @@ -229,15 +251,13 @@ def info_subcommand(args): downloads = get_current_release_downloads() - format_string = "%-40s %-12s %-12s %-20s " - print(format_string % ( - "DOWNLOAD NAME", "DOWNLOADED?", "DEFAULT?", "URL")) + format_string = "%-40s %-12s %-20s " + print(format_string % ("DOWNLOAD NAME", "DOWNLOADED?", "URL")) for (item, info) in downloads.items(): print(format_string % ( item, yes_no(info['downloaded']), - yes_no(info['metadata']['default']), info['metadata']["url"])) diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py index aab2b0fe1874b288033afa4419de1919e29f4e1e..6dcd5e6328009baa6e4e3da4a806449f3fa83cfd 100644 --- a/mhcflurry/encodable_sequences.py +++ b/mhcflurry/encodable_sequences.py @@ -104,7 +104,6 @@ class EncodableSequences(object): ------- numpy.array with shape (num sequences, max_length, m) where m is vector_encoding_length(vector_encoding_name) - """ cache_key = ( "fixed_length_vector_encoding", diff --git a/mhcflurry/ensemble_centrality.py b/mhcflurry/ensemble_centrality.py new file mode 100644 index 0000000000000000000000000000000000000000..8a812d7656b40099f8bec67e7a0d5832c9cc677f --- /dev/null +++ b/mhcflurry/ensemble_centrality.py @@ -0,0 +1,37 @@ +""" +Measures of centrality (e.g. mean) used to combine predictions across an +ensemble. The input to these functions are log affinities, and they are expected +to return a centrality measure also in log-space. +""" + +import numpy +from functools import partial + + +def robust_mean(log_values): + """ + Mean of values falling within the 25-75 percentiles. + + Parameters + ---------- + log_values : 2-d numpy.array + Center is computed along the second axis (i.e. per row). + + Returns + ------- + center : numpy.array of length log_values.shape[1] + + """ + if log_values.shape[1] <= 3: + # Too few values to use robust mean. 
+ return numpy.nanmean(log_values, axis=1) + mask = ( + (log_values <= numpy.nanpercentile(log_values, 75, axis=1).reshape((-1, 1))) & + (log_values >= numpy.nanpercentile(log_values, 25, axis=1).reshape((-1, 1)))) + return (log_values * mask.astype(float)).sum(1) / mask.sum(1) + +CENTRALITY_MEASURES = { + "mean": partial(numpy.nanmean, axis=1), + "median": partial(numpy.nanmedian, axis=1), + "robust_mean": robust_mean, +} \ No newline at end of file diff --git a/mhcflurry/predict_command.py b/mhcflurry/predict_command.py index 3eb9f16fe2b6fb7d40e37001ee306bcfd3b5fa11..61f64e4ffd3413f95b7356c7c700fc4a6defa436 100644 --- a/mhcflurry/predict_command.py +++ b/mhcflurry/predict_command.py @@ -33,8 +33,9 @@ import logging import pandas -from .downloads import get_path +from .downloads import get_default_class1_models_dir from .class1_affinity_predictor import Class1AffinityPredictor +from .version import __version__ parser = argparse.ArgumentParser( @@ -61,7 +62,11 @@ helper_args.add_argument( default=False, help="Prints the list of supported peptide lengths and exits" ) - +helper_args.add_argument( + "--version", + action="version", + version="mhcflurry %s" % __version__, +) input_args = parser.add_argument_group(title="Required input arguments") input_args.add_argument( @@ -109,6 +114,11 @@ output_args.add_argument( metavar="NAME", default="mhcflurry_", help="Prefix for output column names. Default: '%(default)s'") +output_args.add_argument( + "--output-delimiter", + metavar="CHAR", + default=",", + help="Delimiter character for results. Default: '%(default)s'") model_args = parser.add_argument_group(title="Optional model settings") @@ -117,7 +127,7 @@ model_args.add_argument( metavar="DIR", default=None, help="Directory containing models. " - "Default: %s" % get_path("models_class1", "models", test_exists=False)) + "Default: %s" % get_default_class1_models_dir(test_exists=False)) model_args.add_argument( "--include-individual-model-predictions", action="store_true", @@ -129,12 +139,16 @@ model_args.add_argument( def run(argv=sys.argv[1:]): args = parser.parse_args(argv) + # It's hard to pass a tab in a shell, so we correct a common error: + if args.output_delimiter == "\\t": + args.output_delimiter = "\t" + models_dir = args.models if models_dir is None: # The reason we set the default here instead of in the argument parser is that # we want to test_exists at this point, so the user gets a message instructing # them to download the models if needed. 
- models_dir = get_path("models_class1", "models") + models_dir = get_default_class1_models_dir(test_exists=True) predictor = Class1AffinityPredictor.load(models_dir) # The following two are informative commands that can come @@ -200,7 +214,7 @@ def run(argv=sys.argv[1:]): df[args.prediction_column_prefix + col] = predictions[col] if args.out: - df.to_csv(args.out, index=False) + df.to_csv(args.out, index=False, sep=args.output_delimiter) print("Wrote: %s" % args.out) else: - df.to_csv(sys.stdout, index=False) + df.to_csv(sys.stdout, index=False, sep=args.output_delimiter) diff --git a/mhcflurry/train_allele_specific_models_command.py b/mhcflurry/train_allele_specific_models_command.py index 6602761bf7c6a096d7c23438543f297fb8356bb5..30a8ae6934461631bec68fad5d8c6fcb4467cc07 100644 --- a/mhcflurry/train_allele_specific_models_command.py +++ b/mhcflurry/train_allele_specific_models_command.py @@ -8,13 +8,31 @@ import sys import time import traceback from multiprocessing import Pool +from functools import partial +from pprint import pprint import pandas import yaml from mhcnames import normalize_allele_name +import tqdm # progress bar -from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor -from mhcflurry.common import configure_logging +from .class1_affinity_predictor import Class1AffinityPredictor +from .common import configure_logging, set_keras_backend + + +# To avoid pickling large matrices to send to child processes when running in +# parallel, we use this global variable as a place to store data. Data that is +# stored here before creating the thread pool will be inherited to the child +# processes upon fork() call, allowing us to share large data with the workers +# efficiently. +GLOBAL_DATA = {} + + +# Note on parallelization: +# It seems essential currently (tensorflow==1.4.1) that no processes are forked +# after tensorflow has been used at all, which includes merely importing +# keras.backend. So we must make sure not to use tensorflow in the main process +# if we are running in parallel. parser = argparse.ArgumentParser(usage=__doc__) @@ -52,6 +70,11 @@ parser.add_argument( action="store_true", default=False, help="Use only quantitative training data") +parser.add_argument( + "--ignore-inequalities", + action="store_true", + default=False, + help="Do not use affinity value inequalities even when present in data") parser.add_argument( "--percent-rank-calibration-num-peptides-per-length", type=int, @@ -79,22 +102,33 @@ parser.add_argument( help="Keras verbosity. Default: %(default)s", default=0) parser.add_argument( - "--parallelization-num-jobs", - default=1, + "--num-jobs", + default=[1], type=int, metavar="N", - help="Parallelization jobs. Experimental. Does NOT work with tensorflow. " - "Set to 1 for serial run. Set to 0 to use number of cores. " - "Default: %(default)s.") + nargs="+", + help="Number of processes to parallelize training and percent rank " + "calibration over, respectively. Experimental. If only one value is specified " + "then the same number of jobs is used for both phases." + "Set to 1 for serial run. Set to 0 to use number of cores. Default: %(default)s.") +parser.add_argument( + "--backend", + choices=("tensorflow-gpu", "tensorflow-cpu"), + help="Keras backend. 
If not specified will use system default.") def run(argv=sys.argv[1:]): + global GLOBAL_DATA + # On sigusr1 print stack trace print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid()) signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack()) args = parser.parse_args(argv) + if args.backend: + set_keras_backend(args.backend) + configure_logging(verbose=args.verbosity > 1) hyperparameters_lst = yaml.load(open(args.hyperparameters)) @@ -115,6 +149,10 @@ def run(argv=sys.argv[1:]): ] print("Subselected to quantitative: %s" % (str(df.shape))) + if args.ignore_inequalities and "measurement_inequality" in df.columns: + print("Dropping measurement_inequality column") + del df["measurement_inequality"] + allele_counts = df.allele.value_counts() if args.allele: @@ -130,15 +168,18 @@ def run(argv=sys.argv[1:]): print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles))) print("Training data: %s" % (str(df.shape))) + GLOBAL_DATA["train_data"] = df + predictor = Class1AffinityPredictor() - if args.parallelization_num_jobs == 1: + if args.num_jobs[0] == 1: # Serial run + print("Running in serial.") worker_pool = None else: worker_pool = Pool( processes=( - args.parallelization_num_jobs - if args.parallelization_num_jobs else None)) + args.num_jobs[0] + if args.num_jobs[0] else None)) print("Using worker pool: %s" % str(worker_pool)) if args.out_models_dir and not os.path.exists(args.out_models_dir): @@ -146,6 +187,8 @@ def run(argv=sys.argv[1:]): os.mkdir(args.out_models_dir) print("Done.") + start = time.time() + work_items = [] for (h, hyperparameters) in enumerate(hyperparameters_lst): n_models = None if 'n_models' in hyperparameters: @@ -158,10 +201,7 @@ def run(argv=sys.argv[1:]): if args.max_epochs: hyperparameters['max_epochs'] = args.max_epochs - work_items = [] - total_data_to_train_on = 0 - for (i, (allele, sub_df)) in enumerate(df.groupby("allele")): - total_data_to_train_on += len(sub_df) * n_models + for (i, allele) in enumerate(df.allele.unique()): for model_group in range(n_models): work_dict = { 'model_group': model_group, @@ -171,7 +211,7 @@ def run(argv=sys.argv[1:]): 'hyperparameter_set_num': h, 'num_hyperparameter_sets': len(hyperparameters_lst), 'allele': allele, - 'data': sub_df, + 'data': None, # subselect from GLOBAL_DATA["train_data"] 'hyperparameters': hyperparameters, 'verbose': args.verbosity, 'predictor': predictor if not worker_pool else None, @@ -179,55 +219,114 @@ def run(argv=sys.argv[1:]): } work_items.append(work_dict) - if worker_pool: - print("Processing %d work items in parallel." % len(work_items)) - predictors = worker_pool.map(work_entrypoint, work_items, chunksize=1) - print("Merging %d predictors fit in parallel." % (len(predictors))) - predictor = Class1AffinityPredictor.merge([predictor] + predictors) - print("Saving merged predictor to: %s" % args.out_models_dir) - predictor.save(args.out_models_dir) - else: - # Run in serial. In this case, every worker is passed the same predictor, - # which it adds models to, so no merging is required. It also saves - # as it goes so no saving is required at the end. - start = time.time() - data_trained_on = 0 - while work_items: - item = work_items.pop(0) - work_predictor = work_entrypoint(item) - assert work_predictor is predictor - - # When running in serial we try to estimate time remaining. 
- data_trained_on += len(item['data']) - progress = float(data_trained_on) / total_data_to_train_on - time_elapsed = time.time() - start - total_time = time_elapsed / progress - print( - "Estimated total training time: %0.2f min, " - "remaining: %0.2f min" % ( - total_time / 60, - (total_time - time_elapsed) / 60)) + if worker_pool: + print("Processing %d work items in parallel." % len(work_items)) + + # We sort here so the predictors are in order of hyperparameter set num. + # This is convenient so that the neural networks get merged for each + # allele in the same order. + predictors = [ + predictor for (_, predictor) + in sorted( + tqdm.tqdm( + worker_pool.imap_unordered( + train_model_entrypoint, work_items, chunksize=1), + total=len(work_items)), + key=lambda pair: pair[0]) + ] + print("Merging %d predictors fit in parallel." % (len(predictors))) + predictor = Class1AffinityPredictor.merge([predictor] + predictors) + print("Saving merged predictor to: %s" % args.out_models_dir) + predictor.save(args.out_models_dir) + else: + # Run in serial. In this case, every worker is passed the same predictor, + # which it adds models to, so no merging is required. It also saves + # as it goes so no saving is required at the end. + for _ in tqdm.trange(len(work_items)): + item = work_items.pop(0) # want to keep freeing up memory + (_, work_predictor) = train_model_entrypoint(item) + assert work_predictor is predictor + assert not work_items + + print("*" * 30) + training_time = time.time() - start + print("Trained affinity predictor with %d networks in %0.2f min." % ( + len(predictor.neural_networks), training_time / 60.0)) + print("*" * 30) if worker_pool: worker_pool.close() worker_pool.join() + worker_pool = None + start = time.time() if args.percent_rank_calibration_num_peptides_per_length > 0: - start = time.time() - print("Performing percent rank calibration.") - predictor.calibrate_percentile_ranks( + alleles = list(predictor.supported_alleles) + + print("Performing percent rank calibration. Encoding peptides.") + encoded_peptides = predictor.calibrate_percentile_ranks( + alleles=[], # don't actually do any calibration, just return peptides num_peptides_per_length=args.percent_rank_calibration_num_peptides_per_length) - print("Finished calibrating percent ranks in %0.2f sec." % ( + + # Now we encode the peptides for each neural network, so the encoding + # becomes cached. + for network in predictor.neural_networks: + network.peptides_to_network_input(encoded_peptides) + assert encoded_peptides.encoding_cache # must have cached the encoding + print("Finished encoding peptides for percent ranks in %0.2f sec." % ( time.time() - start)) + print("Calibrating percent rank calibration for %d alleles." % len(alleles)) + + if args.num_jobs[-1] == 1: + # Serial run + print("Running in serial.") + worker_pool = None + results = ( + calibrate_percentile_ranks( + allele=allele, + predictor=predictor, + peptides=encoded_peptides) + for allele in alleles) + else: + # Parallel run + # Store peptides in global variable so they are in shared memory + # after fork, instead of needing to be pickled. 
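# --- Editor's note: illustrative sketch, not part of the patch ----------
# The GLOBAL_DATA pattern in miniature: anything placed in the module-level
# dict before the Pool is created is inherited by the workers via fork(), so
# the large encoded-peptide matrix never needs to be pickled and piped:
#   GLOBAL_DATA["calibration_peptides"] = encoded_peptides   # set in parent
#   pool = Pool(processes=4)                                  # children fork with a copy
#   results = pool.imap_unordered(worker_function, alleles)   # workers read GLOBAL_DATA
# This relies on the "fork" start method (the default on Linux); it would
# not work with "spawn".
# -------------------------------------------------------------------------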
+ GLOBAL_DATA["calibration_peptides"] = encoded_peptides + worker_pool = Pool( + processes=( + args.num_jobs[-1] + if args.num_jobs[-1] else None)) + print("Using worker pool: %s" % str(worker_pool)) + results = worker_pool.imap_unordered( + partial( + calibrate_percentile_ranks, + predictor=predictor), + alleles, + chunksize=1) + + for result in tqdm.tqdm(results, total=len(alleles)): + predictor.allele_to_percent_rank_transform.update(result) + + print("Done calibrating %d additional alleles." % len(alleles)) predictor.save(args.out_models_dir, model_names_to_write=[]) + percent_rank_calibration_time = time.time() - start -def work_entrypoint(item): - return process_work(**item) + if worker_pool: + worker_pool.close() + worker_pool.join() + + print("Train time: %0.2f min. Percent rank calibration time: %0.2f min." % ( + training_time / 60.0, percent_rank_calibration_time / 60.0)) + print("Predictor written to: %s" % args.out_models_dir) + + +def train_model_entrypoint(item): + return train_model(**item) -def process_work( +def train_model( model_group, n_models, allele_num, @@ -244,6 +343,10 @@ def process_work( if predictor is None: predictor = Class1AffinityPredictor() + if data is None: + full_data = GLOBAL_DATA["train_data"] + data = full_data.loc[full_data.allele == allele] + progress_preamble = ( "[%2d / %2d hyperparameters] " "[%4d / %4d alleles] " @@ -259,22 +362,42 @@ def process_work( train_data = data.sample(frac=1.0) (model,) = predictor.fit_allele_specific_predictors( n_models=1, - architecture_hyperparameters=hyperparameters, + architecture_hyperparameters_list=[hyperparameters], allele=allele, peptides=train_data.peptide.values, affinities=train_data.measurement_value.values, + inequalities=( + train_data.measurement_inequality.values + if "measurement_inequality" in train_data.columns else None), models_dir_for_save=save_to, progress_preamble=progress_preamble, verbose=verbose) if allele_num == 0 and model_group == 0: # For the first model for the first allele, print the architecture. + print("*** HYPERPARAMETER SET %d***" % + (hyperparameter_set_num + 1)) + pprint(hyperparameters) print("*** ARCHITECTURE FOR HYPERPARAMETER SET %d***" % (hyperparameter_set_num + 1)) model.network(borrow=True).summary() - return predictor - + return (hyperparameter_set_num, predictor) + + +def calibrate_percentile_ranks(allele, predictor, peptides=None): + """ + Private helper function. 
+ """ + global GLOBAL_DATA + if peptides is None: + peptides = GLOBAL_DATA["calibration_peptides"] + predictor.calibrate_percentile_ranks( + peptides=peptides, + alleles=[allele]) + return { + allele: predictor.allele_to_percent_rank_transform[allele], + } if __name__ == '__main__': diff --git a/mhcflurry/version.py b/mhcflurry/version.py new file mode 100644 index 0000000000000000000000000000000000000000..6849410aae0a8010e76d5f0a44ced13d750b0989 --- /dev/null +++ b/mhcflurry/version.py @@ -0,0 +1 @@ +__version__ = "1.1.0" diff --git a/requirements.txt b/requirements.txt index 746d97f072e3bc272758b9eb35c82e2e2f7040d0..da0ceda3a648a43ba10b5d6b82c760e5152c1574 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ appdirs scikit-learn mhcnames pyyaml +tqdm diff --git a/setup.py b/setup.py index 0a2f0e49f105d8b18278eb7031ec1e8d163c32ba..9076caf1e6b6dc55053783d966cb3c9b5d68d912 100644 --- a/setup.py +++ b/setup.py @@ -37,10 +37,10 @@ try: import pypandoc readme = pypandoc.convert(readme, to='rst', format='md') except: - logging.warn("Conversion of long_description from MD to RST failed") + logging.warning("Conversion of long_description from MD to RST failed") pass -with open('mhcflurry/__init__.py', 'r') as f: +with open('mhcflurry/version.py', 'r') as f: version = re.search( r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', f.read(), @@ -57,6 +57,7 @@ if __name__ == '__main__': 'scikit-learn', 'mhcnames', 'pyyaml', + 'tqdm', ] if PY2: # concurrent.futures is a standard library in Py3 but Py2 diff --git a/test/test_class1_affinity_predictor.py b/test/test_class1_affinity_predictor.py index 65f1f587316b9c56c1c30dfa0ff4c828c88c9648..40ac8c2f9ea22b1b6ad80495da061bd6b0acf5c5 100644 --- a/test/test_class1_affinity_predictor.py +++ b/test/test_class1_affinity_predictor.py @@ -51,7 +51,7 @@ def test_a1_known_epitopes_in_newly_trained_model(): allele = "HLA-A*01:01" df = pandas.read_csv( get_path( - "data_curated", "curated_training_data.csv.bz2")) + "data_curated", "curated_training_data.no_mass_spec.csv.bz2")) df = df.ix[ (df.allele == allele) & (df.peptide.str.len() >= 8) & @@ -92,10 +92,11 @@ def test_a1_known_epitopes_in_newly_trained_model(): predictor = Class1AffinityPredictor() predictor.fit_allele_specific_predictors( n_models=2, - architecture_hyperparameters=hyperparameters, + architecture_hyperparameters_list=[hyperparameters], allele=allele, peptides=df.peptide.values, affinities=df.measurement_value.values, + verbose=0, ) predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor) @@ -136,7 +137,7 @@ def test_class1_affinity_predictor_a0205_memorize_training_data(): df = pandas.read_csv( get_path( - "data_curated", "curated_training_data.csv.bz2")) + "data_curated", "curated_training_data.no_mass_spec.csv.bz2")) df = df.ix[ df.allele == allele ] @@ -153,10 +154,11 @@ def test_class1_affinity_predictor_a0205_memorize_training_data(): predictor = Class1AffinityPredictor() predictor.fit_allele_specific_predictors( n_models=2, - architecture_hyperparameters=hyperparameters, + architecture_hyperparameters_list=[hyperparameters], allele=allele, peptides=df.peptide.values, affinities=df.measurement_value.values, + verbose=0, ) predictor.calibrate_percentile_ranks(num_peptides_per_length=1000) ic50_pred = predictor.predict(df.peptide.values, allele=allele) diff --git a/test/test_class1_neural_network.py b/test/test_class1_neural_network.py index 835c632d7c2c4dc6cbe1f454bc8e04ee754229d2..b082f28cf89276bf842cc427d98ad7defab0985a 100644 --- a/test/test_class1_neural_network.py +++ 
@@ -1,16 +1,20 @@
+from nose.tools import eq_, assert_less, assert_greater, assert_almost_equal
+
 import numpy
 import pandas
-numpy.random.seed(0)
+from numpy import testing
 
-from mhcflurry.class1_neural_network import Class1NeuralNetwork
+numpy.random.seed(0)
 
-from nose.tools import eq_
-from numpy import testing
+import logging
+logging.getLogger('tensorflow').disabled = True
 
+from mhcflurry.class1_neural_network import Class1NeuralNetwork
 from mhcflurry.downloads import get_path
+from mhcflurry.common import random_peptides
 
 
-def test_class1_affinity_predictor_a0205_training_accuracy():
+def test_class1_neural_network_a0205_training_accuracy():
     # Memorize the dataset.
     hyperparameters = dict(
         activation="tanh",
@@ -33,7 +37,7 @@ def test_class1_affinity_predictor_a0205_training_accuracy():
 
     df = pandas.read_csv(
         get_path(
-            "data_curated", "curated_training_data.csv.bz2"))
+            "data_curated", "curated_training_data.no_mass_spec.csv.bz2"))
     df = df.ix[
         df.allele == allele
     ]
@@ -77,6 +81,109 @@ def test_class1_affinity_predictor_a0205_training_accuracy():
         dense_layer_l1_regularization=0.0,
         dropout_probability=0.0)
     predictor2 = Class1NeuralNetwork(**hyperparameters2)
-    predictor2.fit(df.peptide.values, df.measurement_value.values)
+    predictor2.fit(df.peptide.values, df.measurement_value.values, verbose=0)
 
     eq_(predictor.network().to_json(), predictor2.network().to_json())
+
+def test_inequalities():
+    # Memorize the dataset.
+    hyperparameters = dict(
+        loss="custom:mse_with_inequalities",
+        activation="tanh",
+        layer_sizes=[16],
+        max_epochs=50,
+        early_stopping=False,
+        validation_split=0.0,
+        locally_connected_layers=[
+            {
+                "filters": 8,
+                "activation": "tanh",
+                "kernel_size": 3
+            }
+        ],
+        dense_layer_l1_regularization=0.0,
+        dropout_probability=0.0)
+
+    df = pandas.DataFrame()
+    df["peptide"] = random_peptides(1000, length=9)
+
+    # First half are binders
+    df["binder"] = df.index < len(df) / 2
+    df["value"] = df.binder.map({True: 100, False: 5000})
+    df.loc[:10, "value"] = 1.0  # some strong binders
+    df["inequality1"] = "="
+    df["inequality2"] = df.binder.map({True: "<", False: "="})
+    df["inequality3"] = df.binder.map({True: "=", False: ">"})
+
+    # "A" at start of peptide indicates strong binder
+    df["peptide"] = [
+        ("C" if not row.binder else "A") + row.peptide[1:]
+        for _, row in df.iterrows()
+    ]
+
+    fit_kwargs = {'verbose': 0}
+
+    # Prediction1 uses no inequalities (i.e. all are (=))
+    predictor = Class1NeuralNetwork(**hyperparameters)
+    predictor.fit(
+        df.peptide.values,
+        df.value.values,
+        inequalities=df.inequality1.values,
+        **fit_kwargs)
+    df["prediction1"] = predictor.predict(df.peptide.values)
+
+    # Prediction2 has a (<) inequality on binders and an (=) on non-binders
+    predictor = Class1NeuralNetwork(**hyperparameters)
+    predictor.fit(
+        df.peptide.values,
+        df.value.values,
+        inequalities=df.inequality2.values,
+        **fit_kwargs)
+    df["prediction2"] = predictor.predict(df.peptide.values)
+
+    # Prediction3 has an (=) inequality on binders and a (>) on non-binders
+    predictor = Class1NeuralNetwork(**hyperparameters)
+    predictor.fit(
+        df.peptide.values,
+        df.value.values,
+        inequalities=df.inequality3.values,
+        **fit_kwargs)
+    df["prediction3"] = predictor.predict(df.peptide.values)
+
+    df_binders = df.loc[df.binder]
+    df_nonbinders = df.loc[~df.binder]
+
+    print("***** Binders: *****")
+    print(df_binders.head(5))
+
+    print("***** Non-binders: *****")
+    print(df_nonbinders.head(5))
+
+    # Binders should always be given tighter predicted affinity than non-binders
+    assert_less(df_binders.prediction1.mean(), df_nonbinders.prediction1.mean())
+    assert_less(df_binders.prediction2.mean(), df_nonbinders.prediction2.mean())
+    assert_less(df_binders.prediction3.mean(), df_nonbinders.prediction3.mean())
+
+    # prediction2 binders should be tighter on average than prediction1
+    # binders, since prediction2 has a (<) inequality for binders.
+    # Non-binders should be about the same between prediction2 and prediction1.
+    assert_less(df_binders.prediction2.mean(), df_binders.prediction1.mean())
+    assert_almost_equal(
+        df_nonbinders.prediction2.mean(),
+        df_nonbinders.prediction1.mean(),
+        delta=3000)
+
+    # prediction3 non-binders should be weaker on average than prediction2 (or 1)
+    # non-binders, since prediction3 has a (>) inequality for these peptides.
+    # Binders should be about the same.
+    assert_greater(
+        df_nonbinders.prediction3.mean(),
+        df_nonbinders.prediction2.mean())
+    assert_greater(
+        df_nonbinders.prediction3.mean(),
+        df_nonbinders.prediction1.mean())
+    assert_almost_equal(
+        df_binders.prediction3.mean(),
+        df_binders.prediction1.mean(),
+        delta=3000)
+
diff --git a/test/test_custom_loss.py b/test/test_custom_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3644c66652136ca7c9f05299f1a20e6995fc8fd
--- /dev/null
+++ b/test/test_custom_loss.py
@@ -0,0 +1,72 @@
+from nose.tools import eq_, assert_less, assert_greater, assert_almost_equal
+
+import numpy
+
+numpy.random.seed(0)
+
+import logging
+logging.getLogger('tensorflow').disabled = True
+
+import keras.backend as K
+
+from mhcflurry.custom_loss import CUSTOM_LOSSES
+
+
+def evaluate_loss(loss, y_true, y_pred):
+    if K.backend() == "tensorflow":
+        session = K.get_session()
+        y_true_var = K.constant(y_true, name="y_true")
+        y_pred_var = K.constant(y_pred, name="y_pred")
+        result = loss(y_true_var, y_pred_var)
+        return result.eval(session=session)
+    elif K.backend() == "theano":
+        y_true_var = K.constant(y_true, name="y_true")
+        y_pred_var = K.constant(y_pred, name="y_pred")
+        result = loss(y_true_var, y_pred_var)
+        return result.eval()
+    else:
+        raise ValueError("Unsupported backend: %s" % K.backend())
+
+
+def test_mse_with_inequalities():
+
+    loss_obj = CUSTOM_LOSSES['mse_with_inequalities']
+
+    y_values = [0.0, 0.5, 0.8, 1.0]
+
+    adjusted_y = loss_obj.encode_y(y_values)
+    loss0 = evaluate_loss(loss_obj.loss, adjusted_y, y_values)
+    eq_(loss0, 0.0)
+
+    adjusted_y = loss_obj.encode_y(y_values, [">", ">", ">", ">"])
+    loss0 = evaluate_loss(loss_obj.loss, adjusted_y, y_values)
+    eq_(loss0, 0.0)
+
+    adjusted_y = loss_obj.encode_y(y_values, ["<", "<", "<", "<"])
+    loss0 = evaluate_loss(loss_obj.loss, adjusted_y, y_values)
+    eq_(loss0, 0.0)
+
+    adjusted_y = loss_obj.encode_y(y_values, ["=", "<", "=", ">"])
+    loss0 = evaluate_loss(loss_obj.loss, adjusted_y, y_values)
+    eq_(loss0, 0.0)
+
+    adjusted_y = loss_obj.encode_y(y_values, ["=", "<", "=", ">"])
+    loss0 = evaluate_loss(loss_obj.loss, adjusted_y, [0.0, 0.4, 0.8, 1.0])
+    eq_(loss0, 0.0)
+
+    adjusted_y = loss_obj.encode_y(y_values, [">", "<", ">", ">"])
+    loss0 = evaluate_loss(loss_obj.loss, adjusted_y, [0.1, 0.4, 0.9, 1.0])
+    eq_(loss0, 0.0)
+
+    adjusted_y = loss_obj.encode_y(y_values, [">", "<", ">", ">"])
+    loss0 = evaluate_loss(loss_obj.loss, adjusted_y, [0.1, 0.6, 0.9, 1.0])
+    assert_greater(loss0, 0.0)
+
+    adjusted_y = loss_obj.encode_y(y_values, ["=", "<", ">", ">"])
+    loss0 = evaluate_loss(loss_obj.loss, adjusted_y, [0.1, 0.6, 0.9, 1.0])
+    assert_almost_equal(loss0, 0.02)
+
+    adjusted_y = loss_obj.encode_y(y_values, ["=", "<", "=", ">"])
+    loss0 = evaluate_loss(loss_obj.loss, adjusted_y, [0.1, 0.6, 0.9, 1.0])
+    assert_almost_equal(loss0, 0.03)
+
diff --git a/test/test_train_allele_specific_models_command.py b/test/test_train_allele_specific_models_command.py
index affcde7569b0ae5766ae554f50a73122f3e3894f..12fcaa3d07614804ff004ee6110c5587163bb70d 100644
--- a/test/test_train_allele_specific_models_command.py
+++ b/test/test_train_allele_specific_models_command.py
@@ -57,12 +57,13 @@ def run_and_check(n_jobs=0):
         json.dump(HYPERPARAMETERS, fd)
 
     args = [
-        "--data", get_path("data_curated", "curated_training_data.csv.bz2"),
+        "--data", get_path("data_curated", "curated_training_data.no_mass_spec.csv.bz2"),
        "--hyperparameters", hyperparameters_filename,
        "--allele", "HLA-A*02:01", "HLA-A*01:01", "HLA-A*03:01",
        "--out-models-dir", models_dir,
        "--percent-rank-calibration-num-peptides-per-length", "10000",
-        "--parallelization-num-jobs", str(n_jobs),
+        "--num-jobs", str(n_jobs),
+        "--ignore-inequalities",
     ]
     print("Running with args: %s" % args)
     train_allele_specific_models_command.run(args)
@@ -82,6 +83,7 @@ def run_and_check(n_jobs=0):
 
     print("Deleting: %s" % models_dir)
     shutil.rmtree(models_dir)
 
+
 def Xtest_run_parallel():
     run_and_check(n_jobs=3)
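
Note on the loss behavior exercised by the new test/test_custom_loss.py: the snippet below is a rough NumPy sketch of how the "=", "<", and ">" measurement inequalities translate into per-sample penalties. It is not the mhcflurry implementation, which lives in mhcflurry.custom_loss and operates on Keras tensors, packing the inequalities into y_true via encode_y so the standard loss(y_true, y_pred) signature still applies. The helper name, the separate inequalities argument, and the sum reduction at the end are assumptions made for illustration; only the per-sample behavior mirrors what the tests assert.

# Illustrative sketch only -- not the mhcflurry implementation.
import numpy

def inequality_squared_errors(y_true, y_pred, inequalities):
    """Per-sample squared errors under "=", "<", and ">" inequalities.

    "=" : ordinary squared error.
    "<" : only predictions above y_true are penalized.
    ">" : only predictions below y_true are penalized.
    """
    y_true = numpy.asarray(y_true, dtype=float)
    y_pred = numpy.asarray(y_pred, dtype=float)
    errors = numpy.zeros(len(y_true))
    for i, op in enumerate(inequalities):
        diff = y_pred[i] - y_true[i]
        if op == "=":
            errors[i] = diff ** 2
        elif op == "<":
            errors[i] = max(diff, 0.0) ** 2
        elif op == ">":
            errors[i] = min(diff, 0.0) ** 2
        else:
            raise ValueError("Unknown inequality: %s" % op)
    return errors

# Mirrors one case from test_mse_with_inequalities above: only the first
# two samples violate their constraints, each by 0.1, giving a total of 0.02.
print(inequality_squared_errors(
    [0.0, 0.5, 0.8, 1.0],
    [0.1, 0.6, 0.9, 1.0],
    ["=", "<", ">", ">"]).sum())  # 0.02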