Commit bbcfc2ba authored by Tim O'Donnell, committed by GitHub

Merge pull request #114 from hammerlab/2017-11-13

Implement BLOSUM62 encoding
parents 1c3da71b ed9372c2
@@ -22,7 +22,7 @@ If you find MHCflurry useful in your research please cite:
> O'Donnell, T. et al., 2017. MHCflurry: open-source class I MHC binding affinity prediction. bioRxiv. Available at: http://www.biorxiv.org/content/early/2017/08/09/174243.
## Setup
## Setup (pip)
Install the package:
@@ -56,6 +56,32 @@ export KERAS_BACKEND=theano
You may also need to `pip install theano`.
## Setup (conda)
You can alternatively get up and running with a [conda](https://conda.io/docs/)
environment as follows:
```
conda create -q -n mhcflurry-env python=3.6 'tensorflow>=1.1.0'
source activate mhcflurry-env
```
Then continue as above:
```
pip install mhcflurry
mhcflurry-downloads fetch
```
If you wish to test your installation, you can install `nose` and run the tests
from a checkout:
```
pip install nose
nosetests .
```
## Making predictions from the command line
@@ -103,7 +129,7 @@ notebook for an overview of the Python API, including fitting your own predictor
An ensemble of eight single-allele models was trained for each allele with at least
100 measurements in the training set (118 alleles). The models were trained on a
random 80% sample of the data for the allele and the remaining 20% was used for
early stopping. All models use the same [architecture](downloads-generation/models_class1/hyperparameters.json). The
early stopping. All models use the same [architecture](downloads-generation/models_class1/hyperparameters.yaml). The
predictions are taken to be the geometric mean of the nM binding affinity
predictions of the individual models. The training script is [here](downloads-generation/models_class1/GENERATE.sh).
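For intuition, here is a minimal sketch of that geometric-mean ensembling (the `models` list and its `predict` method are illustrative stand-ins, not the library's internal API):

```python
import numpy

def ensemble_affinity(models, peptides):
    # Each model predicts nM binding affinities; stack to (n_models, n_peptides).
    predictions = numpy.array([model.predict(peptides) for model in models])
    # Geometric mean across the ensemble, as described above.
    return numpy.exp(numpy.log(predictions).mean(axis=0))
```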
......
@@ -25,11 +25,11 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
mkdir models
cp $SCRIPT_DIR/hyperparameters.json .
cp $SCRIPT_DIR/hyperparameters.yaml .
time mhcflurry-class1-train-allele-specific-models \
--data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \
--hyperparameters hyperparameters.json \
--hyperparameters hyperparameters.yaml \
--out-models-dir models \
--min-measurements-per-allele 200
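# Note: alleles with fewer than 200 curated measurements are skipped here;
# hyperparameters.yaml (shown below) configures the per-allele models.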
......
[
{
"n_models": 12,
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"use_embedding": false,
"kmer_size": 15,
"batch_normalization": false,
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
},
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
32
],
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
"dense_layer_l1_regularization": 0.001,
"dropout_probability": 0.0
}
]
[{
##########################################
# ENSEMBLE SIZE
##########################################
"n_models": 12,
##########################################
# OPTIMIZATION
##########################################
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
##########################################
# RANDOM NEGATIVE PEPTIDES
##########################################
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
##########################################
# PEPTIDE REPRESENTATION
##########################################
# One of "one-hot", "embedding", or "BLOSUM62".
"peptide_amino_acid_encoding": "BLOSUM62",
"use_embedding": false, # maintained for backward compatability
"kmer_size": 15,
##########################################
# NEURAL NETWORK ARCHITECTURE
##########################################
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
},
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
32
],
"dense_layer_l1_regularization": 0.001,
"batch_normalization": false,
"dropout_probability": 0.0,
}]
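As a sketch, this file can be read back with PyYAML (assuming it is in the working directory; `yaml.safe_load` suffices because the file contains only plain data):

```python
import yaml

with open("hyperparameters.yaml") as fd:
    hyperparameters_list = yaml.safe_load(fd)

# The file holds a list of hyperparameter dicts, one per architecture.
assert isinstance(hyperparameters_list, list)
print(hyperparameters_list[0]["peptide_amino_acid_encoding"])  # "BLOSUM62"
```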
@@ -17,7 +17,7 @@ from .class1_affinity_prediction.class1_neural_network import (
from .class1_affinity_prediction.class1_affinity_predictor import (
Class1AffinityPredictor)
__version__ = "0.9.2"
__version__ = "0.9.3"
__all__ = [
"Class1NeuralNetwork",
......
@@ -9,14 +9,17 @@ import keras.models
import keras.layers.pooling
import keras.regularizers
from keras.layers import Input
import keras.layers.merge
import keras.layers
from keras.layers.core import Dense, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from mhcflurry.hyperparameters import HyperparameterDefaults
from ..encodable_sequences import EncodableSequences
from ..encodable_sequences import (
EncodableSequences,
available_vector_encodings,
vector_encoding_length)
from ..regression_target import to_ic50, from_ic50
from ..common import random_peptides, amino_acid_distribution
@@ -35,6 +38,7 @@ class Class1NeuralNetwork(object):
network_hyperparameter_defaults = HyperparameterDefaults(
kmer_size=15,
use_embedding=False,
peptide_amino_acid_encoding="one-hot",
embedding_input_dim=21,
embedding_output_dim=8,
pseudosequence_use_embedding=False,
@@ -256,20 +260,26 @@ class Class1NeuralNetwork(object):
numpy.array
"""
encoder = EncodableSequences.create(peptides)
if self.hyperparameters['use_embedding']:
if (self.hyperparameters['use_embedding'] or
self.hyperparameters['peptide_amino_acid_encoding'] == "embedding"):
encoded = encoder.variable_length_to_fixed_length_categorical(
max_length=self.hyperparameters['kmer_size'],
**self.input_encoding_hyperparameter_defaults.subselect(
self.hyperparameters))
else:
encoded = encoder.variable_length_to_fixed_length_one_hot(
elif (
self.hyperparameters['peptide_amino_acid_encoding'] in
available_vector_encodings()):
encoded = encoder.variable_length_to_fixed_length_vector_encoding(
self.hyperparameters['peptide_amino_acid_encoding'],
max_length=self.hyperparameters['kmer_size'],
**self.input_encoding_hyperparameter_defaults.subselect(
self.hyperparameters))
else:
raise ValueError("Unsupported peptide_amino_acid_encoding: %s" %
self.hyperparameters['peptide_amino_acid_encoding'])
assert len(encoded) == len(peptides)
return encoded
@property
def supported_peptide_lengths(self):
"""
@@ -302,7 +312,8 @@ class Class1NeuralNetwork(object):
if self.hyperparameters['pseudosequence_use_embedding']:
encoded = encoder.fixed_length_categorical()
else:
encoded = encoder.fixed_length_one_hot()
raise NotImplementedError
# encoded = encoder.fixed_length_one_hot()
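# Note: the fixed-length one-hot helper was removed from EncodableSequences
# in this change (see encodable_sequences.py below), so only the embedding
# path currently works for pseudosequences.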
assert len(encoded) == len(pseudosequences)
return encoded
@@ -492,7 +503,8 @@ class Class1NeuralNetwork(object):
pseudosequences_input = self.pseudosequence_to_network_input(
allele_pseudosequences)
x_dict['pseudosequence'] = pseudosequences_input
(predictions,) = numpy.array(self.network(borrow=True).predict(x_dict)).T
(predictions,) = numpy.array(
self.network(borrow=True).predict(x_dict), dtype="float64").T
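# to_ic50 maps the network's [0, 1] sigmoid output back to nanomolar
# units; with the default max IC50 of 50000 nM this is
# to_ic50(x) = 50000 ** (1 - x), the inverse of the regression target
# transform from_ic50.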
return to_ic50(predictions)
def compile(self):
@@ -507,6 +519,7 @@ class Class1NeuralNetwork(object):
def make_network(
pseudosequence_length,
kmer_size,
peptide_amino_acid_encoding,
use_embedding,
embedding_input_dim,
embedding_output_dim,
@@ -524,7 +537,7 @@ class Class1NeuralNetwork(object):
"""
Helper function to make a keras network for class1 affinity prediction.
"""
if use_embedding:
if use_embedding or peptide_amino_acid_encoding == "embedding":
peptide_input = Input(
shape=(kmer_size,), dtype='int32', name='peptide')
current_layer = Embedding(
@@ -535,7 +548,11 @@ class Class1NeuralNetwork(object):
name="peptide_embedding")(peptide_input)
else:
peptide_input = Input(
shape=(kmer_size, 21), dtype='float32', name='peptide')
shape=(
kmer_size,
vector_encoding_length(peptide_amino_acid_encoding)),
dtype='float32',
name='peptide')
current_layer = peptide_input
inputs = [peptide_input]
@@ -609,4 +626,7 @@ class Class1NeuralNetwork(object):
inputs=inputs,
outputs=[output],
name="predictor")
print("*** ARCHITECTURE ***")
model.summary()
return model
@@ -5,7 +5,7 @@ Train Class1 single allele models.
import os
import sys
import argparse
import json
import yaml
import pandas
@@ -31,7 +31,7 @@ parser.add_argument(
"--hyperparameters",
metavar="FILE.json",
required=True,
help="JSON of hyperparameters")
help="JSON or YAML of hyperparameters")
parser.add_argument(
"--allele",
default=None,
@@ -61,7 +61,7 @@ def run(argv=sys.argv[1:]):
configure_logging(verbose=args.verbosity > 1)
hyperparameters_lst = json.load(open(args.hyperparameters))
hyperparameters_lst = yaml.load(open(args.hyperparameters))
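# Note: later PyYAML releases deprecate yaml.load without an explicit
# Loader; yaml.safe_load is a drop-in replacement here since the
# hyperparameters file contains only plain data.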
assert isinstance(hyperparameters_lst, list)
print("Loaded hyperparameters list: %s" % str(hyperparameters_lst))
......
@@ -22,11 +22,75 @@ import math
import pandas
import numpy
from six import StringIO
import typechecks
from . import amino_acid
AMINO_ACIDS = list(amino_acid.COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys())
BLOSUM62_MATRIX = pandas.read_table(StringIO("""
A R N D C Q E G H I L K M F P S T W Y V X
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0
X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS]
assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all()
ENCODING_DFS = {
"BLOSUM62": BLOSUM62_MATRIX,
"one-hot": pandas.DataFrame([
[1 if i == j else 0 for i in range(len(AMINO_ACIDS))]
for j in range(len(AMINO_ACIDS))
], index=AMINO_ACIDS, columns=AMINO_ACIDS)
}
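# Each encoding is a DataFrame indexed by amino acid: BLOSUM62_MATRIX.loc["A"]
# is the 21-dimensional vector representing alanine, and the "one-hot" frame
# is the identity matrix over the same 21-letter alphabet (20 amino acids
# plus "X" for unknown).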
def available_vector_encodings():
"""
Return list of supported amino acid vector encodings.
Returns
-------
list of string
"""
return list(ENCODING_DFS)
def vector_encoding_length(name):
"""
Return the length of the given vector encoding.
Parameters
----------
name : string
Returns
-------
int
"""
return ENCODING_DFS[name].shape[1]
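# Usage sketch:
#   available_vector_encodings()        # ["BLOSUM62", "one-hot"] (order may vary)
#   vector_encoding_length("BLOSUM62")  # 21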
def index_encoding(sequences, letter_to_index_dict):
"""
@@ -47,38 +111,23 @@ def index_encoding(sequences, letter_to_index_dict):
return result.values
def one_hot_encoding(index_encoded, alphabet_size):
def fixed_vectors_encoding(sequences, letter_to_vector_function):
"""
Given an n * k array of integers in the range [0, alphabet_size), return
an n * k * alphabet_size array where element (i, k, j) is 1 if element
(i, k) == j in the input array and zero otherwise.
Given a sequence of n strings all of length k, return an n * k * m array where
the (i, j)th element is letter_to_vector_function(sequences[i][j]).
Parameters
----------
index_encoded : numpy.array of integers with shape (n, k)
alphabet_size : int
sequences : list of length n of strings of length k
letter_to_vector_function : function : string -> vector of length m
Returns
-------
numpy.array of integers of shape (n, k, alphabet_size)
numpy.array of integers with shape (n, k, m)
"""
alphabet_size = int(alphabet_size)
(num_sequences, sequence_length) = index_encoded.shape
result = numpy.zeros(
(num_sequences, sequence_length, alphabet_size),
dtype='int32')
# Transform the index encoded array into an array of indices into the
# flattened result, which we will set to 1.
flattened_indices = (
index_encoded +
(
sequence_length * alphabet_size * numpy.arange(num_sequences)
).reshape((-1, 1)) +
numpy.tile(numpy.arange(sequence_length),
(num_sequences, 1)) * alphabet_size)
result.put(flattened_indices, 1)
arr = numpy.array([list(s) for s in sequences])
result = numpy.vectorize(
letter_to_vector_function, signature='()->(n)')(arr)
return result
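# Example (sketch): encoding two 3-mers with BLOSUM62 rows yields an array
# of shape (2, 3, 21):
#   fixed_vectors_encoding(["AAA", "CCC"], BLOSUM62_MATRIX.loc.__getitem__)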
@@ -114,40 +163,6 @@ class EncodableSequences(object):
def __len__(self):
return len(self.sequences)
def fixed_length_categorical(self):
"""
Returns a categorical encoding (i.e. integers 0 <= x < 21) of the
sequences, which must already be all the same length.
Returns
-------
numpy.array of integers
"""
cache_key = ("categorical",)
if cache_key not in self.encoding_cache:
assert self.fixed_sequence_length
self.encoding_cache[cache_key] = index_encoding(
self.sequences, amino_acid.AMINO_ACID_INDEX)
return self.encoding_cache[cache_key]
def fixed_length_one_hot(self):
"""
Returns a binary one-hot encoding of the sequences, which must already
be all the same length.
Returns
-------
numpy.array of integers
"""
cache_key = ("one_hot",)
if cache_key not in self.encoding_cache:
assert self.fixed_sequence_length
encoded = self.categorical_encoding()
result = one_hot_encoding(
encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX))
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
def variable_length_to_fixed_length_categorical(
self, left_edge=4, right_edge=4, max_length=15):
"""
@@ -187,8 +202,8 @@ fixed_length_sequences, amino_acid.AMINO_ACID_INDEX)
fixed_length_sequences, amino_acid.AMINO_ACID_INDEX)
return self.encoding_cache[cache_key]
def variable_length_to_fixed_length_one_hot(
self, left_edge=4, right_edge=4, max_length=15):
def variable_length_to_fixed_length_vector_encoding(
self, vector_encoding_name, left_edge=4, right_edge=4, max_length=15):
"""
Encode variable-length sequences using a fixed-length encoding designed
for preserving the anchor positions of class I peptides.
@@ -198,35 +213,43 @@ Parameters
Parameters
----------
vector_encoding_name : string
How to represent amino acids.
One of "BLOSUM62", "one-hot", etc. Full list of supported vector
encodings is given by available_vector_encodings().
left_edge : int, size of fixed-position left side
right_edge : int, size of the fixed-position right side
max_length : sequence length of the resulting encoding
Returns
-------
binary numpy.array with shape (num sequences, max_length, 21)
"""
numpy.array with shape (num sequences, max_length, m) where m is
vector_encoding_length(vector_encoding_name)
"""
cache_key = (
"fixed_length_one_hot",
"fixed_length_vector_encoding",
vector_encoding_name,
left_edge,
right_edge,
max_length)
if cache_key not in self.encoding_cache:
encoded = self.variable_length_to_fixed_length_categorical(
left_edge=left_edge,
right_edge=right_edge,
max_length=max_length)
result = one_hot_encoding(
encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX))
assert result.shape == (
len(self.sequences),
encoded.shape[1],
len(amino_acid.AMINO_ACID_INDEX))
fixed_length_sequences = [
self.sequence_to_fixed_length_string(
sequence,
left_edge=left_edge,
right_edge=right_edge,
max_length=max_length)
for sequence in self.sequences
]
result = fixed_vectors_encoding(
fixed_length_sequences,
ENCODING_DFS[vector_encoding_name].loc.__getitem__)
assert result.shape[0] == len(self.sequences)
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
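# The fixed-length string keeps the first `left_edge` and last `right_edge`
# residues in place (the class I anchor positions) and pads the middle with
# the unknown character "X" out to max_length before vector encoding; see
# sequence_to_fixed_length_string below.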
@classmethod
def sequence_to_fixed_length_string(
klass, sequence, left_edge=4, right_edge=4, max_length=15):
......
@@ -30,14 +30,14 @@ try:
with open(readme_filename, 'r') as f:
readme = f.read()
except:
logging.warn("Failed to load %s" % readme_filename)
logging.warning("Failed to load %s" % readme_filename)
readme = ""
try:
import pypandoc
readme = pypandoc.convert(readme, to='rst', format='md')
except:
logging.warn("Conversion of long_description from MD to RST failed")
logging.warning("Conversion of long_description from MD to RST failed")
pass
with open('mhcflurry/__init__.py', 'r') as f:
@@ -51,12 +51,13 @@ if __name__ == '__main__':
'six',
'numpy>=1.11',
'pandas>=0.13.1',
'Keras==2.0.8',
'Keras==2.0.9',
'appdirs',
'tensorflow',
'tensorflow>=1.1.0',
'scikit-learn',
'typechecks',
'mhcnames',
'pyyaml',
]
if PY2:
# concurrent.futures is a standard library in Py3 but Py2
......
from mhcflurry import encodable_sequences
from nose.tools import eq_
from numpy.testing import assert_equal
import numpy
letter_to_index_dict = {
'A': 0,
@@ -18,7 +19,13 @@ def test_index_and_one_hot_encoding():
[0, 0, 0, 0],
[0, 1, 2, 0],
])
one_hot = encodable_sequences.one_hot_encoding(index_encoding, 3)
one_hot = encodable_sequences.fixed_vectors_encoding(
index_encoding,
{
0: numpy.array([1, 0, 0]),
1: numpy.array([0, 1, 0]),
2: numpy.array([0, 0, 1]),
}.get)
eq_(one_hot.shape, (2, 4, 3))
assert_equal(
one_hot[0],
......
@@ -53,7 +53,7 @@ def test_run():
try:
models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
hyperparameters_filename = os.path.join(
models_dir, "hyperparameters.json")
models_dir, "hyperparameters.yaml")
with open(hyperparameters_filename, "w") as fd:
json.dump(HYPERPARAMETERS, fd)
......