Commit c335946f authored by Patrick Skillman-Lawrence
Deleted mhcflurry/__init__.py, mhcflurry/allele_encoding.py, mhcflurry/amino_acid.py, mhcflurry/calibrate_percentile_ranks_command.py, mhcflurry/class1_affinity_predictor.py, mhcflurry/class1_neural_network.py, mhcflurry/class1_presentation_predictor.py, mhcflurry/class1_processing_neural_network.py, mhcflurry/class1_processing_predictor.py, mhcflurry/cluster_parallelism.py, mhcflurry/common.py, mhcflurry/custom_loss.py, mhcflurry/data_dependent_weights_initialization.py, mhcflurry/downloads.py, mhcflurry/downloads.yml, mhcflurry/downloads_command.py, mhcflurry/encodable_sequences.py, mhcflurry/ensemble_centrality.py, mhcflurry/fasta.py, mhcflurry/flanking_encoding.py, mhcflurry/hyperparameters.py, mhcflurry/local_parallelism.py, mhcflurry/percent_rank_transform.py, mhcflurry/predict_command.py, mhcflurry/predict_scan_command.py, mhcflurry/random_negative_peptides.py, mhcflurry/regression_target.py, mhcflurry/scoring.py, mhcflurry/select_allele_specific_models_command.py, mhcflurry/select_pan_allele_models_command.py, mhcflurry/select_processing_models_command.py, mhcflurry/testing_utils.py, mhcflurry/train_allele_specific_models_command.py, mhcflurry/train_pan_allele_models_command.py, mhcflurry/train_presentation_models_command.py, mhcflurry/train_processing_models_command.py, mhcflurry/version.py, train_APmodels.sh files
parent a921e8eb
Showing with 0 additions and 9292 deletions
"""
Class I MHC ligand prediction package
"""
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_neural_network import Class1NeuralNetwork
from .class1_processing_predictor import Class1ProcessingPredictor
from .class1_processing_neural_network import Class1ProcessingNeuralNetwork
from .class1_presentation_predictor import Class1PresentationPredictor
from .version import __version__
__all__ = [
"__version__",
"Class1AffinityPredictor",
"Class1NeuralNetwork",
"Class1ProcessingPredictor",
"Class1ProcessingNeuralNetwork",
"Class1PresentationPredictor",
]
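# Illustrative usage sketch (not part of the original file): the typical
# top-level workflow with these exports, assuming trained models have already
# been fetched (e.g. with the mhcflurry-downloads command).
#
#     from mhcflurry import Class1AffinityPredictor
#     predictor = Class1AffinityPredictor.load()
#     print(predictor.predict(peptides=["SIINFEKL"], allele="HLA-A*02:01"))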
mhcflurry/allele_encoding.py
import pandas
from . import amino_acid
class AlleleEncoding(object):
def __init__(self, alleles=None, allele_to_sequence=None, borrow_from=None):
"""
A place to cache encodings for a sequence of alleles.
We frequently work with alleles by integer indices, for example as
inputs to neural networks. This class is used to map allele names to
integer indices in a consistent way by keeping track of the universe
of alleles in use, i.e. a distinction is made between the universe
of supported alleles (what's in `allele_to_sequence`) and the actual
set of alleles used for some task (what's in `alleles`).
Parameters
----------
alleles : list of string
Allele names. If any allele is None instead of a string, it is mapped
to a reserved index (0), whose associated sequence is entirely padding
('X') characters.
allele_to_sequence : dict of str -> str
Allele name to amino acid sequence
borrow_from : AlleleEncoding, optional
If specified, do not specify allele_to_sequence. The sequences from
the provided instance are used. This guarantees that the mappings
from allele to index and from allele to sequence are the same
between the instances.
"""
if alleles is not None:
alleles = pandas.Series(alleles)
self.borrow_from = borrow_from
self.allele_to_sequence = allele_to_sequence
if self.borrow_from is None:
assert allele_to_sequence is not None
all_alleles = (
sorted(allele_to_sequence))
self.allele_to_index = dict(
(allele, i)
for (i, allele) in enumerate([None] + all_alleles))
unpadded = pandas.Series([
allele_to_sequence[a] if a is not None else ""
for a in [None] + all_alleles
],
index=[None] + all_alleles)
self.sequences = unpadded.str.pad(
unpadded.str.len().max(), fillchar="X")
else:
assert allele_to_sequence is None
self.allele_to_index = borrow_from.allele_to_index
self.sequences = borrow_from.sequences
self.allele_to_sequence = borrow_from.allele_to_sequence
if alleles is not None:
assert all(
allele in self.allele_to_index for allele in alleles),\
"Missing alleles: " + " ".join(set(
a for a in alleles if a not in self.allele_to_index))
self.indices = alleles.map(self.allele_to_index)
assert not self.indices.isnull().any()
self.alleles = alleles
else:
self.indices = None
self.alleles = None
self.encoding_cache = {}
def compact(self):
"""
Return a new AlleleEncoding in which the universe of supported alleles
is only the alleles actually used.
Returns
-------
AlleleEncoding
"""
return AlleleEncoding(
alleles=self.alleles,
allele_to_sequence=dict(
(allele, self.allele_to_sequence[allele])
for allele in self.alleles.unique()
if allele is not None))
def allele_representations(self, encoding_name):
"""
Encode the universe of supported allele sequences to a matrix.
Parameters
----------
encoding_name : string
How to represent amino acids. Valid names are "BLOSUM62" or
"one-hot". See `amino_acid.ENCODING_DATA_FRAMES`.
Returns
-------
numpy.array of shape
(num alleles in universe, sequence length, vector size)
where vector size is usually 21 (20 amino acids + X character)
"""
if self.borrow_from is not None:
return self.borrow_from.allele_representations(encoding_name)
cache_key = (
"allele_representations",
encoding_name)
if cache_key not in self.encoding_cache:
index_encoded_matrix = amino_acid.index_encoding(
self.sequences.values,
amino_acid.AMINO_ACID_INDEX)
vector_encoded = amino_acid.fixed_vectors_encoding(
index_encoded_matrix,
amino_acid.ENCODING_DATA_FRAMES[encoding_name])
self.encoding_cache[cache_key] = vector_encoded
return self.encoding_cache[cache_key]
def fixed_length_vector_encoded_sequences(self, encoding_name):
"""
Encode allele sequences (not the universe of alleles) to a matrix.
Parameters
----------
encoding_name : string
How to represent amino acids. Valid names are "BLOSUM62" or
"one-hot". See `amino_acid.ENCODING_DATA_FRAMES`.
Returns
-------
numpy.array with shape:
(num alleles, sequence length, vector size)
where vector size is usually 21 (20 amino acids + X character)
"""
cache_key = (
"fixed_length_vector_encoding",
encoding_name)
if cache_key not in self.encoding_cache:
vector_encoded = self.allele_representations(encoding_name)
result = vector_encoded[self.indices]
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
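# Illustrative sketch (not part of the original module): encode three requested
# alleles against a toy two-allele universe of pseudosequences.
def _demo_allele_encoding():
    toy_universe = {"HLA-A*01:01": "AAQT", "HLA-A*02:01": "AVQT"}
    encoding = AlleleEncoding(
        alleles=["HLA-A*02:01", "HLA-A*01:01", "HLA-A*02:01"],
        allele_to_sequence=toy_universe)
    print(encoding.indices.tolist())  # consistent integer index per allele
    print(encoding.fixed_length_vector_encoded_sequences("BLOSUM62").shape)
    # -> (3, 4, 21): 3 requested alleles, sequence length 4, 21-dim vectors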
"""
Functions for encoding fixed length sequences of amino acids into various
vector representations, such as one-hot and BLOSUM62.
"""
from __future__ import (
print_function,
division,
absolute_import,
)
import collections
from copy import copy
import pandas
from six import StringIO
COMMON_AMINO_ACIDS = collections.OrderedDict(sorted({
"A": "Alanine",
"R": "Arginine",
"N": "Asparagine",
"D": "Aspartic Acid",
"C": "Cysteine",
"E": "Glutamic Acid",
"Q": "Glutamine",
"G": "Glycine",
"H": "Histidine",
"I": "Isoleucine",
"L": "Leucine",
"K": "Lysine",
"M": "Methionine",
"F": "Phenylalanine",
"P": "Proline",
"S": "Serine",
"T": "Threonine",
"W": "Tryptophan",
"Y": "Tyrosine",
"V": "Valine",
}.items()))
COMMON_AMINO_ACIDS_WITH_UNKNOWN = copy(COMMON_AMINO_ACIDS)
COMMON_AMINO_ACIDS_WITH_UNKNOWN["X"] = "Unknown"
AMINO_ACID_INDEX = dict(
(letter, i) for (i, letter) in enumerate(COMMON_AMINO_ACIDS_WITH_UNKNOWN))
for (letter, i) in list(AMINO_ACID_INDEX.items()):
AMINO_ACID_INDEX[letter.lower()] = i # Support lower-case as well.
AMINO_ACIDS = list(COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys())
BLOSUM62_MATRIX = pandas.read_csv(StringIO("""
A R N D C Q E G H I L K M F P S T W Y V X
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0
X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS].astype("int8")
assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all()
ENCODING_DATA_FRAMES = {
"BLOSUM62": BLOSUM62_MATRIX,
"one-hot": pandas.DataFrame([
[1 if i == j else 0 for i in range(len(AMINO_ACIDS))]
for j in range(len(AMINO_ACIDS))
], index=AMINO_ACIDS, columns=AMINO_ACIDS)
}
def available_vector_encodings():
"""
Return list of supported amino acid vector encodings.
Returns
-------
list of string
"""
return list(ENCODING_DATA_FRAMES)
def vector_encoding_length(name):
"""
Return the length of the given vector encoding.
Parameters
----------
name : string
Returns
-------
int
"""
return ENCODING_DATA_FRAMES[name].shape[1]
def index_encoding(sequences, letter_to_index_dict):
"""
Encode a sequence of same-length strings to a matrix of integers of the
same shape. The map from characters to integers is given by
`letter_to_index_dict`.
Given a sequence of `n` strings all of length `k`, return an `n` x `k` array where
the (`i`, `j`)th element is `letter_to_index_dict[sequences[i][j]]`.
Parameters
----------
sequences : list of length n of strings of length k
letter_to_index_dict : dict : string -> int
Returns
-------
numpy.array of integers with shape (`n`, `k`)
"""
df = pandas.DataFrame(iter(s) for s in sequences)
result = df.replace(letter_to_index_dict)
return result.values
def fixed_vectors_encoding(index_encoded_sequences, letter_to_vector_df):
"""
Given an `n` x `k` matrix of integers such as that returned by `index_encoding()` and
a dataframe mapping each index to an arbitrary vector, return an `n` x `k` x `m`
array where the (`i`, `j`)th entry is the vector `letter_to_vector_df.iloc[index_encoded_sequences[i][j]]`.
The dataframe index and column names are ignored here; the indexing is done
entirely by integer position in the dataframe.
Parameters
----------
index_encoded_sequences : `n` x `k` array of integers
letter_to_vector_df : pandas.DataFrame of shape (`alphabet size`, `m`)
Returns
-------
numpy.array of integers with shape (`n`, `k`, `m`)
"""
(num_sequences, sequence_length) = index_encoded_sequences.shape
target_shape = (
num_sequences, sequence_length, letter_to_vector_df.shape[0])
result = letter_to_vector_df.iloc[
index_encoded_sequences.reshape((-1,)) # reshape() avoids copy
].values.reshape(target_shape)
return result
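# Illustrative sketch (not part of the original module): chain the two helpers
# above to turn peptides into BLOSUM62 feature arrays.
def _demo_peptide_encoding():
    peptides = ["SIINFEKLM", "KLGGALQAK"]
    indices = index_encoding(peptides, AMINO_ACID_INDEX)
    print(indices.shape)  # (2, 9): one row per peptide, one column per position
    vectors = fixed_vectors_encoding(indices, ENCODING_DATA_FRAMES["BLOSUM62"])
    print(vectors.shape)  # (2, 9, 21): a 21-dimensional vector per residue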
"""
Calibrate percentile ranks for models. Runs in-place.
"""
import argparse
import os
import signal
import sys
import time
import traceback
import collections
from functools import partial
import pandas
import numpy
from mhcnames import normalize_allele_name
import tqdm # progress bar
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_presentation_predictor import Class1PresentationPredictor
from .encodable_sequences import EncodableSequences
from .common import configure_logging, random_peptides, amino_acid_distribution
from .local_parallelism import (
add_local_parallelism_args,
worker_pool_with_gpu_assignments_from_args,
call_wrapped_kwargs)
from .cluster_parallelism import (
add_cluster_parallelism_args,
cluster_results_from_args)
# To avoid pickling large matrices to send to child processes when running in
# parallel, we use this global variable as a place to store data. Data stored
# here before the worker pool is created is inherited by the child processes
# on fork(), letting us share large data with the workers via copy-on-write
# shared memory.
GLOBAL_DATA = {}
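# Illustrative sketch (not part of the original module) of the copy-on-write
# pattern described above: values stored in GLOBAL_DATA before the worker pool
# is created are readable in forked child processes without being pickled.
# Assumes the "fork" start method is available (as on Linux).
def _shared_size(key):
    return len(GLOBAL_DATA[key])

def _demo_fork_sharing():
    import multiprocessing
    GLOBAL_DATA["calibration_peptides"] = ["SIINFEKL"] * 100000
    with multiprocessing.get_context("fork").Pool(2) as pool:
        print(pool.map(_shared_size, ["calibration_peptides"]))  # [100000]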
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"--predictor-kind",
choices=("class1_affinity", "class1_presentation"),
default="class1_affinity",
help="Type of predictor to calibrate")
parser.add_argument(
"--models-dir",
metavar="DIR",
required=True,
help="Directory to read and write models")
parser.add_argument(
"--allele",
default=None,
nargs="+",
help="Alleles to calibrate percentile ranks for. If not specified all "
"alleles are used")
parser.add_argument(
"--match-amino-acid-distribution-data",
help="Sample random peptides from the amino acid distribution of the "
"peptides listed in the supplied CSV file, which must have a 'peptide' "
"column. If not specified a uniform distribution is used.")
parser.add_argument(
"--alleles-file",
default=None,
help="Use alleles in supplied CSV file, which must have an 'allele' column.")
parser.add_argument(
"--num-peptides-per-length",
type=int,
metavar="N",
default=int(1e5),
help="Number of peptides per length to use to calibrate percent ranks. "
"Default: %(default)s.")
parser.add_argument(
"--num-genotypes",
type=int,
metavar="N",
default=25,
help="Used when calibrrating a presentation predictor. Number of genotypes"
"to sample")
parser.add_argument(
"--alleles-per-genotype",
type=int,
metavar="N",
default=6,
help="Used when calibrating a presentation predictor. Number of alleles "
"per genotype. Use 1 to calibrate for single alleles. Default: %(default)s")
parser.add_argument(
"--motif-summary",
default=False,
action="store_true",
help="Calculate motifs and length preferences for each allele")
parser.add_argument(
"--summary-top-peptide-fraction",
default=[0.0001, 0.001, 0.01, 0.1, 1.0],
nargs="+",
type=float,
metavar="X",
help="The top X fraction of predictions (i.e. tightest binders) to use to "
"generate motifs and length preferences. Default: %(default)s")
parser.add_argument(
"--length-range",
default=(8, 15),
type=int,
nargs=2,
help="Min and max peptide length to calibrate, inclusive. "
"Default: %(default)s")
parser.add_argument(
"--prediction-batch-size",
type=int,
default=4096,
help="Keras batch size for predictions")
parser.add_argument(
"--alleles-per-work-chunk",
type=int,
metavar="N",
default=1,
help="Number of alleles per work chunk. Default: %(default)s.")
parser.add_argument(
"--verbosity",
type=int,
help="Keras verbosity. Default: %(default)s",
default=0)
add_local_parallelism_args(parser)
add_cluster_parallelism_args(parser)
def run(argv=sys.argv[1:]):
global GLOBAL_DATA
# On sigusr1 print stack trace
print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid())
signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack())
args = parser.parse_args(argv)
args.models_dir = os.path.abspath(args.models_dir)
configure_logging(verbose=args.verbosity > 1)
aa_distribution = None
if args.match_amino_acid_distribution_data:
distribution_peptides = pandas.read_csv(
args.match_amino_acid_distribution_data).peptide.unique()
aa_distribution = amino_acid_distribution(distribution_peptides)
print("Using amino acid distribution:")
print(aa_distribution)
start = time.time()
peptides = []
lengths = range(args.length_range[0], args.length_range[1] + 1)
for length in lengths:
peptides.extend(
random_peptides(
args.num_peptides_per_length,
length,
distribution=aa_distribution))
print("Done generating peptides in %0.2f sec." % (time.time() - start))
if args.predictor_kind == "class1_affinity":
return run_class1_affinity_predictor(args, peptides)
elif args.predictor_kind == "class1_presentation":
return run_class1_presentation_predictor(args, peptides)
else:
raise ValueError("Unsupported kind %s" % args.predictor_kind)
def run_class1_presentation_predictor(args, peptides):
# This will trigger a Keras import, which breaks local parallelism.
predictor = Class1PresentationPredictor.load(args.models_dir)
if args.allele:
alleles = [normalize_allele_name(a) for a in args.allele]
elif args.alleles_file:
alleles = pandas.read_csv(args.alleles_file).allele.unique()
else:
alleles = predictor.supported_alleles
print("Num alleles", len(alleles))
genotypes = {}
if args.alleles_per_genotype == 6:
gene_to_alleles = collections.defaultdict(list)
for a in alleles:
for gene in ["A", "B", "C"]:
if a.startswith("HLA-%s" % gene):
gene_to_alleles[gene].append(a)
for _ in range(args.num_genotypes):
genotype = []
for gene in ["A", "A", "B", "B", "C", "C"]:
genotype.append(numpy.random.choice(gene_to_alleles[gene]))
genotypes[",".join(genotype)] = genotype
elif args.alleles_per_genotype == 1:
for _ in range(args.num_genotypes):
genotype = [numpy.random.choice(alleles)]
genotypes[",".join(genotype)] = genotype
else:
raise ValueError("Alleles per genotype must be 6 or 1")
print("Sampled genotypes: ", list(genotypes))
print("Num peptides: ", len(peptides))
start = time.time()
print("Generating predictions")
predictions_df = predictor.predict(
peptides=peptides,
alleles=genotypes)
print("Finished in %0.2f sec." % (time.time() - start))
print(predictions_df)
print("Calibrating ranks")
scores = predictions_df.presentation_score.values
predictor.calibrate_percentile_ranks(scores)
print("Done. Saving.")
predictor.save(
args.models_dir,
write_affinity_predictor=False,
write_processing_predictor=False,
write_weights=False,
write_percent_ranks=True,
write_info=False,
write_metdata=False)
print("Wrote predictor to: %s" % args.models_dir)
def run_class1_affinity_predictor(args, peptides):
global GLOBAL_DATA
# It's important that we don't trigger a Keras import here since that breaks
# local parallelism (tensorflow backend). So we set optimization_level=0.
predictor = Class1AffinityPredictor.load(
args.models_dir,
optimization_level=0,
)
if args.allele:
alleles = [normalize_allele_name(a) for a in args.allele]
elif args.alleles_file:
alleles = pandas.read_csv(args.alleles_file).allele.unique()
else:
alleles = predictor.supported_alleles
allele_set = set(alleles)
if predictor.allele_to_sequence:
# Collapse alleles that share the same sequence, keeping one representative per sequence.
new_allele_set = set()
sequence_to_allele = collections.defaultdict(set)
for allele in list(allele_set):
sequence_to_allele[predictor.allele_to_sequence[allele]].add(allele)
for equivalent_alleles in sequence_to_allele.values():
equivalent_alleles = sorted(equivalent_alleles)
keep = equivalent_alleles.pop(0)
new_allele_set.add(keep)
print(
"Sequence comparison reduced num alleles from",
len(allele_set),
"to",
len(new_allele_set))
allele_set = new_allele_set
alleles = sorted(allele_set)
print("Percent rank calibration for %d alleles. " % (len(alleles)))
print("Encoding %d peptides." % len(peptides))
start = time.time()
encoded_peptides = EncodableSequences.create(peptides)
del peptides
# Now we encode the peptides for each neural network, so the encoding
# becomes cached.
for network in predictor.neural_networks:
network.peptides_to_network_input(encoded_peptides)
assert encoded_peptides.encoding_cache # must have cached the encoding
print("Finished encoding peptides in %0.2f sec." % (time.time() - start))
# Store peptides in global variable so they are in shared memory
# after fork, instead of needing to be pickled (when doing a parallel run).
GLOBAL_DATA["calibration_peptides"] = encoded_peptides
GLOBAL_DATA["predictor"] = predictor
GLOBAL_DATA["args"] = {
'motif_summary': args.motif_summary,
'summary_top_peptide_fractions': args.summary_top_peptide_fraction,
'verbose': args.verbosity > 0,
'model_kwargs': {
'batch_size': args.prediction_batch_size,
}
}
del encoded_peptides
serial_run = not args.cluster_parallelism and args.num_jobs == 0
worker_pool = None
start = time.time()
work_items = []
for allele in alleles:
if not work_items or len(
work_items[-1]['alleles']) >= args.alleles_per_work_chunk:
work_items.append({"alleles": []})
work_items[-1]['alleles'].append(allele)
if serial_run:
# Serial run
print("Running in serial.")
results = (
do_class1_affinity_calibrate_percentile_ranks(**item) for item in work_items)
elif args.cluster_parallelism:
# Run using separate processes on an HPC cluster.
print("Running on cluster.")
results = cluster_results_from_args(
args,
work_function=do_class1_affinity_calibrate_percentile_ranks,
work_items=work_items,
constant_data=GLOBAL_DATA,
result_serialization_method="pickle",
clear_constant_data=True)
else:
worker_pool = worker_pool_with_gpu_assignments_from_args(args)
print("Worker pool", worker_pool)
assert worker_pool is not None
for item in work_items:
item['constant_data'] = GLOBAL_DATA
results = worker_pool.imap_unordered(
partial(call_wrapped_kwargs, do_class1_affinity_calibrate_percentile_ranks),
work_items,
chunksize=1)
summary_results_lists = collections.defaultdict(list)
for work_item in tqdm.tqdm(results, total=len(work_items)):
for (transforms, summary_results) in work_item:
predictor.allele_to_percent_rank_transform.update(transforms)
if summary_results is not None:
for (item, value) in summary_results.items():
summary_results_lists[item].append(value)
print("Done calibrating %d alleles." % len(alleles))
if summary_results_lists:
for (name, lst) in summary_results_lists.items():
df = pandas.concat(lst, ignore_index=True)
predictor.metadata_dataframes[name] = df
print("Including summary result: %s" % name)
print(df)
predictor.save(args.models_dir, model_names_to_write=[])
percent_rank_calibration_time = time.time() - start
if worker_pool:
worker_pool.close()
worker_pool.join()
print("Percent rank calibration time: %0.2f min." % (
percent_rank_calibration_time / 60.0))
print("Predictor written to: %s" % args.models_dir)
def do_class1_affinity_calibrate_percentile_ranks(
alleles, constant_data=GLOBAL_DATA):
if 'predictor' not in constant_data:
raise ValueError("No predictor provided: " + str(constant_data))
result_list = []
for (i, allele) in enumerate(alleles):
print("Processing allele", i + 1, "of", len(alleles))
result_item = class1_affinity_calibrate_percentile_ranks(
allele,
constant_data['predictor'],
peptides=constant_data['calibration_peptides'],
**constant_data["args"])
result_list.append(result_item)
return result_list
def class1_affinity_calibrate_percentile_ranks(
allele,
predictor,
peptides=None,
motif_summary=False,
summary_top_peptide_fractions=[0.001],
verbose=False,
model_kwargs={}):
if verbose:
print("Calibrating", allele)
predictor.optimize() # since we loaded with optimization_level=0
start = time.time()
summary_results = predictor.calibrate_percentile_ranks(
peptides=peptides,
alleles=[allele],
motif_summary=motif_summary,
summary_top_peptide_fractions=summary_top_peptide_fractions,
verbose=verbose,
model_kwargs=model_kwargs)
if verbose:
print("Done calibrating", allele, "in", time.time() - start, "sec")
transforms = {
allele: predictor.allele_to_percent_rank_transform[allele],
}
return (transforms, summary_results)
if __name__ == '__main__':
run()
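# Illustrative invocation sketch (not part of the original module). The flags
# match the parser defined above; the models directory is hypothetical and must
# already contain a trained class1 affinity predictor.
#
#     run([
#         "--models-dir", "./models",
#         "--num-peptides-per-length", "1000",
#         "--motif-summary",
#     ])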
8 file diffs are collapsed and not shown here.
"""
Layer-sequential unit-variance initialization for neural networks.
See:
Mishkin and Matas, "All you need is a good init". 2016.
https://arxiv.org/abs/1511.06422
"""
#
# LSUV initialization code in this file is adapted from:
# https://github.com/ducha-aiki/LSUV-keras/blob/master/lsuv_init.py
# by Dmytro Mishkin
#
# Here is the license for the original code:
#
#
# Copyright (C) 2017, Dmytro Mishkin
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the
# distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from __future__ import print_function
import numpy
from .common import configure_tensorflow
def svd_orthonormal(shape):
# Orthonormal init code is from Lasagne
# https://github.com/Lasagne/Lasagne/blob/master/lasagne/init.py
if len(shape) < 2:
raise RuntimeError("Only shapes of length 2 or more are supported.")
flat_shape = (shape[0], numpy.prod(shape[1:]))
a = numpy.random.standard_normal(flat_shape).astype("float32")
u, _, v = numpy.linalg.svd(a, full_matrices=False)
q = u if u.shape == flat_shape else v
q = q.reshape(shape)
return q
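# Illustrative check (not part of the original module): the returned matrix has
# orthonormal columns, which is the property the LSUV procedure below relies on.
def _demo_svd_orthonormal():
    q = svd_orthonormal((64, 32))
    print(numpy.allclose(q.T @ q, numpy.eye(32), atol=1e-5))  # True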
def get_activations(model, layer, X_batch):
configure_tensorflow()
from tensorflow.keras.models import Model
intermediate_layer_model = Model(
inputs=model.get_input_at(0),
outputs=layer.get_output_at(0)
)
activations = intermediate_layer_model.predict(X_batch)
return activations
def lsuv_init(model, batch, verbose=True, margin=0.1, max_iter=100):
"""
Initialize neural network weights using layer-sequential unit-variance
initialization.
See:
Mishkin and Matas, "All you need is a good init". 2016.
https://arxiv.org/abs/1511.06422
Parameters
----------
model : keras.Model
batch : dict
Training data, as would be passed to keras.Model.fit()
verbose : boolean
Whether to print progress to stdout
margin : float
Tolerated absolute deviation of each layer's output variance from 1.0
max_iter : int
Maximum number of variance-rescaling iterations per layer
Returns
-------
keras.Model
Same as what was passed in.
"""
configure_tensorflow()
from tensorflow.keras.layers import Dense, Convolution2D
needed_variance = 1.0
layers_initialized = 0
for layer in model.layers:
if not isinstance(layer, (Dense, Convolution2D)):
continue
# Skip small layers, whose activation variance is close to zero,
# especially for small batches.
if numpy.prod(layer.get_output_shape_at(0)[1:]) < 32:
if verbose:
print('LSUV initialization skipping', layer.name)
continue
layers_initialized += 1
weights_and_biases = layer.get_weights()
weights_and_biases[0] = svd_orthonormal(weights_and_biases[0].shape)
layer.set_weights(weights_and_biases)
activations = get_activations(model, layer, batch)
variance = numpy.var(activations)
iteration = 0
if verbose:
print(layer.name, variance)
while abs(needed_variance - variance) > margin:
if verbose:
print(
'LSUV initialization',
layer.name,
iteration,
needed_variance,
margin,
variance)
if numpy.abs(numpy.sqrt(variance)) < 1e-7:
break # avoid zero division
weights_and_biases = layer.get_weights()
weights_and_biases[0] /= numpy.sqrt(variance) / numpy.sqrt(
needed_variance)
layer.set_weights(weights_and_biases)
activations = get_activations(model, layer, batch)
variance = numpy.var(activations)
iteration += 1
if iteration >= max_iter:
break
if verbose:
print('Done with LSUV: total layers initialized', layers_initialized)
return model
\ No newline at end of file
4 file diffs are collapsed and not shown here.
"""
Measures of centrality (e.g. mean) used to combine predictions across an
ensemble. The inputs to these functions are log affinities, and they are expected
to return a centrality measure, also in log space.
"""
import numpy
from functools import partial
def robust_mean(log_values):
"""
Mean of values falling within the 25-75 percentiles.
Parameters
----------
log_values : 2-d numpy.array
Center is computed along the second axis (i.e. per row).
Returns
-------
center : numpy.array of length log_values.shape[0]
"""
if log_values.shape[1] <= 3:
# Too few values to use robust mean.
return numpy.nanmean(log_values, axis=1)
without_nans = numpy.nan_to_num(log_values) # replace nan with 0
mask = (
(~numpy.isnan(log_values)) &
(without_nans <= numpy.nanpercentile(log_values, 75, axis=1).reshape((-1, 1))) &
(without_nans >= numpy.nanpercentile(log_values, 25, axis=1).reshape((-1, 1))))
return (without_nans * mask.astype(float)).sum(1) / mask.sum(1)
CENTRALITY_MEASURES = {
"mean": partial(numpy.nanmean, axis=1),
"median": partial(numpy.nanmedian, axis=1),
"robust_mean": robust_mean,
}
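# Illustrative sketch (not part of the original module): combine four ensemble
# members' log affinities for two peptides. In the first row the extreme value
# falls outside the 25-75 percentile band and is excluded by robust_mean.
def _demo_centrality():
    log_values = numpy.array([
        [2.0, 2.1, 1.9, 9.0],        # one outlying ensemble member
        [3.0, numpy.nan, 3.2, 2.8],  # one missing prediction
    ])
    print(robust_mean(log_values))                # approximately [2.05, 3.0]
    print(CENTRALITY_MEASURES["mean"](log_values))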
2 file diffs are collapsed and not shown here.