Unverified commit 18d0bd99 authored by Tim O'Donnell, committed by GitHub

Merge pull request #145 from openvax/v1.3

pan allele prediction (MHCflurry 1.3.0)
parents 74b751e6 3cbdfd69
Showing changed files with 2642 additions and 522 deletions
"""
Generate grid of hyperparameters
"""
from sys import stdout
from copy import deepcopy
from yaml import dump
base_hyperparameters = {
'activation': 'tanh',
'allele_dense_layer_sizes': [],
'batch_normalization': False,
'dense_layer_l1_regularization': 0.0,
'dense_layer_l2_regularization': 0.0,
'dropout_probability': 0.5,
'early_stopping': True,
'init': 'glorot_uniform',
'layer_sizes': [1024, 512],
'learning_rate': 0.001,
'locally_connected_layers': [],
'loss': 'custom:mse_with_inequalities',
'max_epochs': 5000,
'minibatch_size': 128,
'optimizer': 'rmsprop',
'output_activation': 'sigmoid',
"patience": 20,
"min_delta": 0.0,
'peptide_encoding': {
'vector_encoding_name': 'BLOSUM62',
'alignment_method': 'left_pad_centered_right_pad',
'max_length': 15,
},
'peptide_allele_merge_activation': '',
'peptide_allele_merge_method': 'concatenate',
'peptide_amino_acid_encoding': 'BLOSUM62',
'peptide_dense_layer_sizes': [],
'random_negative_affinity_max': 50000.0,
'random_negative_affinity_min': 20000.0,
'random_negative_constant': 1500,
'random_negative_distribution_smoothing': 0.0,
'random_negative_match_distribution': True,
'random_negative_rate': 0.2,
'train_data': {
'pretrain': True,
'pretrain_peptides_per_epoch': 64,
'pretrain_steps_per_epoch': 256,
'pretrain_patience': 2,
'pretrain_min_delta': 0.0001,
'pretrain_max_val_loss': 0.10,
'pretrain_max_epochs': 50,
'pretrain_min_epochs': 5,
},
'validation_split': 0.1,
'data_dependent_initialization_method': "lsuv",
}
grid = []
for layer_sizes in [[512, 256], [512, 512], [1024, 512], [1024, 1024]]:
for pretrain in [True]:
l1_base = 0.0000001
for l1 in [l1_base, l1_base / 10, l1_base / 100, l1_base / 1000, 0.0]:
for lr in [0.001, 0.01]:
new = deepcopy(base_hyperparameters)
new["layer_sizes"] = layer_sizes
new["dense_layer_l1_regularization"] = l1
new["train_data"]["pretrain"] = pretrain
new["learning_rate"] = lr
if not grid or new not in grid:
grid.append(new)
dump(grid, stdout)
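# A minimal usage sketch (the script filename is an assumption; this file is not
# named in the diff). Redirect stdout to capture the YAML grid:
#
#   python generate_hyperparameters.py > hyperparameters.yaml
#
# The grid enumerates 4 layer-size choices x 5 L1 penalties x 2 learning rates,
# i.e. 40 hyperparameter dicts, each a copy of base_hyperparameters with
# layer_sizes, dense_layer_l1_regularization and learning_rate overridden.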
#!/bin/bash
#
# Generate predictions for random peptides. Used for pre-training some models.
#
set -e
set -x
DOWNLOAD_NAME=random_peptide_predictions
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_DIR/random_predictions.py .
cp $SCRIPT_ABSOLUTE_PATH .
time python random_predictions.py \
--num-peptides 5000000 \
--models "$(mhcflurry-downloads path models_class1_selected_no_mass_spec)/models" \
--out predictions.csv
bzip2 predictions.csv
bzip2 LOG.txt
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
"""
Generate predictions for random peptides.
"""
from __future__ import print_function
import sys
import argparse
import time
import math
import pandas
import mhcflurry
from mhcflurry.common import random_peptides
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument("--models", required=True)
parser.add_argument("--num-peptides", type=int)
parser.add_argument("--out", required=True)
parser.add_argument("--chunksize", type=int, default=10000)
def run():
args = parser.parse_args(sys.argv[1:])
print(args)
predictor = mhcflurry.Class1AffinityPredictor.load(args.models)
alleles = pandas.Series(predictor.supported_alleles)
# Clear the file
pandas.DataFrame(columns=alleles).to_csv(args.out, index=True)
(min_length, max_length) = predictor.supported_peptide_lengths
peptides_per_length = int(
math.ceil(args.chunksize / (max_length - min_length)))
peptides_written = 0
i = 0
while peptides_written < args.num_peptides:
print("Chunk %d / %d" % (
i + 1, math.ceil(args.num_peptides / args.chunksize)))
start = time.time()
peptides = []
for l in range(8, 16):
peptides.extend(random_peptides(peptides_per_length, length=l))
peptides = pandas.Series(peptides).sample(
n=min(args.chunksize, args.num_peptides - peptides_written)).values
encodable_peptides = mhcflurry.encodable_sequences.EncodableSequences.create(
peptides)
df = pandas.DataFrame(index=peptides)
for allele in alleles:
df[allele] = predictor.predict(encodable_peptides, allele=allele)
df.to_csv(
args.out, index=True, mode='a', header=False, float_format='%.1f')
print("Wrote: %s [%0.2f sec]" % (args.out, time.time() - start))
i += 1
peptides_written += len(peptides)
print("Done.")
if __name__ == '__main__':
run()
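# Example invocation (a sketch; the models path mirrors the shell script above and
# the peptide count is reduced for illustration):
#
#   python random_predictions.py \
#       --num-peptides 100000 \
#       --models "$(mhcflurry-downloads path models_class1_selected_no_mass_spec)/models" \
#       --out predictions.csv
#
# The CSV grows chunk by chunk: the index column holds the peptides and there is
# one column of predicted affinities (nM) per supported allele.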
"""
Class I MHC ligand prediction package
"""
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_neural_network import Class1NeuralNetwork
from .version import __version__
......
import numpy
import pandas
from .encodable_sequences import EncodableSequences
from . import amino_acid
class AlleleEncoding(object):
def __init__(
self,
alleles,
allele_to_fixed_length_sequence=None):
def __init__(self, alleles=None, allele_to_sequence=None, borrow_from=None):
"""
A place to cache encodings for a (potentially large) sequence of alleles.
A place to cache encodings for a sequence of alleles.
We frequently work with alleles by integer indices, for example as
inputs to neural networks. This class is used to map allele names to
integer indices in a consistent way by keeping track of the universe
of alleles under use, i.e. a distinction is made between the universe
of supported alleles (what's in `allele_to_sequence`) and the actual
set of alleles used (what's in `alleles`).
Parameters
----------
alleles : list of string
Allele names
allele_to_fixed_length_sequence : dict of str -> str
Allele name to fixed lengths sequence ("pseudosequence")
allele_to_sequence : dict of str -> str
Allele name to amino acid sequence
borrow_from : AlleleEncoding, optional
If specified, do not specify allele_to_sequence. The sequences from
the provided instance are used. This guarantees that the mappings
from allele to index and from allele to sequence are the same
between the instances.
"""
alleles = pandas.Series(alleles)
if alleles is not None:
alleles = pandas.Series(alleles)
self.borrow_from = borrow_from
self.allele_to_sequence = allele_to_sequence
all_alleles = list(sorted(alleles.unique()))
if self.borrow_from is None:
assert allele_to_sequence is not None
all_alleles = (
sorted(allele_to_sequence))
self.allele_to_index = dict(
(allele, i)
for (i, allele) in enumerate(all_alleles))
unpadded = pandas.Series(
[allele_to_sequence[a] for a in all_alleles],
index=all_alleles)
self.sequences = unpadded.str.pad(
unpadded.str.len().max(), fillchar="X")
else:
assert allele_to_sequence is None
self.allele_to_index = borrow_from.allele_to_index
self.sequences = borrow_from.sequences
self.allele_to_sequence = borrow_from.allele_to_sequence
self.allele_to_index = dict(
(allele, i)
for (i, allele) in enumerate(all_alleles))
if alleles is not None:
assert all(
allele in self.allele_to_index for allele in alleles),\
"Missing alleles: " + " ".join(set(
a for a in alleles if a not in self.allele_to_index))
self.indices = alleles.map(self.allele_to_index)
assert not self.indices.isnull().any()
self.alleles = alleles
else:
self.indices = None
self.alleles = None
self.indices = alleles.map(self.allele_to_index)
self.encoding_cache = {}
self.fixed_length_sequences = pandas.Series(
[allele_to_fixed_length_sequence[a] for a in all_alleles],
index=all_alleles)
def compact(self):
"""
Return a new AlleleEncoding in which the universe of supported alleles
is only the alleles actually used.
self.encoding_cache = {}
Returns
-------
AlleleEncoding
"""
return AlleleEncoding(
alleles=self.alleles,
allele_to_sequence=dict(
(allele, self.allele_to_sequence[allele])
for allele in self.alleles.unique()))
def fixed_length_vector_encoded_sequences(self, vector_encoding_name):
def allele_representations(self, encoding_name):
"""
Encode alleles.
Encode the universe of supported allele sequences to a matrix.
Parameters
----------
vector_encoding_name : string
How to represent amino acids.
One of "BLOSUM62", "one-hot", etc. Full list of supported vector
encodings is given by available_vector_encodings() in amino_acid.
encoding_name : string
How to represent amino acids. Valid names are "BLOSUM62" or
"one-hot". See `amino_acid.ENCODING_DATA_FRAMES`.
Returns
-------
numpy.array with shape (num sequences, sequence length, m) where m is
vector_encoding_length(vector_encoding_name)
numpy.array of shape
(num alleles in universe, sequence length, vector size)
where vector size is usually 21 (20 amino acids + X character)
"""
if self.borrow_from is not None:
return self.borrow_from.allele_representations(encoding_name)
cache_key = (
"fixed_length_vector_encoding",
vector_encoding_name)
"allele_representations",
encoding_name)
if cache_key not in self.encoding_cache:
index_encoded_matrix = amino_acid.index_encoding(
self.fixed_length_sequences.values,
self.sequences.values,
amino_acid.AMINO_ACID_INDEX)
vector_encoded = amino_acid.fixed_vectors_encoding(
index_encoded_matrix,
amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name])
result = vector_encoded[self.indices]
self.encoding_cache[cache_key] = result
amino_acid.ENCODING_DATA_FRAMES[encoding_name])
self.encoding_cache[cache_key] = vector_encoded
return self.encoding_cache[cache_key]
def fixed_length_vector_encoded_sequences(self, encoding_name):
"""
Encode allele sequences (not the universe of alleles) to a matrix.
Parameters
----------
encoding_name : string
How to represent amino acids. Valid names are "BLOSUM62" or
"one-hot". See `amino_acid.ENCODING_DATA_FRAMES`.
Returns
-------
numpy.array with shape:
(num alleles, sequence length, vector size)
where vector size is usually 21 (20 amino acids + X character)
"""
cache_key = (
"fixed_length_vector_encoding",
encoding_name)
if cache_key not in self.encoding_cache:
vector_encoded = self.allele_representations(encoding_name)
result = vector_encoded[self.indices]
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
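# Usage sketch (illustration only; the allele "sequences" below are made-up
# placeholders, not real MHC pseudosequences):
#
#   allele_to_sequence = {
#       "HLA-A*02:01": "AAQQWWKKYY",
#       "HLA-B*07:02": "AAQQWWKKYF",
#   }
#   encoding = AlleleEncoding(
#       alleles=["HLA-A*02:01", "HLA-A*02:01", "HLA-B*07:02"],
#       allele_to_sequence=allele_to_sequence)
#   encoding.allele_representations("BLOSUM62").shape
#   # -> (2, 10, 21): one row per allele in the universe (allele_to_sequence)
#   encoding.fixed_length_vector_encoded_sequences("BLOSUM62").shape
#   # -> (3, 10, 21): one row per entry in `alleles`
#
#   # borrow_from ties a second encoding to the same universe, guaranteeing
#   # consistent allele-to-index and allele-to-sequence mappings:
#   other = AlleleEncoding(alleles=["HLA-B*07:02"], borrow_from=encoding)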
......@@ -68,7 +68,7 @@ W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0
X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS]
"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS].astype("int8")
assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all()
ENCODING_DATA_FRAMES = {
......@@ -153,7 +153,7 @@ def fixed_vectors_encoding(index_encoded_sequences, letter_to_vector_df):
target_shape = (
num_sequences, sequence_length, letter_to_vector_df.shape[0])
result = letter_to_vector_df.iloc[
index_encoded_sequences.flatten()
index_encoded_sequences.reshape((-1,)) # reshape() avoids copy
].values.reshape(target_shape)
return result
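# Worked example of the two-step encoding (a sketch; index_encoding and
# AMINO_ACID_INDEX are defined elsewhere in this module, and the peptides are
# arbitrary):
#
#   index_encoded = index_encoding(["SIINFEKL", "SIINFEKV"], AMINO_ACID_INDEX)
#   index_encoded.shape                           # -> (2, 8), integer-coded residues
#   vectors = fixed_vectors_encoding(
#       index_encoded, ENCODING_DATA_FRAMES["BLOSUM62"])
#   vectors.shape                                 # -> (2, 8, 21): one BLOSUM62 row
#                                                 #    per residue (20 aa + X)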
......@@ -7,23 +7,25 @@ import signal
import sys
import time
import traceback
import random
import collections
from functools import partial
import numpy
import pandas
import yaml
from sklearn.metrics.pairwise import cosine_similarity
from mhcnames import normalize_allele_name
import tqdm # progress bar
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
from .class1_affinity_predictor import Class1AffinityPredictor
from .common import configure_logging
from .parallelism import (
add_worker_pool_args,
from .encodable_sequences import EncodableSequences
from .common import configure_logging, random_peptides, amino_acid_distribution
from .local_parallelism import (
add_local_parallelism_args,
worker_pool_with_gpu_assignments_from_args,
call_wrapped)
call_wrapped_kwargs)
from .cluster_parallelism import (
add_cluster_parallelism_args,
cluster_results_from_args)
# To avoid pickling large matrices to send to child processes when running in
......@@ -44,8 +46,13 @@ parser.add_argument(
"--allele",
default=None,
nargs="+",
help="Alleles to train models for. If not specified, all alleles with "
"enough measurements will be used.")
help="Alleles to calibrate percentile ranks for. If not specified all "
"alleles are used")
parser.add_argument(
"--match-amino-acid-distribution-data",
help="Sample random peptides from the amino acid distribution of the "
"peptides listed in the supplied CSV file, which must have a 'peptide' "
"column. If not specified a uniform distribution is used.")
parser.add_argument(
"--num-peptides-per-length",
type=int,
......@@ -53,13 +60,40 @@ parser.add_argument(
default=int(1e5),
help="Number of peptides per length to use to calibrate percent ranks. "
"Default: %(default)s.")
parser.add_argument(
"--motif-summary",
default=False,
action="store_true",
help="Calculate motifs and length preferences for each allele")
parser.add_argument(
"--summary-top-peptide-fraction",
default=[0.0001, 0.001, 0.01, 0.1, 1.0],
nargs="+",
type=float,
metavar="X",
help="The top X fraction of predictions (i.e. tightest binders) to use to "
"generate motifs and length preferences. Default: %(default)s")
parser.add_argument(
"--length-range",
default=(8, 15),
type=int,
nargs=2,
help="Min and max peptide length to calibrate, inclusive. "
"Default: %(default)s")
parser.add_argument(
"--prediction-batch-size",
type=int,
default=4096,
help="Keras batch size for predictions")
parser.add_argument(
"--verbosity",
type=int,
help="Keras verbosity. Default: %(default)s",
default=0)
add_worker_pool_args(parser)
add_local_parallelism_args(parser)
add_cluster_parallelism_args(parser)
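# Example invocation (a sketch only; the console-script name, model directory and
# CSV path are assumptions, and --models-dir / --num-jobs come from argument groups
# defined outside this excerpt):
#
#   mhcflurry-calibrate-percentile-ranks \
#       --models-dir /path/to/models \
#       --match-amino-acid-distribution-data proteome_peptides.csv \
#       --motif-summary \
#       --num-peptides-per-length 100000 \
#       --num-jobs 8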
def run(argv=sys.argv[1:]):
global GLOBAL_DATA
......@@ -81,49 +115,94 @@ def run(argv=sys.argv[1:]):
else:
alleles = predictor.supported_alleles
distribution = None
if args.match_amino_acid_distribution_data:
distribution_peptides = pandas.read_csv(
args.match_amino_acid_distribution_data).peptide.unique()
distribution = amino_acid_distribution(distribution_peptides)
print("Using amino acid distribution:")
print(distribution)
start = time.time()
print("Percent rank calibration for %d alleles. Generating peptides." % (
len(alleles)))
peptides = []
lengths = range(args.length_range[0], args.length_range[1] + 1)
for length in lengths:
peptides.extend(
random_peptides(
args.num_peptides_per_length, length, distribution=distribution))
print("Done generating peptides in %0.2f sec." % (time.time() - start))
print("Encoding %d peptides." % len(peptides))
start = time.time()
print("Performing percent rank calibration. Encoding peptides.")
encoded_peptides = predictor.calibrate_percentile_ranks(
alleles=[], # don't actually do any calibration, just return peptides
num_peptides_per_length=args.num_peptides_per_length)
encoded_peptides = EncodableSequences.create(peptides)
del peptides
# Now we encode the peptides for each neural network, so the encoding
# becomes cached.
for network in predictor.neural_networks:
network.peptides_to_network_input(encoded_peptides)
assert encoded_peptides.encoding_cache # must have cached the encoding
print("Finished encoding peptides for percent ranks in %0.2f sec." % (
time.time() - start))
print("Calibrating percent rank calibration for %d alleles." % len(alleles))
print("Finished encoding peptides in %0.2f sec." % (time.time() - start))
# Store peptides in global variable so they are in shared memory
# after fork, instead of needing to be pickled (when doing a parallel run).
GLOBAL_DATA["calibration_peptides"] = encoded_peptides
GLOBAL_DATA["predictor"] = predictor
GLOBAL_DATA["args"] = {
'motif_summary': args.motif_summary,
'summary_top_peptide_fractions': args.summary_top_peptide_fraction,
'verbose': args.verbosity > 0,
'model_kwargs': {
'batch_size': args.prediction_batch_size,
}
}
del encoded_peptides
worker_pool = worker_pool_with_gpu_assignments_from_args(args)
if worker_pool is None:
serial_run = not args.cluster_parallelism and args.num_jobs == 0
worker_pool = None
start = time.time()
work_items = [{"allele": allele} for allele in alleles]
if serial_run:
# Serial run
print("Running in serial.")
results = (
calibrate_percentile_ranks(
allele=allele,
predictor=predictor,
peptides=encoded_peptides)
for allele in alleles)
do_calibrate_percentile_ranks(**item) for item in work_items)
elif args.cluster_parallelism:
# Run using separate processes HPC cluster.
print("Running on cluster.")
results = cluster_results_from_args(
args,
work_function=do_calibrate_percentile_ranks,
work_items=work_items,
constant_data=GLOBAL_DATA,
result_serialization_method="pickle",
clear_constant_data=True)
else:
# Parallel run
worker_pool = worker_pool_with_gpu_assignments_from_args(args)
print("Worker pool", worker_pool)
assert worker_pool is not None
results = worker_pool.imap_unordered(
partial(
partial(call_wrapped, calibrate_percentile_ranks),
predictor=predictor),
alleles,
partial(call_wrapped_kwargs, do_calibrate_percentile_ranks),
work_items,
chunksize=1)
for result in tqdm.tqdm(results, total=len(alleles)):
predictor.allele_to_percent_rank_transform.update(result)
print("Done calibrating %d additional alleles." % len(alleles))
summary_results_lists = collections.defaultdict(list)
for (transforms, summary_results) in tqdm.tqdm(results, total=len(work_items)):
predictor.allele_to_percent_rank_transform.update(transforms)
if summary_results is not None:
for (item, value) in summary_results.items():
summary_results_lists[item].append(value)
print("Done calibrating %d alleles." % len(work_items))
if summary_results_lists:
for (name, lst) in summary_results_lists.items():
df = pandas.concat(lst, ignore_index=True)
predictor.metadata_dataframes[name] = df
print("Including summary result: %s" % name)
print(df)
predictor.save(args.models_dir, model_names_to_write=[])
percent_rank_calibration_time = time.time() - start
......@@ -133,23 +212,42 @@ def run(argv=sys.argv[1:]):
worker_pool.join()
print("Percent rank calibration time: %0.2f min." % (
percent_rank_calibration_time / 60.0))
percent_rank_calibration_time / 60.0))
print("Predictor written to: %s" % args.models_dir)
def calibrate_percentile_ranks(allele, predictor, peptides=None):
"""
Private helper function.
"""
global GLOBAL_DATA
if peptides is None:
peptides = GLOBAL_DATA["calibration_peptides"]
predictor.calibrate_percentile_ranks(
def do_calibrate_percentile_ranks(allele, constant_data=GLOBAL_DATA):
return calibrate_percentile_ranks(
allele,
constant_data['predictor'],
peptides=constant_data['calibration_peptides'],
**constant_data["args"])
def calibrate_percentile_ranks(
allele,
predictor,
peptides=None,
motif_summary=False,
summary_top_peptide_fractions=[0.001],
verbose=False,
model_kwargs={}):
if verbose:
print("Calibrating", allele)
start = time.time()
summary_results = predictor.calibrate_percentile_ranks(
peptides=peptides,
alleles=[allele])
return {
alleles=[allele],
motif_summary=motif_summary,
summary_top_peptide_fractions=summary_top_peptide_fractions,
verbose=verbose,
model_kwargs=model_kwargs)
if verbose:
print("Done calibrating", allele, "in", time.time() - start, "sec")
transforms = {
allele: predictor.allele_to_percent_rank_transform[allele],
}
return (transforms, summary_results)
if __name__ == '__main__':
......
This diff is collapsed.
This diff is collapsed.
"""
Simple, relatively naive parallel map implementation for HPC clusters.
Used for training MHCflurry models.
"""
import traceback
import sys
import os
import time
import signal
import argparse
import pickle
import subprocess
import shutil
from .local_parallelism import call_wrapped_kwargs
from .class1_affinity_predictor import Class1AffinityPredictor
try:
from shlex import quote
except ImportError:
from pipes import quote
def add_cluster_parallelism_args(parser):
"""
Add commandline arguments controlling cluster parallelism to an argparse
ArgumentParser.
Parameters
----------
parser : argparse.ArgumentParser
"""
group = parser.add_argument_group("Cluster parallelism")
group.add_argument(
"--cluster-parallelism",
default=False,
action="store_true")
group.add_argument(
"--cluster-submit-command",
default='sh',
help="Default: %(default)s")
group.add_argument(
"--cluster-results-workdir",
default='./cluster-workdir',
help="Default: %(default)s")
group.add_argument(
'--cluster-script-prefix-path',
help="",
)
group.add_argument(
'--cluster-max-retries',
type=int,
default=3,
help="How many times to re-launch a failed worker. Default: %(default)s")
def cluster_results_from_args(
args,
work_function,
work_items,
constant_data=None,
result_serialization_method="pickle",
clear_constant_data=False):
"""
Parallel map configurable using commandline arguments. See the
cluster_results() function for docs.
The `args` parameter should be an argparse.Namespace from an argparse parser
generated using the add_cluster_parallelism_args() function.
Parameters
----------
args
work_function
work_items
constant_data
result_serialization_method
clear_constant_data
Returns
-------
generator
"""
return cluster_results(
work_function=work_function,
work_items=work_items,
constant_data=constant_data,
submit_command=args.cluster_submit_command,
results_workdir=args.cluster_results_workdir,
script_prefix_path=args.cluster_script_prefix_path,
result_serialization_method=result_serialization_method,
clear_constant_data=clear_constant_data
)
def cluster_results(
work_function,
work_items,
constant_data=None,
submit_command="sh",
results_workdir="./cluster-workdir",
script_prefix_path=None,
result_serialization_method="pickle",
max_retries=3,
clear_constant_data=False):
"""
Parallel map on an HPC cluster.
Returns [work_function(item) for item in work_items] where each invocation
of work_function is performed as a separate HPC cluster job. Order is
preserved.
Optionally, "constant data" can be specified, which will be passed to
each work_function() invocation as a keyword argument called constant_data.
This data is serialized once and all workers read it from the same source,
which is more efficient than serializing it separately for each worker.
Each worker's input is serialized to a shared NFS directory and the
submit_command is used to launch a job to process that input. The shared
filesystem is polled occasionally to watch for results, which are fed back
to the user.
Parameters
----------
work_function : A -> B
work_items : list of A
constant_data : object
submit_command : string
For running on LSF, we use "bsub" here.
results_workdir : string
Path to NFS shared directory where inputs and results can be written
script_prefix_path : string
Path to script that will be invoked to run each worker. A line calling
the _mhcflurry-cluster-worker-entry-point command will be appended to
the contents of this file.
result_serialization_method : string, one of "pickle" or "save_predictor"
The "save_predictor" works only when the return type of work_function
is Class1AffinityPredictor
max_retries : int
How many times to attempt to re-launch a failed worker
clear_constant_data : bool
If True, the constant data dict is cleared on the launching host after
it is serialized to disk.
Returns
-------
generator of B
"""
constant_payload = {
'constant_data': constant_data,
'function': work_function,
}
work_dir = os.path.join(
os.path.abspath(results_workdir),
str(int(time.time())))
os.mkdir(work_dir)
constant_payload_path = os.path.join(work_dir, "global_data.pkl")
with open(constant_payload_path, "wb") as fd:
pickle.dump(constant_payload, fd, protocol=pickle.HIGHEST_PROTOCOL)
print("Wrote:", constant_payload_path)
if clear_constant_data:
constant_data.clear()
print("Cleared constant data to free up memory.")
if script_prefix_path:
with open(script_prefix_path) as fd:
script_prefix = fd.read()
else:
script_prefix = "#!/bin/bash"
result_items = []
for (i, item) in enumerate(work_items):
item_workdir = os.path.join(
work_dir, "work-item.%03d-of-%03d" % (i, len(work_items)))
os.mkdir(item_workdir)
item_data_path = os.path.join(item_workdir, "data.pkl")
with open(item_data_path, "wb") as fd:
pickle.dump(item, fd, protocol=pickle.HIGHEST_PROTOCOL)
print("Wrote:", item_data_path)
item_result_path = os.path.join(item_workdir, "result")
item_error_path = os.path.join(item_workdir, "error.pkl")
item_finished_path = os.path.join(item_workdir, "COMPLETE")
item_script_pieces = [
script_prefix.format(work_item_num=i, work_dir=item_workdir)
]
item_script_pieces.append(" ".join([
"_mhcflurry-cluster-worker-entry-point",
"--constant-data", quote(constant_payload_path),
"--worker-data", quote(item_data_path),
"--result-out", quote(item_result_path),
"--error-out", quote(item_error_path),
"--complete-dir", quote(item_finished_path),
"--result-serialization-method", result_serialization_method,
]))
item_script = "\n".join(item_script_pieces)
item_script_path = os.path.join(
item_workdir,
"run.%d.sh" % i)
with open(item_script_path, "w") as fd:
fd.write(item_script)
print("Wrote:", item_script_path)
launch_command = " ".join([
submit_command, "<", quote(item_script_path)
])
subprocess.check_call(launch_command, shell=True)
print("Invoked", launch_command)
result_items.append({
'work_dir': item_workdir,
'finished_path': item_finished_path,
'result_path': item_result_path,
'error_path': item_error_path,
'retry_num': 0,
'launch_command': launch_command,
})
def result_generator():
start = time.time()
while result_items:
print("[%0.1f sec elapsed] waiting on %d / %d items." % (
time.time() - start, len(result_items), len(work_items)))
while True:
result_item = None
for d in result_items:
if os.path.exists(d['finished_path']):
result_item = d
break
if result_item is None:
time.sleep(60)
else:
result_items.remove(result_item)
break
complete_dir = result_item['finished_path']
result_path = result_item['result_path']
error_path = result_item['error_path']
retry_num = result_item['retry_num']
launch_command = result_item['launch_command']
print("[%0.1f sec elapsed] processing item %s" % (
time.time() - start, result_item))
if os.path.exists(error_path):
print("Error path exists", error_path)
with open(error_path, "rb") as fd:
exception = pickle.load(fd)
print(exception)
if retry_num < max_retries:
print("Relaunching", launch_command)
attempt_dir = os.path.join(
result_item['work_dir'], "attempt.%d" % retry_num)
shutil.move(complete_dir, attempt_dir)
shutil.move(error_path, attempt_dir)
subprocess.check_call(launch_command, shell=True)
print("Invoked", launch_command)
result_item['retry_num'] += 1
result_items.append(result_item)
continue
else:
print("Max retries exceeded", max_retries)
raise exception
if os.path.exists(result_path):
print("Result path exists", result_path)
if result_serialization_method == "save_predictor":
result = Class1AffinityPredictor.load(result_path)
else:
assert result_serialization_method == "pickle"
with open(result_path, "rb") as fd:
result = pickle.load(fd)
yield result
else:
raise RuntimeError("Results do not exist", result_path)
return result_generator()
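# Usage sketch (hypothetical work function and site-specific settings; assumes the
# results workdir exists on storage shared with the cluster nodes and that the
# work function is importable by the worker processes):
#
#   def square(x, constant_data=None):
#       return x * x
#
#   results = cluster_results(
#       work_function=square,
#       work_items=[{"x": 1}, {"x": 2}, {"x": 3}],  # each item is passed as kwargs
#       submit_command="bsub",                      # e.g. LSF; "sh" runs jobs locally
#       script_prefix_path=None,                    # or a file of e.g. #BSUB directives
#       results_workdir="/nfs/scratch/cluster-workdir")
#   for value in results:   # results are yielded as the jobs complete
#       print(value)        # 1, 4 and 9, one per work item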
parser = argparse.ArgumentParser(
usage="Entry point for cluster workers")
parser.add_argument(
"--constant-data",
required=True,
)
parser.add_argument(
"--worker-data",
required=True,
)
parser.add_argument(
"--result-out",
required=True,
)
parser.add_argument(
"--error-out",
required=True,
)
parser.add_argument(
"--complete-dir",
)
parser.add_argument(
"--result-serialization-method",
choices=("pickle", "save_predictor"),
default="pickle")
def worker_entry_point(argv=sys.argv[1:]):
"""
Entry point for the worker command.
Parameters
----------
argv : list of string
"""
# On sigusr1 print stack trace
print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid())
signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack())
args = parser.parse_args(argv)
with open(args.constant_data, "rb") as fd:
constant_payload = pickle.load(fd)
with open(args.worker_data, "rb") as fd:
worker_data = pickle.load(fd)
kwargs = dict(worker_data)
if constant_payload['constant_data'] is not None:
kwargs['constant_data'] = constant_payload['constant_data']
try:
result = call_wrapped_kwargs(constant_payload['function'], kwargs)
if args.result_serialization_method == 'save_predictor':
result.save(args.result_out)
else:
with open(args.result_out, "wb") as fd:
pickle.dump(result, fd, pickle.HIGHEST_PROTOCOL)
print("Wrote:", args.result_out)
except Exception as e:
print("Exception: ", e)
with open(args.error_out, "wb") as fd:
pickle.dump(e, fd, pickle.HIGHEST_PROTOCOL)
print("Wrote:", args.error_out)
raise
finally:
if args.complete_dir:
os.mkdir(args.complete_dir)
print("Created: ", args.complete_dir)
......@@ -147,3 +147,30 @@ def random_peptides(num, length=9, distribution=None):
p=distribution.values,
size=(int(num), int(length)))
]
def positional_frequency_matrix(peptides):
"""
Given a set of peptides, calculate a length x amino acids frequency matrix.
Parameters
----------
peptides : list of string
All of same length
Returns
-------
pandas.DataFrame
Index is position, columns are amino acids
"""
length = len(peptides[0])
assert all(len(peptide) == length for peptide in peptides)
counts = pandas.DataFrame(
index=[a for a in amino_acid.BLOSUM62_MATRIX.index if a != 'X'],
columns=numpy.arange(1, length + 1),
)
for i in range(length):
counts[i + 1] = pandas.Series([p[i] for p in peptides]).value_counts()
result = (counts / len(peptides)).fillna(0.0).T
result.index.name = 'position'
return result
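# Usage sketch (made-up peptides; all must have the same length):
#
#   peptides = ["SIINFEKL", "SIINFEKV", "AIINFEKL"]
#   pwm = positional_frequency_matrix(peptides)
#   pwm.loc[1, "S"]   # -> 0.667: fraction of peptides with S at position 1
#   pwm.loc[8, "L"]   # -> 0.667: fraction of peptides with L at the last position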
......@@ -5,16 +5,78 @@ For losses supporting inequalities, each training data point is associated with
one of (=), (<), or (>). For e.g. (>) inequalities, penalization is applied only
if the prediction is less than the given value.
"""
from __future__ import division
import pandas
import numpy
from numpy import isnan, array
CUSTOM_LOSSES = {}
class MSEWithInequalities(object):
def get_loss(name):
"""
Get a custom_loss.Loss instance by name.
Parameters
----------
name : string
Returns
-------
custom_loss.Loss
"""
if name.startswith("custom:"):
try:
custom_loss = CUSTOM_LOSSES[name.replace("custom:", "")]
except KeyError:
raise ValueError(
"No such custom loss: %s. Supported losses are: %s" % (
name,
", ".join([
"custom:" + loss_name for loss_name in CUSTOM_LOSSES
])))
return custom_loss
return StandardKerasLoss(name)
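# Example (names follow the CUSTOM_LOSSES registry populated at the bottom of this
# module):
#
#   get_loss("mse").loss                        # -> "mse", passed to Keras unchanged
#   loss = get_loss("custom:mse_with_inequalities")
#   loss.supports_inequalities                  # -> True
#   get_loss("custom:no_such_loss")             # raises ValueError listing valid names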
class Loss(object):
"""
Thin wrapper to keep track of neural network loss functions, which could
be custom or baked into Keras.
Each subclass or instance should define these properties/methods:
- name : string
- loss : string or function
This is what gets passed to keras.fit()
- encode_y : numpy.ndarray -> numpy.ndarray
Transformation to apply to regression target before fitting
"""
def __init__(self, name=None):
self.name = name if name else self.name # use name from class instance
def __str__(self):
return "<Loss: %s>" % self.name
class StandardKerasLoss(Loss):
"""
A loss function supported by Keras, such as MSE.
"""
Supports training a regressor on data that includes inequalities
supports_inequalities = False
supports_multiple_outputs = False
def __init__(self, loss_name="mse"):
self.loss = loss_name
Loss.__init__(self, loss_name)
@staticmethod
def encode_y(y):
return y
class MSEWithInequalities(Loss):
"""
Supports training a regression model on data that includes inequalities
(e.g. x < 100). Mean square error is used as the loss for elements with
an (=) inequality. For elements with e.g. a (> 0.5) inequality, the loss
for that element is (y_pred - 0.5)^2 (standard MSE) if y_pred < 0.5 and 0 otherwise.
......@@ -26,29 +88,30 @@ class MSEWithInequalities(object):
y_true is interpreted as follows:
between 0 - 1
Regular MSE loss is used. Penality (y_pred - y_true)**2 is applied if
Regular MSE loss is used. Penalty (y_pred - y_true)**2 is applied if
y_pred is greater or less than y_true.
between 2 - 3:
Treated as a "<" inequality. Penality (y_pred - (y_true - 2))**2 is
Treated as a "<" inequality. Penalty (y_pred - (y_true - 2))**2 is
applied only if y_pred is greater than y_true - 2.
between 4 - 5:
Treated as a ">" inequality. Penality (y_pred - (y_true - 4))**2 is
Treated as a ">" inequality. Penalty (y_pred - (y_true - 4))**2 is
applied only if y_pred is less than y_true - 4.
"""
name = "mse_with_inequalities"
supports_inequalities = True
supports_multiple_outputs = False
@staticmethod
def encode_y(y, inequalities=None):
y = array(y, dtype="float32")
if isnan(y).any():
raise ValueError("y contains NaN")
raise ValueError("y contains NaN", y)
if (y > 1.0).any():
raise ValueError("y contains values > 1.0")
raise ValueError("y contains values > 1.0", y)
if (y < 0.0).any():
raise ValueError("y contains values < 0.0")
raise ValueError("y contains values < 0.0", y)
if inequalities is None:
encoded = y
......@@ -66,8 +129,8 @@ class MSEWithInequalities(object):
@staticmethod
def loss(y_true, y_pred):
# We always delay import of Keras so that mhcflurry can be imported initially
# without tensorflow debug output, etc.
# We always delay import of Keras so that mhcflurry can be imported
# initially without tensorflow debug output, etc.
from keras import backend as K
# Handle (=) inequalities
......@@ -86,11 +149,106 @@ class MSEWithInequalities(object):
diff3 *= K.cast(y_true >= 4.0, "float32")
diff3 *= K.cast(diff3 > 0.0, "float32")
return (
K.sum(K.square(diff1), axis=-1) +
K.sum(K.square(diff2), axis=-1) +
K.sum(K.square(diff3), axis=-1))
result = (
K.sum(K.square(diff1)) +
K.sum(K.square(diff2)) +
K.sum(K.square(diff3))) / K.cast(K.shape(y_pred)[0], "float32")
return result
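# Worked example of the target encoding described above (illustrative values;
# affinities are assumed to have been mapped into [0, 1] elsewhere):
#
#   y = [0.9, 0.1, 0.5]
#   inequalities = ["=", "<", ">"]
#   MSEWithInequalities.encode_y(y, inequalities)
#   # -> approximately [0.9, 2.1, 4.5]: "=" targets stay in [0, 1], "<" targets are
#   #    shifted into [2, 3], and ">" targets into [4, 5], so loss() can recover
#   #    each element's inequality from y_true alone.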
class MSEWithInequalitiesAndMultipleOutputs(Loss):
"""
Loss supporting inequalities and multiple outputs.
This loss assumes that the normal range for y_true and y_pred is 0 - 1. As a
hack, the implementation uses other intervals for y_true to encode the
inequality and output-index information.
Inequalities are encoded into the regression target as in
the MSEWithInequalities loss.
Multiple outputs are encoded by mapping each regression target x (after
transforming for inequalities) using the rule x -> x + i * 10 where i is
the output index.
The reason for explicitly encoding multiple outputs this way (rather than
just making the regression target a matrix instead of a vector) is that
in our use cases we frequently have missing data in the regression target.
This encoding gives a simple way to penalize only on (data point, output
index) pairs that have labels.
"""
name = "mse_with_inequalities_and_multiple_outputs"
supports_inequalities = True
supports_multiple_outputs = True
@staticmethod
def encode_y(y, inequalities=None, output_indices=None):
y = array(y, dtype="float32")
if isnan(y).any():
raise ValueError("y contains NaN", y)
if (y > 1.0).any():
raise ValueError("y contains values > 1.0", y)
if (y < 0.0).any():
raise ValueError("y contains values < 0.0", y)
encoded = MSEWithInequalities.encode_y(
y, inequalities=inequalities)
if output_indices is not None:
output_indices = numpy.array(output_indices)
check_shape("output_indices", output_indices, (len(encoded),))
if (output_indices < 0).any():
raise ValueError("Invalid output indices: ", output_indices)
encoded += output_indices * 10
return encoded
@staticmethod
def loss(y_true, y_pred):
from keras import backend as K
y_true = K.flatten(y_true)
output_indices = y_true // 10
updated_y_true = y_true - (10 * output_indices)
# We index into y_pred using flattened indices since Keras backend
# supports gather but has no equivalent of tf.gather_nd:
ordinals = K.arange(K.shape(y_true)[0])
flattened_indices = (
ordinals * y_pred.shape[1] + K.cast(output_indices, "int32"))
updated_y_pred = K.gather(K.flatten(y_pred), flattened_indices)
# Alternative implementation using tensorflow, which could be used if
# we drop support for other backends:
# import tensorflow as tf
# indexer = K.stack([
# ordinals,
# K.cast(output_indices, "int32")
# ], axis=-1)
#updated_y_pred = tf.gather_nd(y_pred, indexer)
return MSEWithInequalities.loss(updated_y_true, updated_y_pred)
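# Worked example of the combined encoding (illustrative values):
#
#   y = [0.9, 0.1]
#   inequalities = ["=", ">"]
#   output_indices = [0, 2]
#   MSEWithInequalitiesAndMultipleOutputs.encode_y(y, inequalities, output_indices)
#   # -> approximately [0.9, 24.1]: 0.1 is first shifted to 4.1 for the ">"
#   #    inequality, then 10 * (output index 2) is added, so loss() compares it
#   #    against column 2 of y_pred.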
def check_shape(name, arr, expected_shape):
"""
Raise ValueError if arr.shape != expected_shape.
Parameters
----------
name : string
Included in error message to aid debugging
arr : numpy.ndarray
expected_shape : tuple of int
"""
if arr.shape != expected_shape:
raise ValueError("Expected %s to have shape %s not %s" % (
name, str(expected_shape), str(arr.shape)))
# Register custom losses.
for cls in [MSEWithInequalities]:
CUSTOM_LOSSES[cls.name] = cls()
\ No newline at end of file
for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
CUSTOM_LOSSES[cls.name] = cls()
"""
Layer-sequential unit-variance initialization for neural networks.
See:
Mishkin and Matas, "All you need is a good init". 2016.
https://arxiv.org/abs/1511.06422
"""
#
# LSUV initialization code in this file is adapted from:
# https://github.com/ducha-aiki/LSUV-keras/blob/master/lsuv_init.py
# by Dmytro Mishkin
#
# Here is the license for the original code:
#
#
# Copyright (C) 2017, Dmytro Mishkin
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the
# distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from __future__ import print_function
import numpy
def svd_orthonormal(shape):
# Orthonormal init code is from Lasagne
# https://github.com/Lasagne/Lasagne/blob/master/lasagne/init.py
if len(shape) < 2:
raise RuntimeError("Only shapes of length 2 or more are supported.")
flat_shape = (shape[0], numpy.prod(shape[1:]))
a = numpy.random.standard_normal(flat_shape).astype("float32")
u, _, v = numpy.linalg.svd(a, full_matrices=False)
q = u if u.shape == flat_shape else v
q = q.reshape(shape)
return q
def get_activations(model, layer, X_batch):
from keras.models import Model
intermediate_layer_model = Model(
inputs=model.get_input_at(0),
outputs=layer.get_output_at(0)
)
activations = intermediate_layer_model.predict(X_batch)
return activations
def lsuv_init(model, batch, verbose=True, margin=0.1, max_iter=100):
"""
Initialize neural network weights using layer-sequential unit-variance
initialization.
See:
Mishkin and Matas, "All you need is a good init". 2016.
https://arxiv.org/abs/1511.06422
Parameters
----------
model : keras.Model
batch : dict
Training data, as would be passed to keras.Model.fit()
verbose : boolean
Whether to print progress to stdout
margin : float
max_iter : int
Returns
-------
keras.Model
Same as what was passed in.
"""
from keras.layers import Dense, Convolution2D
needed_variance = 1.0
layers_initialized = 0
for layer in model.layers:
if not isinstance(layer, (Dense, Convolution2D)):
continue
# Skip small layers, where the activation variance is close to zero, especially
# for small batches.
if numpy.prod(layer.get_output_shape_at(0)[1:]) < 32:
if verbose:
print('LSUV initialization skipping', layer.name)
continue
layers_initialized += 1
weights_and_biases = layer.get_weights()
weights_and_biases[0] = svd_orthonormal(weights_and_biases[0].shape)
layer.set_weights(weights_and_biases)
activations = get_activations(model, layer, batch)
variance = numpy.var(activations)
iteration = 0
if verbose:
print(layer.name, variance)
while abs(needed_variance - variance) > margin:
if verbose:
print(
'LSUV initialization',
layer.name,
iteration,
needed_variance,
margin,
variance)
if numpy.abs(numpy.sqrt(variance)) < 1e-7:
break # avoid zero division
weights_and_biases = layer.get_weights()
weights_and_biases[0] /= numpy.sqrt(variance) / numpy.sqrt(
needed_variance)
layer.set_weights(weights_and_biases)
activations = get_activations(model, layer, batch)
variance = numpy.var(activations)
iteration += 1
if iteration >= max_iter:
break
if verbose:
print('Done with LSUV: total layers initialized', layers_initialized)
return model
\ No newline at end of file
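# Usage sketch (standalone illustration with a toy Keras model, not one of the
# MHCflurry architectures; assumes Keras with the TensorFlow backend is installed):
#
#   import numpy
#   from keras.models import Sequential
#   from keras.layers import Dense
#
#   model = Sequential([
#       Dense(64, activation="tanh", input_shape=(128,)),
#       Dense(1, activation="sigmoid"),   # fewer than 32 units, so it is skipped
#   ])
#   model.compile(loss="mse", optimizer="rmsprop")
#   batch = numpy.random.rand(256, 128).astype("float32")
#   lsuv_init(model, batch, verbose=False)
#   # Afterwards the 64-unit layer's activations on `batch` have variance within
#   # `margin` of 1.0 (unless max_iter was reached).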
......@@ -9,9 +9,9 @@ from __future__ import (
)
import logging
import yaml
from os.path import join, exists, relpath
from pipes import quote
from os.path import join, exists
from os import environ
from pipes import quote
from collections import OrderedDict
from appdirs import user_data_dir
from pkg_resources import resource_string
......@@ -81,8 +81,7 @@ def get_default_class1_models_dir(test_exists=True):
if test_exists and not exists(result):
raise IOError("No such directory: %s" % result)
return result
else:
return get_path("models_class1", "models", test_exists=test_exists)
return get_path("models_class1", "models", test_exists=test_exists)
def get_current_release_downloads():
......@@ -160,13 +159,13 @@ def configure():
metadata["releases"][_CURRENT_RELEASE]["compatibility-version"])
current_compatability = metadata["current-compatibility-version"]
if current_release_compatability != current_compatability:
logging.warn(
logging.warning(
"The specified downloads are not compatible with this version "
"of the MHCflurry codebase. Downloads: release %s, "
"compatability version: %d. Code compatability version: %d" % (
_CURRENT_RELEASE,
current_release_compatability,
current_compatability))
"compatability version: %d. Code compatability version: %d",
_CURRENT_RELEASE,
current_release_compatability,
current_compatability)
data_dir = environ.get("MHCFLURRY_DATA_DIR")
if not data_dir:
......@@ -176,6 +175,7 @@ def configure():
data_dir = user_data_dir("mhcflurry", version="4")
_DOWNLOADS_DIR = join(data_dir, _CURRENT_RELEASE)
logging.debug("Configured MHCFLURRY_DOWNLOADS_DIR: %s" % _DOWNLOADS_DIR)
logging.debug("Configured MHCFLURRY_DOWNLOADS_DIR: %s", _DOWNLOADS_DIR)
configure()
......@@ -8,7 +8,7 @@
# by name, the downloads with "default=true" are downloaded.
# This should usually be the latest release.
current-release: 1.2.0
current-release: 1.3.0
# An integer indicating what models the current MHCflurry code base is compatible
# with. Increment this integer when changes are made to MHCflurry that would break
......@@ -17,6 +17,70 @@ current-compatibility-version: 2
# Add new releases here as they are made.
releases:
1.3.0:
compatibility-version: 2
downloads:
- name: models_class1_pan
url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/models_class1_pan.20190829.tar.bz2
default: false
- name: models_class1_pan_unselected
part_urls:
- https://github.com/openvax/mhcflurry/releases/download/pan-dev1/models_class1_pan_unselected.20190826.tar.bz2.part.aa
- https://github.com/openvax/mhcflurry/releases/download/pan-dev1/models_class1_pan_unselected.20190826.tar.bz2.part.ab
default: false
- name: data_iedb
url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_iedb.20190610.tar.bz2
default: false
- name: data_systemhcatlas
url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_systemhcatlas.20190506.tar.bz2
default: false
- name: allele_sequences
url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/allele_sequences.20190506.tar.bz2
default: false
- name: random_peptide_predictions
url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/random_peptide_predictions.20190506.tar.bz2
default: false
- name: data_published
url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_published.tar.bz2
default: false
- name: data_curated
url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_curated.20190516.tar.bz2
default: true
# Older downloads
- name: models_class1
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.2/models_class1.20180225.tar.bz2
default: true
- name: models_class1_selected_no_mass_spec
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.2/models_class1_selected_no_mass_spec.20180225.tar.bz2
default: false
- name: models_class1_unselected
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.2/models_class1_unselected.20180221.tar.bz2
default: false
- name: models_class1_trained_with_mass_spec
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.2.1/models_class1_trained_with_mass_spec.20180228.tar.bz2
default: false
- name: models_class1_unselected_with_mass_spec
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.2.1/models_class1_unselected_with_mass_spec.20180227.tar.bz2
default: false
- name: models_class1_minimal
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.2/models_class1_minimal.20180226.tar.bz2
default: false
1.2.0:
compatibility-version: 2
downloads:
......
......@@ -27,7 +27,8 @@ import os
from pipes import quote
import errno
import tarfile
from tempfile import mkstemp
from shutil import copyfileobj
from tempfile import NamedTemporaryFile
from tqdm import tqdm
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
......@@ -167,8 +168,7 @@ def fetch_subcommand(args):
"\nThe requested download '%s' has already been downloaded. "
"To re-download this data, first run: \n\t%s\nin a shell "
"and then re-run this command.\n" +
"*" * 40)
% (name, 'rm -rf ' + quote(get_path(name))))
"*" * 40) % (name, 'rm -rf ' + quote(get_path(name))))
if not info['downloaded'] and (name in args.download_name or default):
items_to_fetch.add(name)
......@@ -181,27 +181,46 @@ def fetch_subcommand(args):
"DOWNLOAD NAME", "ALREADY DOWNLOADED?", "WILL DOWNLOAD NOW?", "URL"))
for (item, info) in downloads.items():
urls = (
[info['metadata']["url"]]
if "url" in info['metadata']
else info['metadata']["part_urls"])
url_description = urls[0]
if len(urls) > 1:
url_description += " + %d more parts" % (len(urls) - 1)
qprint(format_string % (
item,
yes_no(info['downloaded']),
yes_no(item in items_to_fetch),
info['metadata']["url"]))
url_description))
# TODO: may want to extract into somewhere temporary and then rename to
# avoid making an incomplete extract if the process is killed.
for item in items_to_fetch:
metadata = downloads[item]['metadata']
(temp_fd, target_path) = mkstemp()
urls = (
[metadata["url"]] if "url" in metadata else metadata["part_urls"])
temp = NamedTemporaryFile(delete=False, suffix=".tar.bz2")
try:
qprint("Downloading: %s" % metadata['url'])
urlretrieve(
metadata['url'],
target_path,
reporthook=TqdmUpTo(
unit='B', unit_scale=True, miniters=1).update_to)
qprint("Downloaded to: %s" % quote(target_path))
tar = tarfile.open(target_path, 'r:bz2')
for (url_num, url) in enumerate(urls):
qprint("Downloading [part %d/%d]: %s" % (
url_num + 1, len(urls), url))
(downloaded_path, _) = urlretrieve(
url,
temp.name if len(urls) == 1 else None,
reporthook=TqdmUpTo(
unit='B', unit_scale=True, miniters=1).update_to)
qprint("Downloaded to: %s" % quote(downloaded_path))
if downloaded_path != temp.name:
qprint("Appending to: %s" % temp.name)
with open(downloaded_path, "rb") as fd:
copyfileobj(fd, temp, length=64*1024*1024)
os.remove(downloaded_path)
temp.close()
tar = tarfile.open(temp.name, 'r:bz2')
names = tar.getnames()
logging.debug("Extracting: %s" % names)
bad_names = [
......@@ -221,7 +240,7 @@ def fetch_subcommand(args):
len(names), quote(result_dir)))
finally:
if not args.keep:
os.remove(target_path)
os.remove(temp.name)
def info_subcommand(args):
......@@ -257,10 +276,18 @@ def info_subcommand(args):
print(format_string % ("DOWNLOAD NAME", "DOWNLOADED?", "URL"))
for (item, info) in downloads.items():
urls = (
[info['metadata']["url"]]
if "url" in info['metadata']
else info['metadata']["part_urls"])
url_description = urls[0]
if len(urls) > 1:
url_description += " + %d more parts" % (len(urls) - 1)
print(format_string % (
item,
yes_no(info['downloaded']),
info['metadata']["url"]))
url_description))
def path_subcommand(args):
......
This diff is collapsed.
......@@ -37,4 +37,4 @@ CENTRALITY_MEASURES = {
"mean": partial(numpy.nanmean, axis=1),
"median": partial(numpy.nanmedian, axis=1),
"robust_mean": robust_mean,
}
\ No newline at end of file
}
"""
Hyperparameter (neural network options) management
"""
from __future__ import (
print_function,
division,
......@@ -70,8 +73,7 @@ class HyperparameterDefaults(object):
if invalid_keys:
raise ValueError(
"No such model parameters: %s. Valid parameters are: %s"
% (" ".join(invalid_keys),
" ".join(self.defaults)))
% (" ".join(invalid_keys), " ".join(self.defaults)))
def models_grid(self, **kwargs):
'''
......