Commit d47ba9ca authored by Tim O'Donnell

fixes

parent 06b6ecc6
@@ -10,7 +10,6 @@ import pandas
 
 from .hyperparameters import HyperparameterDefaults
 from .encodable_sequences import EncodableSequences, EncodingError
-from .amino_acid import available_vector_encodings, vector_encoding_length
 from .regression_target import to_ic50, from_ic50
 from .common import random_peptides, amino_acid_distribution
 from .custom_loss import get_loss
@@ -156,7 +155,6 @@ class Class1NeuralNetwork(object):
             hyperparameters[to_name] = value
         return hyperparameters
 
-
     def __init__(self, **hyperparameters):
         self.hyperparameters = self.hyperparameter_defaults.with_defaults(
             self.apply_hyperparameter_renames(hyperparameters))
@@ -420,7 +418,6 @@ class Class1NeuralNetwork(object):
                 allele_encoding.allele_representations(
                     self.hyperparameters['allele_amino_acid_encoding']))
 
-
     def fit_generator(
             self,
             generator,
@@ -544,7 +541,6 @@ class Class1NeuralNetwork(object):
         fit_info["num_points"] = yielded_values_box[0]
         self.fit_info.append(dict(fit_info))
 
-
     def fit(
             self,
             peptides,
...
@@ -232,19 +232,31 @@ def run(argv=sys.argv[1:]):
             chunksize=1)
 
     models_by_fold = {}
+    summary_dfs = []
     for result in tqdm.tqdm(results, total=len(work_items)):
         pprint(result)
         fold_num = result['fold_num']
+        (all_models_for_fold, _) = folds_to_predictors[fold_num]
         models = [
-            folds_to_predictors[fold_num][0][i]
+            all_models_for_fold[i]
             for i in result['selected_indices']
         ]
+        summary_df = result['summary'].copy()
+        summary_df.index = summary_df.index.map(
+            lambda idx: all_models_for_fold[idx])
+        summary_dfs.append(summary_df)
         print("Selected %d models for fold %d: %s" % (
             len(models), fold_num, result['selected_indices']))
         models_by_fold[fold_num] = models
         for model in models:
             result_predictor.add_pan_allele_model(model)
 
+    summary_df = pandas.concat(summary_dfs, ignore_index=False)
+    summary_df["model_config"] = summary_df.index.map(lambda m: m.get_config())
+    result_predictor.metadata_dataframes["model_selection_summary"] = (
+        summary_df.reset_index(drop=True))
+
     result_predictor.save(args.out_models_dir)
 
     model_selection_time = time.time() - start
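The new summary bookkeeping above re-keys each fold's per-model score frame by the model objects themselves before concatenating, so per-model metadata can be attached at the end. A minimal sketch of that pattern, with a hypothetical Model stand-in (mhcflurry models expose get_config(); the scores and names here are made up):

import pandas

class Model(object):
    def __init__(self, name):
        self.name = name

    def get_config(self):
        return {"name": self.name}

# One hypothetical fold with three candidate models.
all_models_for_fold = [Model("m0"), Model("m1"), Model("m2")]
summary_df = pandas.DataFrame({"mse_score": [0.12, 0.34, 0.29]})

# Re-key the per-fold summary by model object rather than integer index.
summary_df.index = summary_df.index.map(lambda idx: all_models_for_fold[idx])

# After concatenating per-fold summaries, map over the model-object index
# to attach each model's config as a column.
combined = pandas.concat([summary_df], ignore_index=False)
combined["model_config"] = combined.index.map(lambda m: m.get_config())
print(combined)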
@@ -312,11 +324,16 @@ def model_select(fold_num, models, min_models, max_models):
             break
 
     assert selected
 
+    summary_df = pandas.Series(individual_model_scores)[
+        numpy.arange(len(models))
+    ].to_frame()
+    summary_df.columns = ['mse_score']
+
     return {
         'fold_num': fold_num,
         'selected_indices': selected,
-        'individual_model_scores': pandas.Series(
-            individual_model_scores)[numpy.arange(len(models))],
+        'summary': summary_df,  # indexed by model index
     }
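The summary_df construction relies on a pandas idiom worth unpacking: building a Series from the score mapping and label-indexing it with numpy.arange(len(models)) yields the scores in model-index order before .to_frame() produces the one-column summary. A minimal sketch with fabricated scores:

import numpy
import pandas

# Hypothetical per-model MSE scores, keyed by model index in arbitrary order.
individual_model_scores = {2: 0.29, 0: 0.12, 1: 0.34}
num_models = 3

# Label-indexing with arange(num_models) returns the scores ordered by
# model index; .to_frame() makes the one-column summary returned above.
summary_df = pandas.Series(individual_model_scores)[
    numpy.arange(num_models)
].to_frame()
summary_df.columns = ['mse_score']
print(summary_df)
#    mse_score
# 0       0.12
# 1       0.34
# 2       0.29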
...
@@ -15,8 +15,6 @@ from functools import partial
 import numpy
 import pandas
 import yaml
-from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.model_selection import StratifiedKFold
 from mhcnames import normalize_allele_name
 import tqdm  # progress bar
 tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
@@ -28,11 +26,8 @@ from .parallelism import (
     add_worker_pool_args,
     worker_pool_with_gpu_assignments_from_args,
     call_wrapped_kwargs)
-from .hyperparameters import HyperparameterDefaults
 from .allele_encoding import AlleleEncoding
 from .encodable_sequences import EncodableSequences
-from .regression_target import to_ic50, from_ic50
-from .import custom_loss
 
 
 # To avoid pickling large matrices to send to child processes when running in
@@ -173,10 +168,6 @@ def assign_folds(df, num_folds, held_out_fraction, held_out_max):
 
     print("Test points per fold")
     print((~result_df).sum())
 
-    result_df["allele"] = df["allele"]
-    result_df["peptide"] = df["peptide"]
-
     return result_df
 
@@ -304,10 +295,13 @@ def main(args):
     predictor = Class1AffinityPredictor(
         allele_to_sequence=allele_encoding.allele_to_sequence,
         metadata_dataframes={
-            'train_data': df,
-            'training_folds': folds_df,
+            'train_data': pandas.merge(
+                df,
+                folds_df,
+                left_index=True,
+                right_index=True)
         })
-    serial_run = args.num_jobs == 1
+    serial_run = args.num_jobs == 0
 
     work_items = []
     for (h, hyperparameters) in enumerate(hyperparameters_lst):
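The metadata change above merges the fold assignments into train_data on the shared row index, which is also why assign_folds no longer needs to copy the allele and peptide columns into its result. A minimal sketch of the merge, with fabricated rows:

import pandas

# Hypothetical training data and per-fold membership flags, sharing a row index.
df = pandas.DataFrame({
    "allele": ["HLA-A*02:01", "HLA-B*07:02"],
    "peptide": ["SIINFEKLM", "AAAWYLWEV"],
    "measurement_value": [120.0, 5300.0],
})
folds_df = pandas.DataFrame({
    "fold_0": [True, False],
    "fold_1": [False, True],
})

# Index-on-index merge: each training measurement now carries its fold flags.
train_data = pandas.merge(df, folds_df, left_index=True, right_index=True)
print(train_data)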
@@ -353,6 +347,7 @@ def main(args):
 
     if worker_pool:
         print("Processing %d work items in parallel." % len(work_items))
+        assert not serial_run
 
         results_generator = worker_pool.imap_unordered(
             partial(call_wrapped_kwargs, train_model),
@@ -389,6 +384,7 @@ def main(args):
         # which it adds models to, so no merging is required. It also saves
         # as it goes so no saving is required at the end.
         print("Processing %d work items in serial." % len(work_items))
+        assert serial_run
         for _ in tqdm.trange(len(work_items)):
             item = work_items.pop(0)  # want to keep freeing up memory
             work_predictor = train_model(**item)
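The two asserts pair with the serial_run redefinition above: per this commit, num_jobs == 0 now means in-process execution, and each branch guards against being reached in the wrong mode. A minimal sketch of that guarded dispatch, with hypothetical helpers (run_one stands in for train_model; worker_pool would come from worker_pool_with_gpu_assignments_from_args in the real script):

def run_one(item):
    return item  # stand-in for train_model

def dispatch(work_items, num_jobs, worker_pool=None):
    # num_jobs == 0 means run everything in-process (serial).
    serial_run = num_jobs == 0
    if worker_pool:
        # A pool should never be paired with serial mode.
        assert not serial_run
        return list(worker_pool.imap_unordered(run_one, work_items))
    assert serial_run
    return [run_one(item) for item in work_items]

print(dispatch([{"fold": 0}, {"fold": 1}], num_jobs=0))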
...