Commit 95de1b69 authored by Alex Rubinsteyn

don't add --max-ic50 to args twice

parent f7039fb4
@@ -39,18 +39,6 @@ def add_imputation_argument_to_parser(parser):
         help="Use the given imputation method to generate data for pre-training models")
     return parser
 
-def add_max_ic50_argument_to_parser(parser):
-    """
-    Extends an argument parser with --max-ic50
-    """
-    parser.add_argument(
-        "--max-ic50",
-        type=float,
-        default=MAX_IC50,
-        help="Largest IC50 represented by neural network output. "
-        "Default: %(default)s")
-    return parser
 
 def add_hyperparameter_arguments_to_parser(parser):
     """
@@ -104,8 +92,8 @@ def add_hyperparameter_arguments_to_parser(parser):
         "--max-ic50",
         type=float,
         default=MAX_IC50,
-        help="Largest IC50 value representable as output of neural network")
+        help="Largest IC50 represented by neural network output. "
+        "Default: %(default)s")
     return parser
 
 def add_training_arguments_to_parser(parser):
@@ -160,7 +148,6 @@ def add_arguments_to_parser(parser):
     functions = [
         add_hyperparameter_arguments_to_parser,
         add_training_arguments_to_parser,
-        add_max_ic50_argument_to_parser,
         add_imputation_argument_to_parser,
     ]
     for fn in functions:
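Background on the fix above: with argparse's default conflict handler, registering the same option string twice on one parser raises argparse.ArgumentError, which is why --max-ic50 now lives only in add_hyperparameter_arguments_to_parser. A minimal standalone sketch of that behavior, using only the standard library (the MAX_IC50 value below is an illustrative stand-in, not necessarily the project's constant):

    import argparse

    MAX_IC50 = 50000.0  # illustrative stand-in for the project's MAX_IC50 constant

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--max-ic50",
        type=float,
        default=MAX_IC50,
        help="Largest IC50 represented by neural network output. "
        "Default: %(default)s")

    # Adding the same option string again would raise argparse.ArgumentError:
    # parser.add_argument("--max-ic50", type=float)

    args = parser.parse_args(["--max-ic50", "20000"])
    print(args.max_ic50)  # -> 20000.0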
@@ -171,6 +171,16 @@ class Class1AlleleSpecificKmerIC50PredictorBase(IC50PredictorBase):
         Extra arguments are passed on to the fit_encoded_kmer_arrays()
         method.
         """
+        if len(dataset.unique_alleles()) > 1:
+            raise ValueError(
+                "Allele-specific predictor can't be trained on multi-allele data: %s" % (
+                    dataset,))
+        if pretraining_dataset and len(pretraining_dataset.unique_alleles()) > 1:
+            raise ValueError(
+                "Allele-specific predictor can't pretrain on data from multiple alleles: %s" %
+                (pretraining_dataset,))
         X, ic50, sample_weights, original_peptide_indices = \
             dataset.kmer_index_encoding(
                 kmer_size=self.kmer_size,
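The validation added above only relies on the dataset exposing unique_alleles(). A toy sketch of the same single-allele guard, with a stand-in object (FakeDataset and check_single_allele are illustrative, not part of the project's API):

    class FakeDataset(object):
        def __init__(self, alleles):
            self.alleles = alleles

        def unique_alleles(self):
            return set(self.alleles)

    def check_single_allele(dataset):
        # mirrors the check above: allele-specific models train on one allele only
        if len(dataset.unique_alleles()) > 1:
            raise ValueError(
                "Allele-specific predictor can't be trained on multi-allele data: %s" % (
                    dataset,))

    check_single_allele(FakeDataset(["HLA-A0201"]))  # passes silently
    # check_single_allele(FakeDataset(["HLA-A0201", "HLA-B0702"]))  # would raise ValueError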
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import (
    print_function,
    division,
    absolute_import,
)
from collections import OrderedDict

import numpy as np


class ScoreSet(object):
    """
    Useful for keeping a collection of score dictionaries
    which map name->score type->list of values.
    """
    def __init__(self, verbose=True, index="name"):
        self.groups = {}
        self.verbose = verbose
        if isinstance(index, (list, tuple)):
            index = ",".join("%s" % item for item in index)
        self.index = index

    def add_many(self, group, **kwargs):
        for (k, v) in sorted(kwargs.items()):
            self.add(group, k, v)

    def add(self, group, score_type, value):
        if isinstance(group, (list, tuple)):
            group = ",".join("%s" % item for item in group)
        if group not in self.groups:
            self.groups[group] = {}
        if score_type not in self.groups[group]:
            self.groups[group][score_type] = []
        self.groups[group][score_type].append(value)
        if self.verbose:
            print("--> %s:%s %0.4f" % (group, score_type, value))

    def score_types(self):
        result = set([])
        for (g, d) in sorted(self.groups.items()):
            for score_type in sorted(d.keys()):
                result.add(score_type)
        return list(sorted(result))

    def _reduce_scores(self, reduce_fn):
        score_types = self.score_types()
        return {
            group: OrderedDict([
                (score_type, reduce_fn(score_dict[score_type]))
                for score_type in score_types
            ])
            for (group, score_dict) in self.groups.items()
        }

    def averages(self):
        return self._reduce_scores(np.mean)

    def stds(self):
        return self._reduce_scores(np.std)

    def to_csv(self, filename):
        with open(filename, "w") as f:
            header_list = [self.index]
            score_types = self.score_types()
            for score_type in score_types:
                header_list.append(score_type)
                header_list.append(score_type + "_std")
            header_line = ",".join(header_list) + "\n"
            if self.verbose:
                print(header_line)
            f.write(header_line)
            score_averages = self.averages()
            score_stds = self.stds()
            for name in sorted(score_averages.keys()):
                line_elements = [name]
                for score_type in score_types:
                    line_elements.append(
                        "%0.4f" % score_averages[name][score_type])
                    line_elements.append(
                        "%0.4f" % score_stds[name][score_type])
                line = ",".join(line_elements) + "\n"
                if self.verbose:
                    print(line)
                f.write(line)
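Assuming the ScoreSet class above is importable (its module path isn't visible on this page), a usage sketch consistent with its methods would be:

    scores = ScoreSet(verbose=False, index="allele")

    # accumulate repeated measurements of the same metrics for one group
    scores.add_many("HLA-A0201", auc=0.91, f1=0.62)
    scores.add_many("HLA-A0201", auc=0.89, f1=0.65)

    print(scores.averages())  # e.g. {'HLA-A0201': OrderedDict([('auc', 0.9...), ('f1', 0.635)])}
    print(scores.stds())

    # writes one row per group: the index column, then <score>, <score>_std pairs
    scores.to_csv("scores.csv")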
@@ -113,7 +113,7 @@ def subsample_performance(
     for i, n_train in enumerate(sample_sizes):
         for _ in range(n_repeats_per_size):
             if imputer is None:
-                dataset_train, dataset_test = dataset.random_split(n_train)
+                dataset_train, dataset_test = dataset_allele.random_split(n_train)
                 dataset_imputed = None
             else:
                 dataset_train, dataset_imputed, dataset_test = \
@@ -128,6 +128,9 @@
                     allele,
                     n_train,
                     n_total))
+            print("-- Train", dataset_train)
+            print("-- Imputed", dataset_imputed)
+            print("-- Test", dataset_test)
 
         # pick a fraction on a log-scale from the minimum to maximum number
         # of samples
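The truncated comment at the end of this hunk refers to picking sample counts on a log scale between the minimum and maximum number of samples; the code that does so isn't shown here, but one common way to build such a schedule with numpy (illustrative only, not necessarily how this module does it) is:

    import numpy as np

    def log_spaced_sample_sizes(n_min, n_max, n_sizes):
        # geometrically spaced counts between n_min and n_max, rounded and de-duplicated
        sizes = np.geomspace(n_min, n_max, num=n_sizes)
        return sorted(set(int(round(s)) for s in sizes))

    print(log_spaced_sample_sizes(10, 1000, 5))  # -> [10, 32, 100, 316, 1000]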