Commit df5ac484 authored by Alex Rubinsteyn

fixed neural network test, fixed length filtering of data loading, started adding imputation to training script
parent f3ec930d
@@ -68,13 +68,12 @@ def _infer_csv_separator(filename):
 def load_dataframe(
         filename,
-        peptide_length=None,
         max_ic50=MAX_IC50,
         sep=None,
         species_column_name="species",
         allele_column_name="mhc",
         peptide_column_name=None,
-        peptide_length_column_name="peptide_length",
+        filter_peptide_length=None,
         ic50_column_name="meas",
         only_human=False):
     """
@@ -88,9 +87,6 @@ def load_dataframe(
     - 'sequence'
     - 'meas'

-    peptide_length : int, optional
-        Which length peptides to use (default=load all lengths)
-
     max_ic50 : float
         Treat IC50 scores above this value as all equally bad
         (transform them to 0.0 in the regression output)
@@ -101,6 +97,9 @@
     peptide_column_name : str, optional
         Default behavior is to try {"sequence", "peptide", "peptide_sequence"}

+    filter_peptide_length : int, optional
+        Which length peptides to use (default=load all lengths)
+
     only_human : bool
         Only load entries from human MHC alleles
@@ -131,8 +130,9 @@
     if only_human:
         human_mask = df[species_column_name] == "human"
         df = df[human_mask]
-    if peptide_length is not None:
-        length_mask = df[peptide_length_column_name] == peptide_length
+    if filter_peptide_length:
+        length_mask = df[peptide_column_name].str.len() == filter_peptide_length
         df = df[length_mask]
     df[allele_column_name] = df[allele_column_name].map(normalize_allele_name)
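
A minimal sketch of the new filtering behavior, assuming a toy frame with the default column names: the mask is now computed from the peptide string itself, so a precomputed "peptide_length" column is no longer required.

import pandas as pd

# Toy frame mirroring the expected columns ("sequence" for peptides, "meas" for IC50)
df = pd.DataFrame({
    "sequence": ["SIINFEKL", "SIINFEKLM", "AAAWYLWEV"],  # lengths 8, 9, 9
    "meas": [120.0, 25000.0, 300.0],
})

filter_peptide_length = 9
if filter_peptide_length:
    # measure length directly from the peptide string
    length_mask = df["sequence"].str.len() == filter_peptide_length
    df = df[length_mask]

print(df)  # only the two 9-mer rows remain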
@@ -383,11 +383,10 @@ def load_allele_datasets(
         filename=filename,
         max_ic50=max_ic50,
         sep=sep,
-        peptide_length=peptide_length,
         species_column_name=species_column_name,
         allele_column_name=allele_column_name,
         peptide_column_name=peptide_column_name,
-        peptide_length_column_name=peptide_length_column_name,
+        filter_peptide_length=None if use_multiple_peptide_lengths else peptide_length,
         ic50_column_name=ic50_column_name,
         only_human=only_human)
@@ -19,7 +19,16 @@ from __future__ import (
 )
 from collections import defaultdict

 import numpy as np
-from fancyimpute.dictionary_helpers import dense_matrix_from_nested_dictionary
+from fancyimpute.dictionary_helpers import (
+    dense_matrix_from_nested_dictionary
+)
+from fancyimpute import (
+    KNN,
+    IterativeSVD,
+    SimpleFill,
+    SoftImpute,
+    MICE
+)
 from .data import (
     create_allele_data_from_peptide_to_ic50_dict,
@@ -190,3 +199,32 @@ def create_imputed_datasets(
         for (allele_name, allele_dict)
         in allele_to_peptide_to_affinity_dict.items()
     }
+
+
+def imputer_from_name(imputation_method_name, **kwargs):
+    """
+    Helper function for constructing an imputation object from a name,
+    typically given as a command-line argument.
+    """
+    imputation_method_name = imputation_method_name.strip().lower()
+    if imputation_method_name == "mice":
+        kwargs["n_burn_in"] = kwargs.get("n_burn_in", 5)
+        kwargs["n_imputations"] = kwargs.get("n_imputations", 25)
+        kwargs["min_value"] = kwargs.get("min_value", 0)
+        kwargs["max_value"] = kwargs.get("max_value", 1)
+        return MICE(**kwargs)
+    elif imputation_method_name == "knn":
+        kwargs["k"] = kwargs.get("k", 3)
+        kwargs["orientation"] = kwargs.get("orientation", "columns")
+        kwargs["print_interval"] = kwargs.get("print_interval", 10)
+        return KNN(**kwargs)
+    elif imputation_method_name == "svd":
+        kwargs["rank"] = kwargs.get("rank", 10)
+        return IterativeSVD(**kwargs)
+    elif imputation_method_name in ("svt", "softimpute"):
+        return SoftImpute(**kwargs)
+    elif imputation_method_name == "mean":
+        return SimpleFill("mean", **kwargs)
+    elif imputation_method_name == "none":
+        return None
+    else:
+        raise ValueError("Invalid imputation method: %s" % imputation_method_name)
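
A usage sketch for this helper (hedged: fancyimpute solvers of this vintage fill in missing entries via a complete() method; newer releases renamed it fit_transform):

import numpy as np
from mhcflurry.imputation import imputer_from_name

# Hypothetical peptide x allele affinity matrix with missing entries
X_incomplete = np.array([
    [0.9, np.nan, 0.2],
    [np.nan, 0.4, 0.3],
    [0.8, 0.5, np.nan],
])

imputer = imputer_from_name("mice", n_imputations=50)  # overrides the default of 25
X_filled = imputer.complete(X_incomplete)  # fit_transform in newer fancyimpute
assert not np.isnan(X_filled).any()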
@@ -53,6 +53,8 @@ from mhcflurry.paths import (
     CLASS1_MODEL_DIRECTORY,
     CLASS1_DATA_DIRECTORY
 )
+from mhcflurry.imputation import imputer_from_name, create_imputed_datasets
+
 CSV_FILENAME = "combined_human_class1_dataset.csv"
 CSV_PATH = join(CLASS1_DATA_DIRECTORY, CSV_FILENAME)
@@ -92,16 +94,24 @@ parser.add_argument(
     nargs="+",
     type=normalize_allele_name)

+parser.add_argument(
+    "--imputation-method",
+    default=None,
+    # argparse applies `type` before checking `choices`, so a tuple of string
+    # choices can never match the converted imputer object; name validation
+    # happens inside imputer_from_name instead
+    type=imputer_from_name,
+    help="Use the given imputation method to generate data for pre-training "
+         "models (one of: mice, knn, softimpute, svd, mean, none)")
+
 # add options for neural network hyperparameters
 parser = add_hyperparameter_arguments_to_parser(parser)

 if __name__ == "__main__":
     args = parser.parse_args()
     print(args)

     if not exists(args.output_dir):
         makedirs(args.output_dir)

-    allele_groups = load_allele_datasets(
+    allele_data_dict = load_allele_datasets(
         filename=args.binding_data_csv,
         peptide_length=9,
+        use_multiple_peptide_lengths=True,
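
Because the new --imputation-method option passes type=imputer_from_name, argparse runs the conversion during parsing, so downstream code receives a constructed imputer object (or None) rather than a string. A minimal standalone sketch of that behavior, assuming mhcflurry and fancyimpute are importable:

import argparse
from mhcflurry.imputation import imputer_from_name

parser = argparse.ArgumentParser()
parser.add_argument(
    "--imputation-method",
    default=None,
    type=imputer_from_name)  # "mice" -> MICE(...) during parse_args

args = parser.parse_args(["--imputation-method", "mice"])
print(type(args.imputation_method).__name__)  # -> MICE

This is also why the converter itself raises ValueError on unknown names: argparse only checks choices after applying type, so the usual choices tuple cannot do the validation here.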
@@ -111,15 +121,21 @@ if __name__ == "__main__":

     # concatenate datasets from all alleles to use for pre-training of
     # allele-specific predictors
-    X_all = np.vstack([group.X_index for group in allele_groups.values()])
-    Y_all = np.concatenate([group.Y for group in allele_groups.values()])
+    X_all = np.vstack([group.X_index for group in allele_data_dict.values()])
+    Y_all = np.concatenate([group.Y for group in allele_data_dict.values()])
     print("Total Dataset size = %d" % len(Y_all))

+    if args.imputation_method is not None:
+        # TODO: use imputed data for training
+        imputed_data_dict = create_imputed_datasets(
+            allele_data_dict,
+            args.imputation_method)
+
     # if user didn't specify alleles then train models for all available alleles
     alleles = args.alleles
     if not alleles:
-        alleles = sorted(allele_groups.keys())
+        alleles = sorted(allele_data_dict.keys())

     for allele_name in alleles:
         allele_name = normalize_allele_name(allele_name)
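
The TODO above leaves the imputed data unused for now. One plausible wiring, shown only as a hedged sketch (make_predictor is hypothetical, and the pre-train-then-fine-tune order is an assumption consistent with the "pre-training" comment, not something this commit implements):

# hypothetical use of imputed data for per-allele pre-training
for allele_name, allele_data in allele_data_dict.items():
    model = make_predictor(allele_name)           # hypothetical model constructor
    imputed = imputed_data_dict.get(allele_name)  # denser than the measured data
    if imputed is not None:
        model.fit(imputed.X_index, imputed.Y, nb_epoch=5)        # pre-train on imputed values
    model.fit(allele_data.X_index, allele_data.Y, nb_epoch=20)   # fine-tune on measurements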
@@ -127,7 +143,7 @@
             print("Skipping allele %s" % (allele_name,))
             continue

-        allele_data = allele_groups[allele_name]
+        allele_data = allele_data_dict[allele_name]
         X = allele_data.X_index
         Y = allele_data.Y
@@ -11,7 +11,7 @@ def test_make_embedding_network():
         activation="tanh",
         embedding_input_dim=3,
         embedding_output_dim=20,
-        learning_rate=0.1)
+        learning_rate=0.05)

     X_negative = np.array([
         [0] * 3,
@@ -33,7 +33,7 @@ def test_make_embedding_network():
     ])
     X_index = np.vstack([X_negative, X_positive])
     Y = np.array([0.0] * len(X_negative) + [1.0] * len(X_positive))
-    nn.fit(X_index, Y, nb_epoch=10)
+    nn.fit(X_index, Y, nb_epoch=20)
     Y_pred = nn.predict(X_index)
     print(Y)
     print(Y_pred)
@@ -46,7 +46,7 @@ def test_make_hotshot_network():
         activation="relu",
         layer_sizes=[4],
         n_amino_acids=2,
-        learning_rate=0.1)
+        learning_rate=0.05)

     X_binary = np.array([
         [True, False, True, False, True, False],
         [True, False, True, False, False, True],
@@ -56,9 +56,16 @@ def test_make_hotshot_network():
         [False, True, True, False, False, True],
     ], dtype=bool)
     Y = np.array([0.0, 0.0, 0.0, 0.0, 1.0, 1.0])
-    nn.fit(X_binary, Y, nb_epoch=10)
+    nn.fit(X_binary, Y, nb_epoch=20)
     Y_pred = nn.predict(X_binary)
     print(Y)
     print(Y_pred)
     for (Y_i, Y_pred_i) in zip(Y, Y_pred):
-        assert abs(Y_i - Y_pred_i) <= 0.25, (Y_i, Y_pred_i)
+        if Y_i:
+            assert Y_pred_i >= 0.6, "Expected higher value than %f" % Y_pred_i
+        else:
+            assert Y_pred_i <= 0.4, "Expected lower value than %f" % Y_pred_i

 if __name__ == "__main__":
     test_make_hotshot_network()
     test_make_embedding_network()