Commit df5ac484 authored by Alex Rubinsteyn

fixed neural network test, fixed length filtering of data loading, started adding imputation to training script
parent f3ec930d
@@ -68,13 +68,12 @@ def _infer_csv_separator(filename):
 def load_dataframe(
         filename,
-        peptide_length=None,
         max_ic50=MAX_IC50,
         sep=None,
         species_column_name="species",
         allele_column_name="mhc",
         peptide_column_name=None,
-        peptide_length_column_name="peptide_length",
+        filter_peptide_length=None,
         ic50_column_name="meas",
         only_human=False):
     """
@@ -88,9 +87,6 @@ def load_dataframe(
     - 'sequence'
     - 'meas'

-    peptide_length : int, optional
-        Which length peptides to use (default=load all lengths)
-
     max_ic50 : float
         Treat IC50 scores above this value as all equally bad
         (transform them to 0.0 in the regression output)
@@ -101,6 +97,9 @@
     peptide_column_name : str, optional
         Default behavior is to try {"sequence", "peptide", "peptide_sequence"}

+    filter_peptide_length : int, optional
+        Which length peptides to use (default=load all lengths)
+
     only_human : bool
         Only load entries from human MHC alleles
@@ -131,8 +130,9 @@
     if only_human:
         human_mask = df[species_column_name] == "human"
         df = df[human_mask]
-    if peptide_length is not None:
-        length_mask = df[peptide_length_column_name] == peptide_length
+    if filter_peptide_length:
+        length_mask = df[peptide_column_name].str.len() == filter_peptide_length
         df = df[length_mask]
     df[allele_column_name] = df[allele_column_name].map(normalize_allele_name)
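
A minimal sketch of the new filtering behavior, assuming a toy frame with the default column names: the mask is now computed from the peptide string itself, so a precomputed "peptide_length" column is no longer required.

import pandas as pd

# Toy frame mirroring the expected columns ("sequence" for peptides, "meas" for IC50)
df = pd.DataFrame({
    "sequence": ["SIINFEKL", "SIINFEKLM", "AAAWYLWEV"],  # lengths 8, 9, 9
    "meas": [120.0, 25000.0, 300.0],
})

filter_peptide_length = 9
if filter_peptide_length:
    # measure length directly from the peptide string
    length_mask = df["sequence"].str.len() == filter_peptide_length
    df = df[length_mask]

print(df)  # only the two 9-mer rows remain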
@@ -383,11 +383,10 @@ def load_allele_datasets(
         filename=filename,
         max_ic50=max_ic50,
         sep=sep,
-        peptide_length=peptide_length,
         species_column_name=species_column_name,
         allele_column_name=allele_column_name,
         peptide_column_name=peptide_column_name,
-        peptide_length_column_name=peptide_length_column_name,
+        filter_peptide_length=None if use_multiple_peptide_lengths else peptide_length,
         ic50_column_name=ic50_column_name,
         only_human=only_human)
@@ -19,7 +19,16 @@ from __future__ import (
 )
 from collections import defaultdict

 import numpy as np
-from fancyimpute.dictionary_helpers import dense_matrix_from_nested_dictionary
+from fancyimpute.dictionary_helpers import (
+    dense_matrix_from_nested_dictionary
+)
+from fancyimpute import (
+    KNN,
+    IterativeSVD,
+    SimpleFill,
+    SoftImpute,
+    MICE
+)
 from .data import (
     create_allele_data_from_peptide_to_ic50_dict,
@@ -190,3 +199,32 @@ def create_imputed_datasets(
         for (allele_name, allele_dict)
         in allele_to_peptide_to_affinity_dict.items()
     }
+
+
+def imputer_from_name(imputation_method_name, **kwargs):
+    """
+    Helper function for constructing an imputation object from a name,
+    typically given as a command-line argument.
+    """
+    imputation_method_name = imputation_method_name.strip().lower()
+    if imputation_method_name == "mice":
+        kwargs["n_burn_in"] = kwargs.get("n_burn_in", 5)
+        kwargs["n_imputations"] = kwargs.get("n_imputations", 25)
+        kwargs["min_value"] = kwargs.get("min_value", 0)
+        kwargs["max_value"] = kwargs.get("max_value", 1)
+        return MICE(**kwargs)
+    elif imputation_method_name == "knn":
+        kwargs["k"] = kwargs.get("k", 3)
+        kwargs["orientation"] = kwargs.get("orientation", "columns")
+        kwargs["print_interval"] = kwargs.get("print_interval", 10)
+        return KNN(**kwargs)
+    elif imputation_method_name == "svd":
+        kwargs["rank"] = kwargs.get("rank", 10)
+        return IterativeSVD(**kwargs)
+    elif imputation_method_name in ("svt", "softimpute"):
+        return SoftImpute(**kwargs)
+    elif imputation_method_name == "mean":
+        return SimpleFill("mean", **kwargs)
+    elif imputation_method_name == "none":
+        return None
+    else:
+        raise ValueError("Invalid imputation method: %s" % imputation_method_name)
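
A usage sketch for this helper (hedged: fancyimpute solvers of this vintage fill in missing entries via a complete() method; newer releases renamed it fit_transform):

import numpy as np
from mhcflurry.imputation import imputer_from_name

# Hypothetical peptide x allele affinity matrix with missing entries
X_incomplete = np.array([
    [0.9, np.nan, 0.2],
    [np.nan, 0.4, 0.3],
    [0.8, 0.5, np.nan],
])

imputer = imputer_from_name("mice", n_imputations=50)  # overrides the default of 25
X_filled = imputer.complete(X_incomplete)  # fit_transform in newer fancyimpute
assert not np.isnan(X_filled).any()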
@@ -53,6 +53,8 @@ from mhcflurry.paths import (
     CLASS1_MODEL_DIRECTORY,
     CLASS1_DATA_DIRECTORY
 )
+from mhcflurry.imputation import imputer_from_name, create_imputed_datasets
+
 CSV_FILENAME = "combined_human_class1_dataset.csv"
 CSV_PATH = join(CLASS1_DATA_DIRECTORY, CSV_FILENAME)
@@ -92,16 +94,24 @@ parser.add_argument(
     nargs="+",
     type=normalize_allele_name)

+parser.add_argument(
+    "--imputation-method",
+    default=None,
+    # argparse applies `type` before checking `choices`, so a tuple of string
+    # choices can never match the converted imputer object; name validation
+    # happens inside imputer_from_name instead
+    type=imputer_from_name,
+    help="Use the given imputation method to generate data for pre-training "
+         "models (one of: mice, knn, softimpute, svd, mean, none)")
+
 # add options for neural network hyperparameters
 parser = add_hyperparameter_arguments_to_parser(parser)

 if __name__ == "__main__":
     args = parser.parse_args()
     print(args)

     if not exists(args.output_dir):
         makedirs(args.output_dir)

-    allele_groups = load_allele_datasets(
+    allele_data_dict = load_allele_datasets(
         filename=args.binding_data_csv,
         peptide_length=9,
+        use_multiple_peptide_lengths=True,
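
Because the new --imputation-method option passes type=imputer_from_name, argparse runs the conversion during parsing, so downstream code receives a constructed imputer object (or None) rather than a string. A minimal standalone sketch of that behavior, assuming mhcflurry and fancyimpute are importable:

import argparse
from mhcflurry.imputation import imputer_from_name

parser = argparse.ArgumentParser()
parser.add_argument(
    "--imputation-method",
    default=None,
    type=imputer_from_name)  # "mice" -> MICE(...) during parse_args

args = parser.parse_args(["--imputation-method", "mice"])
print(type(args.imputation_method).__name__)  # -> MICE

This is also why the converter itself raises ValueError on unknown names: argparse only checks choices after applying type, so the usual choices tuple cannot do the validation here.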
@@ -111,15 +121,21 @@ if __name__ == "__main__":

     # concatenate datasets from all alleles to use for pre-training of
     # allele-specific predictors
-    X_all = np.vstack([group.X_index for group in allele_groups.values()])
-    Y_all = np.concatenate([group.Y for group in allele_groups.values()])
+    X_all = np.vstack([group.X_index for group in allele_data_dict.values()])
+    Y_all = np.concatenate([group.Y for group in allele_data_dict.values()])
     print("Total Dataset size = %d" % len(Y_all))

+    if args.imputation_method is not None:
+        # TODO: use imputed data for training
+        imputed_data_dict = create_imputed_datasets(
+            allele_data_dict,
+            args.imputation_method)
+
     # if user didn't specify alleles then train models for all available alleles
     alleles = args.alleles
     if not alleles:
-        alleles = sorted(allele_groups.keys())
+        alleles = sorted(allele_data_dict.keys())

     for allele_name in alleles:
         allele_name = normalize_allele_name(allele_name)
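
The TODO above leaves the imputed data unused for now. One plausible wiring, shown only as a hedged sketch (make_predictor is hypothetical, and the pre-train-then-fine-tune order is an assumption consistent with the "pre-training" comment, not something this commit implements):

# hypothetical use of imputed data for per-allele pre-training
for allele_name, allele_data in allele_data_dict.items():
    model = make_predictor(allele_name)           # hypothetical model constructor
    imputed = imputed_data_dict.get(allele_name)  # denser than the measured data
    if imputed is not None:
        model.fit(imputed.X_index, imputed.Y, nb_epoch=5)        # pre-train on imputed values
    model.fit(allele_data.X_index, allele_data.Y, nb_epoch=20)   # fine-tune on measurements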
@@ -127,7 +143,7 @@
             print("Skipping allele %s" % (allele_name,))
             continue

-        allele_data = allele_groups[allele_name]
+        allele_data = allele_data_dict[allele_name]
         X = allele_data.X_index
         Y = allele_data.Y
@@ -11,7 +11,7 @@ def test_make_embedding_network():
         activation="tanh",
         embedding_input_dim=3,
         embedding_output_dim=20,
-        learning_rate=0.1)
+        learning_rate=0.05)

     X_negative = np.array([
         [0] * 3,
@@ -33,7 +33,7 @@ def test_make_embedding_network():
     ])
     X_index = np.vstack([X_negative, X_positive])
     Y = np.array([0.0] * len(X_negative) + [1.0] * len(X_positive))
-    nn.fit(X_index, Y, nb_epoch=10)
+    nn.fit(X_index, Y, nb_epoch=20)
     Y_pred = nn.predict(X_index)
     print(Y)
     print(Y_pred)
@@ -46,7 +46,7 @@ def test_make_hotshot_network():
         activation="relu",
         layer_sizes=[4],
         n_amino_acids=2,
-        learning_rate=0.1)
+        learning_rate=0.05)

     X_binary = np.array([
         [True, False, True, False, True, False],
         [True, False, True, False, False, True],
@@ -56,9 +56,16 @@ def test_make_hotshot_network():
         [False, True, True, False, False, True],
     ], dtype=bool)
     Y = np.array([0.0, 0.0, 0.0, 0.0, 1.0, 1.0])
-    nn.fit(X_binary, Y, nb_epoch=10)
+    nn.fit(X_binary, Y, nb_epoch=20)
     Y_pred = nn.predict(X_binary)
     print(Y)
     print(Y_pred)
     for (Y_i, Y_pred_i) in zip(Y, Y_pred):
-        assert abs(Y_i - Y_pred_i) <= 0.25, (Y_i, Y_pred_i)
+        if Y_i:
+            assert Y_pred_i >= 0.6, "Expected higher value than %f" % Y_pred_i
+        else:
+            assert Y_pred_i <= 0.4, "Expected lower value than %f" % Y_pred_i

 if __name__ == "__main__":
     test_make_hotshot_network()
     test_make_embedding_network()