Skip to content
Snippets Groups Projects
Commit de6d1864 authored by Alex Rubinsteyn's avatar Alex Rubinsteyn
Browse files

use imputed X, Y, weights in training

parent a11524f2
No related branches found
No related tags found
No related merge requests found
......@@ -21,16 +21,12 @@ from collections import defaultdict
import logging
import numpy as np
from fancyimpute.dictionary_helpers import (
dense_matrix_from_nested_dictionary
)
from fancyimpute import (
KNN,
IterativeSVD,
SimpleFill,
SoftImpute,
MICE
)
from fancyimpute.knn import KNN
from fancyimpute.iterative_svd import IterativeSVD
from fancyimpute.simple_fill import SimpleFill
from fancyimpute.soft_impute import SoftImpute
from fancyimpute.mice import MICE
from fancyimpute.dictionary_helpers import dense_matrix_from_nested_dictionary
from .data import (
create_allele_data_from_peptide_to_ic50_dict,
......
......@@ -53,7 +53,7 @@ from mhcflurry.paths import (
CLASS1_MODEL_DIRECTORY,
CLASS1_DATA_DIRECTORY
)
from mhcflurry.imputation import imputer_from_name, create_imputed_datasets
from mhcflurry.imputation import create_imputed_datasets, imputer_from_name
CSV_FILENAME = "combined_human_class1_dataset.csv"
CSV_PATH = join(CLASS1_DATA_DIRECTORY, CSV_FILENAME)
......@@ -98,7 +98,7 @@ parser.add_argument(
"--imputation-method",
default=None,
choices=("mice", "knn", "softimpute", "svd", "mean"),
type=imputer_from_name,
type=lambda s: s.strip().lower(),
help="Use the given imputation method to generate data for pre-training models")
# add options for neural network hyperparameters
......@@ -125,8 +125,14 @@ if __name__ == "__main__":
Y_all = np.concatenate([group.Y for group in allele_data_dict.values()])
print("Total Dataset size = %d" % len(Y_all))
if args.imputation_method is not None:
# TODO: use imputed data for training
if args.imputation_method is None:
imputer = None
else:
imputer = imputer_from_name(args.imputation_method)
if imputer is None:
imputed_data_dict = {}
else:
imputed_data_dict = create_imputed_datasets(
allele_data_dict,
args.imputation_method)
......@@ -138,17 +144,30 @@ if __name__ == "__main__":
alleles = sorted(allele_data_dict.keys())
for allele_name in alleles:
allele_name = normalize_allele_name(allele_name)
if allele_name.isdigit():
print("Skipping allele %s" % (allele_name,))
continue
allele_data = allele_data_dict[allele_name]
X = allele_data.X_index
Y = allele_data.Y
weights = allele_data.weights
n_allele = len(allele_data.Y)
assert len(X) == n_allele
assert len(weights) == n_allele
if allele_name in imputed_data_dict:
imputed_data = imputed_data_dict[allele_name]
X_pretrain = imputed_data.X_index
Y_pretrain = imputed_data.Y
weights_pretrain = imputed_data.weights
else:
X_pretrain = None
Y_pretrain = None
weights_pretrain = None
# normalize allele name to check if it's just
allele_name = normalize_allele_name(allele_name)
if allele_name.isdigit():
print("Skipping allele %s" % (allele_name,))
continue
print("\n=== Training predictor for %s: %d samples, %d unique" % (
allele_name,
......@@ -189,8 +208,12 @@ if __name__ == "__main__":
remove(hdf_path)
model.fit(
allele_data.X_index,
allele_data.Y,
X=allele_data.X_index,
Y=allele_data.Y,
sample_weights=weights,
X_pretrain=X_pretrain,
Y_pretrain=Y_pretrain,
sample_weights_pretrain=weights_pretrain,
n_training_epochs=args.training_epochs,
verbose=True)
......
0% Loading, or try again.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment