Skip to content
Snippets Groups Projects
Commit fd2318cd authored by Alex Rubinsteyn
Browse files

update coveralls badge

parent 9fa2c0a7
No related branches found
No related tags found
No related merge requests found
...@@ -131,36 +131,56 @@ def evaluate_predictions( ...@@ -131,36 +131,56 @@ def evaluate_predictions(
return mae, tau, auc, f1_score return mae, tau, auc, f1_score
if __name__ == "__main__": def create_imputation_methods(
args = parser.parse_args() verbose=False,
print(args) clip_imputed_values=False,
knn_print_interval=20,
imputation_methods = { knn_params=[1, 3, 5],
"softImpute": SoftImpute(verbose=args.verbose), softimpute_params=[1, 5, 10],
"svdImpute-5": IterativeSVD(5, verbose=args.verbose), svd_params=[5, 10, 20]):
"svdImpute-10": IterativeSVD(10, verbose=args.verbose), min_value = 0 if clip_imputed_values else None
"svdImpute-20": IterativeSVD(20, verbose=args.verbose), max_value = 1 if clip_imputed_values else None
"similarityWeightedAveraging": SimilarityWeightedAveraging( result_dict = {
orientation="columns",
verbose=args.verbose),
"meanFill": SimpleFill("mean"), "meanFill": SimpleFill("mean"),
"zeroFill": SimpleFill("zero"), "zeroFill": SimpleFill("zero"),
"MICE": MICE( "mice": MICE(
n_burn_in=5, n_burn_in=5,
n_imputations=25, n_imputations=25,
min_value=None if args.normalize_rows or args.normalize_columns else 0, min_value=min_value,
max_value=None if args.normalize_rows or args.normalize_columns else 1, max_value=max_value,
verbose=args.verbose), verbose=verbose),
"knnImpute-3": KNN(3, orientation="columns", verbose=args.verbose, print_interval=20), "similarityWeightedAveraging": SimilarityWeightedAveraging(
"knnImpute-7": KNN(7, orientation="columns", verbose=args.verbose, print_interval=20), orientation="columns",
"knnImpute-15": KNN(15, orientation="columns", verbose=args.verbose, print_interval=20), verbose=verbose),
} }
for threshold in softimpute_params:
result_dict["softImpute-%d" % threshold] = SoftImpute(
threshold,
verbose=verbose,
min_value=min_value,
max_value=max_value)
for rank in svd_params:
result_dict["svdImpute-%d" % rank] = IterativeSVD(
rank,
verbose=verbose,
min_value=min_value,
max_value=max_value)
for k in knn_params:
result_dict["knnImpute-%d" % k] = KNN(
k,
orientation="columns",
verbose=verbose,
print_interval=knn_print_interval)
return result_dict
def load_data(binding_data_csv, max_ic50, only_human=False, min_allele_size=1):
allele_to_peptide_to_affinity = load_allele_dicts( allele_to_peptide_to_affinity = load_allele_dicts(
args.binding_data_csv, binding_data_csv,
max_ic50=args.max_ic50, max_ic50=max_ic50,
only_human=args.only_human, only_human=only_human,
regression_output=True) regression_output=True,
min_allele_size=min_allele_size)
peptide_to_allele_to_affinity = transpose_nested_dictionary( peptide_to_allele_to_affinity = transpose_nested_dictionary(
allele_to_peptide_to_affinity) allele_to_peptide_to_affinity)
n_binding_values = sum( n_binding_values = sum(
...@@ -172,25 +192,40 @@ if __name__ == "__main__": ...@@ -172,25 +192,40 @@ if __name__ == "__main__":
n_binding_values, n_binding_values,
len(allele_to_peptide_to_affinity))) len(allele_to_peptide_to_affinity)))
X, peptide_order, allele_order = \ X, peptide_list, allele_list = \
dense_matrix_from_nested_dictionary(peptide_to_allele_to_affinity) dense_matrix_from_nested_dictionary(peptide_to_allele_to_affinity)
if args.save_incomplete_affinity_matrix:
print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix)
column_names = [None] * len(allele_order)
for (name, position) in allele_order.items():
column_names[position] = name
row_names = [None] * len(peptide_order)
for (name, position) in peptide_order.items():
row_names[position] = name
df = pd.DataFrame(X, columns=column_names, index=row_names)
df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide")
scores = ScoreSet()
missing_mask = np.isnan(X) missing_mask = np.isnan(X)
observed_mask = ~missing_mask observed_mask = ~missing_mask
n_observed_per_peptide = observed_mask.sum(axis=1)
min_observed_per_peptide = n_observed_per_peptide.min()
min_peptide_indices = np.where(
n_observed_per_peptide == min_observed_per_peptide)[0]
print("%d peptides with %d observations" % (
len(min_peptide_indices),
min_observed_per_peptide))
n_observed_per_allele = observed_mask.sum(axis=0)
min_observed_per_allele = n_observed_per_allele.min()
min_allele_indices = np.where(
n_observed_per_allele == min_observed_per_allele)[0]
print("%d alleles with %d observations: %s" % (
len(min_allele_indices),
min_observed_per_allele,
[allele_list[i] for i in min_allele_indices]))
return X, missing_mask, observed_mask, peptide_list, allele_list
def index_counts(indices):
    """
    Count how many times each non-negative integer occurs in `indices`.

    Parameters
    ----------
    indices : 1-d array-like of non-negative integers

    Returns
    -------
    Integer array of length `max(indices) + 1` whose i'th entry is the number
    of occurrences of `i` in `indices`. An empty input yields an empty array.

    Raises
    ------
    ValueError
        If `indices` contains a negative value (raised by `np.bincount`).
    """
    indices = np.asarray(indices)
    if indices.size == 0:
        # nothing to count; avoid calling max() on an empty array
        return np.zeros(0, dtype=int)
    # np.bincount performs the counting loop in native code
    return np.bincount(indices).astype(int, copy=False)
def stratified_cross_validation(X, observed_mask, n_folds=10):
n_observed = observed_mask.sum() n_observed = observed_mask.sum()
(observed_peptide_index, observed_allele_index) = np.where(observed_mask) (observed_peptide_index, observed_allele_index) = np.where(observed_mask)
...@@ -200,18 +235,26 @@ if __name__ == "__main__": ...@@ -200,18 +235,26 @@ if __name__ == "__main__":
assert len(observed_indices) == n_observed assert len(observed_indices) == n_observed
observed_allele_counts = observed_mask.sum(axis=0)
print("# observed per allele: %s" % (observed_allele_counts,))
assert (index_counts(observed_allele_index) == observed_allele_counts).all()
kfold = StratifiedKFold( kfold = StratifiedKFold(
observed_allele_index, observed_allele_index,
n_folds=args.n_folds, n_folds=n_folds,
shuffle=True) shuffle=True)
for fold_idx, (_, indirect_test_indices) in enumerate(kfold): for (_, indirect_test_indices) in kfold:
test_linear_indices = observed_indices[indirect_test_indices] test_linear_indices = observed_indices[indirect_test_indices]
test_coords = np.unravel_index( test_coords = np.unravel_index(
test_linear_indices, test_linear_indices,
dims=observed_mask.shape) dims=observed_mask.shape)
y_true = X[test_coords]
test_allele_counts = index_counts(test_coords[1])
allele_fractions = test_allele_counts / observed_allele_counts.astype(float)
print("Fraction of each allele in this CV fold: %s" % (allele_fractions,))
X_test_vector = X[test_coords]
X_fold = X.copy() X_fold = X.copy()
X_fold[test_coords] = np.nan X_fold[test_coords] = np.nan
...@@ -229,7 +272,34 @@ if __name__ == "__main__": ...@@ -229,7 +272,34 @@ if __name__ == "__main__":
print("Dropping %d empty rows, %d empty columns" % ( print("Dropping %d empty rows, %d empty columns" % (
empty_row_mask.sum(), empty_row_mask.sum(),
empty_col_mask.sum())) empty_col_mask.sum()))
yield (X_fold, ok_mesh, test_coords, X_test_vector)
if __name__ == "__main__":
args = parser.parse_args()
print(args)
# Clip imputed values only when neither row nor column normalization is
# requested; both flags must be checked (rows and columns), mirroring the
# min_value/max_value condition of the MICE configuration this call replaces.
imputation_methods = create_imputation_methods(
    verbose=args.verbose,
    clip_imputed_values=not (args.normalize_rows or args.normalize_columns),
)
print("Imputation methods: %s" % imputation_methods)
X, missing_mask, observed_mask, peptide_list, allele_list = load_data(
binding_data_csv=args.binding_data_csv,
max_ic50=args.max_ic50,
only_human=args.only_human,
min_allele_size=args.n_folds)
if args.save_incomplete_affinity_matrix:
print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix)
df = pd.DataFrame(X, columns=allele_list, index=peptide_list)
df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide")
scores = ScoreSet()
kfold = stratified_cross_validation(
X=X,
observed_mask=observed_mask,
n_folds=args.n_folds)
for fold_idx, (X_fold, ok_mesh, test_coords, X_test_vector) in enumerate(kfold):
X_fold_reduced = X_fold[ok_mesh] X_fold_reduced = X_fold[ok_mesh]
biscaler = BiScaler( biscaler = BiScaler(
scale_rows=args.normalize_rows, scale_rows=args.normalize_rows,
...@@ -248,15 +318,12 @@ if __name__ == "__main__": ...@@ -248,15 +318,12 @@ if __name__ == "__main__":
X_completed = np.zeros_like(X) X_completed = np.zeros_like(X)
X_completed[ok_mesh] = X_completed_reduced X_completed[ok_mesh] = X_completed_reduced
y_pred = X_completed[test_coords] y_pred = X_completed[test_coords]
mae, tau, auc, f1_score = evaluate_predictions( mae, tau, auc, f1_score = evaluate_predictions(
y_true=y_true, y_pred=y_pred, max_ic50=args.max_ic50) y_true=X_test_vector, y_pred=y_pred, max_ic50=args.max_ic50)
scores.add_many( scores.add_many(
method_name, method_name,
mae=mae, mae=mae,
tau=tau, tau=tau,
f1_score=f1_score, f1_score=f1_score,
auc=auc) auc=auc)
scores.to_csv(args.output_file) scores.to_csv(args.output_file)
...@@ -156,7 +156,8 @@ def load_allele_dicts( ...@@ -156,7 +156,8 @@ def load_allele_dicts(
peptide_column_name=None, peptide_column_name=None,
peptide_length_column_name="peptide_length", peptide_length_column_name="peptide_length",
ic50_column_name="meas", ic50_column_name="meas",
only_human=True): only_human=True,
min_allele_size=1):
""" """
Parsing CSV of binding data into dictionary of dictionaries. Parsing CSV of binding data into dictionary of dictionaries.
The outer key is an allele name, the inner key is a peptide sequence, The outer key is an allele name, the inner key is a peptide sequence,
...@@ -187,6 +188,7 @@ def load_allele_dicts( ...@@ -187,6 +188,7 @@ def load_allele_dicts(
} }
for (allele_name, group) for (allele_name, group)
in binding_df.groupby(allele_column_name) in binding_df.groupby(allele_column_name)
if len(group) >= min_allele_size
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment