""" Make training data by selecting decoys, etc. """ import sys import argparse import os import numpy import pandas import tqdm import mhcflurry parser = argparse.ArgumentParser(usage=__doc__) parser.add_argument( "--hits", metavar="CSV", required=True, help="Multiallelic mass spec") parser.add_argument( "--predictions", metavar="CSV", required=True, help="Predictions data") parser.add_argument( "--proteome-peptides", metavar="CSV", required=True, help="Proteome peptides") parser.add_argument( "--hit-multiplier-to-take", type=float, default=1, help="") parser.add_argument( "--ppv-multiplier", type=int, metavar="N", default=1000, help="Take top 1/N predictions.") parser.add_argument( "--exclude-contig", help="Exclude entries annotated to the given contig") parser.add_argument( "--out", metavar="CSV", required=True, help="File to write") parser.add_argument( "--alleles", nargs="+", help="Include only the specified alleles") def load_predictions(dirname, result_df=None, columns=None): peptides = pandas.read_csv( os.path.join(dirname, "peptides.csv")).peptide manifest_df = pandas.read_csv(os.path.join(dirname, "alleles.csv")) print( "Loading results. Existing data has", len(peptides), "peptides and", len(manifest_df), "columns") if columns is None: columns = manifest_df.col.values if result_df is None: result_df = pandas.DataFrame( index=peptides, columns=columns, dtype="float32") result_df[:] = numpy.nan peptides_to_assign = peptides mask = None else: mask = (peptides.isin(result_df.index)).values peptides_to_assign = peptides[mask] manifest_df = manifest_df.loc[manifest_df.col.isin(result_df.columns)] for _, row in manifest_df.iterrows(): with open(os.path.join(dirname, row.path), "rb") as fd: value = numpy.load(fd)['arr_0'] if mask is not None: value = value[mask] result_df.loc[peptides_to_assign, row.col] = value return result_df def run(): args = parser.parse_args(sys.argv[1:]) hit_df = pandas.read_csv(args.hits) numpy.testing.assert_equal(hit_df.hit_id.nunique(), len(hit_df)) hit_df = hit_df.loc[ (hit_df.mhc_class == "I") & (hit_df.peptide.str.len() <= 11) & (hit_df.peptide.str.len() >= 8) & (~hit_df.protein_ensembl.isnull()) & (hit_df.peptide.str.match("^[%s]+$" % "".join( mhcflurry.amino_acid.COMMON_AMINO_ACIDS))) ] print("Loaded hits from %d samples" % hit_df.sample_id.nunique()) hit_df = hit_df.loc[hit_df.format == "MONOALLELIC"].copy() print("Subselected to %d monoallelic samples" % hit_df.sample_id.nunique()) hit_df["allele"] = hit_df.hla hit_df = hit_df.loc[hit_df.allele.str.match("^HLA-[ABC]")] print("Subselected to %d HLA-A/B/C samples" % hit_df.sample_id.nunique()) if args.exclude_contig: new_hit_df = hit_df.loc[ hit_df.protein_primary_ensembl_contig.astype(str) != args.exclude_contig ] print( "Excluding contig", args.exclude_contig, "reduced dataset from", len(hit_df), "to", len(new_hit_df)) hit_df = new_hit_df.copy() if args.alleles: filter_alleles = set(args.alleles) new_hit_df = hit_df.loc[ hit_df.allele.isin(filter_alleles) ] print( "Selecting alleles", args.alleles, "reduced dataset from", len(hit_df), "to", len(new_hit_df)) hit_df = new_hit_df.copy() sample_table = hit_df.drop_duplicates("sample_id").set_index("sample_id") grouped = hit_df.groupby("sample_id").nunique() for col in sample_table.columns: if (grouped[col] > 1).any(): del sample_table[col] sample_table["total_hits"] = hit_df.groupby("sample_id").peptide.nunique() print("Loading proteome peptides") all_peptides_df = pandas.read_csv(args.proteome_peptides) print("Loaded: ", all_peptides_df.shape) 
    # Restrict the decoy universe to proteins that contributed hits and
    # to peptides composed of standard amino acids.
    all_peptides_df = all_peptides_df.loc[
        all_peptides_df.protein_accession.isin(
            hit_df.protein_accession.unique()) &
        all_peptides_df.peptide.str.match("^[%s]+$" % "".join(
            mhcflurry.amino_acid.COMMON_AMINO_ACIDS))
    ].copy()
    all_peptides_df["length"] = all_peptides_df.peptide.str.len()
    print("Subselected proteome peptides by accession: ", all_peptides_df.shape)

    all_peptides_by_length = dict(iter(all_peptides_df.groupby("length")))

    columns_to_keep = [
        "hit_id",
        "protein_accession",
        "n_flank",
        "c_flank",
        "peptide",
        "sample_id",
        "affinity_prediction",
        "hit",
    ]

    print("Selecting decoys.")

    lengths = [8, 9, 10, 11]
    result_df = []
    for (sample_id, sub_hit_df) in tqdm.tqdm(
            hit_df.groupby("sample_id"), total=hit_df.sample_id.nunique()):
        sub_hit_df = sub_hit_df.copy()
        sub_hit_df["hit"] = 1

        # Sample ppv_multiplier decoys per hit, split evenly across the
        # lengths, drawn from the hit proteins and excluding hit peptides.
        decoys_df = []
        for length in lengths:
            universe = all_peptides_by_length[length]
            decoys_df.append(
                universe.loc[
                    (~universe.peptide.isin(sub_hit_df.peptide.unique())) &
                    (universe.protein_accession.isin(
                        sub_hit_df.protein_accession.unique()))
                ].sample(
                    n=int(len(sub_hit_df) * args.ppv_multiplier / len(lengths))
                )[[
                    "protein_accession", "peptide", "n_flank", "c_flank"
                ]].drop_duplicates("peptide"))

        merged_df = pandas.concat(
            [sub_hit_df] + decoys_df, ignore_index=True, sort=False)

        # Look up affinity predictions for this sample's allele.
        prediction_col = "%s affinity" % sample_table.loc[sample_id].hla
        predictions_df = pandas.DataFrame(
            index=merged_df.peptide.unique(),
            columns=[prediction_col])
        load_predictions(args.predictions, result_df=predictions_df)
        merged_df["affinity_prediction"] = merged_df.peptide.map(
            predictions_df[prediction_col])

        # Keep the strongest-binding (num hits * multiplier) entries and
        # shuffle them.
        merged_df = merged_df.sort_values(
            "affinity_prediction", ascending=True)
        num_to_take = int(len(sub_hit_df) * args.hit_multiplier_to_take)
        selected_df = merged_df.head(num_to_take)[
            columns_to_keep
        ].sample(frac=1.0).copy()
        selected_df["hit"] = selected_df["hit"].fillna(0)
        selected_df["sample_id"] = sample_id
        result_df.append(selected_df)
        print(
            "Processed sample",
            sample_id,
            "with hit and decoys:\n",
            selected_df.hit.value_counts())

    result_df = pandas.concat(result_df, ignore_index=True, sort=False)
    result_df["hla"] = result_df.sample_id.map(sample_table.hla)

    print(result_df)
    print("Counts:")
    print(result_df.groupby(["sample_id", "hit"]).peptide.nunique())
    print("Hit counts:")
    print(
        result_df.loc[
            result_df.hit == 1
        ].groupby("sample_id").hit.count().sort_values())
    print("Hit rates:")
    print(result_df.groupby("sample_id").hit.mean().sort_values())

    result_df.to_csv(args.out, index=False)
    print("Wrote: ", args.out)


if __name__ == '__main__':
    run()
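
# Example invocation, as a sketch: the script and file names below are
# hypothetical placeholders, not paths shipped with this code. --hits,
# --predictions, --proteome-peptides, and --out are required; the rest
# are optional.
#
#   python make_train_data.py \
#       --hits hits.csv \
#       --predictions predictions_dir/ \
#       --proteome-peptides proteome_peptides.csv \
#       --ppv-multiplier 1000 \
#       --alleles "HLA-A*02:01" \
#       --out train_data.csv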