Commit face9647 authored by Tim O'Donnell

New dataset generation strategy for cleavage predictor

parent bfaeaea9
@@ -94,7 +94,8 @@ else
         --hits "$(pwd)/hits_with_tpm.csv.bz2" \
         --predictions "$(mhcflurry-downloads path data_mass_spec_benchmark)/predictions/all.mhcflurry.combined" \
         --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
-        --decoys-per-hit 2 \
+        --ppv-multiplier 100 \
+        --hit-multiplier-to-take 1 \
         --out "$(pwd)/train_data.csv"
     bzip2 -f train_data.csv
 fi
@@ -29,9 +29,9 @@ parser.add_argument(
     required=True,
     help="Proteome peptides")
 parser.add_argument(
-    "--decoys-per-hit",
-    type=int,
-    default=2,
+    "--hit-multiplier-to-take",
+    type=float,
+    default=1,
     help="")
 parser.add_argument(
     "--ppv-multiplier",
@@ -190,35 +190,26 @@ def run():
                 "protein_accession", "peptide", "n_flank", "c_flank"
             ]].drop_duplicates("peptide"))
-        decoys_df = pandas.concat(decoys_df, ignore_index=True, sort=False)
+        merged_df = pandas.concat(
+            [sub_hit_df] + decoys_df, ignore_index=True, sort=False)
         prediction_col = "%s affinity" % sample_table.loc[sample_id].hla
         predictions_df = pandas.DataFrame(
-            index=numpy.concatenate([
-                sub_hit_df.peptide.unique(),
-                decoys_df.peptide.unique()
-            ]),
+            index=merged_df.peptide.unique(),
             columns=[prediction_col])
         load_predictions(args.predictions, result_df=predictions_df)
-        sub_hit_df["affinity_prediction"] = sub_hit_df.peptide.map(
-            predictions_df[prediction_col])
-        decoys_df["affinity_prediction"] = decoys_df.peptide.map(
+        merged_df["affinity_prediction"] = merged_df.peptide.map(
             predictions_df[prediction_col])
-        decoys_df = decoys_df.sort_values("affinity_prediction", ascending=True)
-        sub_decoys_df = decoys_df.head(
-            len(sub_hit_df) * args.decoys_per_hit).copy()
-        sub_decoys_df["hit"] = 0
-        sub_decoys_df["sample_id"] = sample_id
-        sample_result_df = pandas.concat(
-            [sub_hit_df, sub_decoys_df],
-            ignore_index=True,
-            sort=False)[columns_to_keep].sample(frac=1.0)
-        result_df.append(sample_result_df)
+        merged_df = merged_df.sort_values("affinity_prediction", ascending=True)
+        num_to_take = int(len(sub_hit_df) * args.hit_multiplier_to_take)
+        selected_df = merged_df.head(num_to_take)[
+            columns_to_keep
+        ].sample(frac=1.0).copy()
+        selected_df["hit"] = selected_df["hit"].fillna(0)
+        selected_df["sample_id"] = sample_id
+        result_df.append(selected_df)
     result_df = pandas.concat(result_df, ignore_index=True, sort=False)
     result_df["hla"] = result_df.sample_id.map(sample_table.hla)