From 384dab34b89fdf8baa0066b24c550911d2e8cf3b Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Fri, 20 Sep 2019 14:21:25 -0400
Subject: [PATCH] docs: add training counts, split length options, sanitize image filenames

---
 docs/generate_class1_pan.py | 46 ++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/docs/generate_class1_pan.py b/docs/generate_class1_pan.py
index 080421fe..7e70cada 100644
--- a/docs/generate_class1_pan.py
+++ b/docs/generate_class1_pan.py
@@ -5,6 +5,7 @@ from __future__ import print_function
 import sys
 import argparse
 from collections import OrderedDict, defaultdict
+import os
 from os.path import join, exists
 from os import mkdir
 
@@ -35,7 +36,7 @@ parser.add_argument(
 )
 parser.add_argument(
     "--logo-cutoff",
-    default=0.01,
+    default=0.001,
     type=float,
     help="Fraction of top to use for motifs",
 )
@@ -46,10 +47,18 @@ parser.add_argument(
     help="Fraction of top to use for length distribution",
 )
 parser.add_argument(
-    "--lengths",
+    "--length-distribution-lengths",
+    nargs="+",
+    default=[8, 9, 10, 11, 12, 13, 14, 15],
+    type=int,
+    help="Peptide lengths for length distribution plots",
+)
+parser.add_argument(
+    "--motif-lengths",
+    nargs="+",
     default=[8, 9, 10, 11],
     type=int,
-    help="Peptide lengths",
+    help="Peptide lengths for motif plots",
 )
 parser.add_argument(
     "--out-dir",
@@ -59,6 +68,7 @@ parser.add_argument(
 )
 parser.add_argument(
     "--max-alleles",
+    default=None,
     type=int,
     metavar="N",
     help="Only use N alleles (for testing)",
@@ -70,6 +80,8 @@ def model_info(models_dir):
         join(models_dir, "length_distributions.csv.bz2"))
     frequency_matrices_df = pandas.read_csv(
         join(models_dir, "frequency_matrices.csv.bz2"))
+    train_data_df = pandas.read_csv(
+        join(models_dir, "train_data.csv.bz2"))
 
     distribution = frequency_matrices_df.loc[
         (frequency_matrices_df.cutoff_fraction == 1.0), AMINO_ACIDS
@@ -79,9 +91,13 @@ def model_info(models_dir):
     normalized_frequency_matrices.loc[:, AMINO_ACIDS] = (
             normalized_frequency_matrices[AMINO_ACIDS] / distribution)
 
+    observations_per_allele = (
+        train_data_df.groupby("allele").peptide.nunique().to_dict())
+
     return {
         'length_distributions': length_distributions_df,
         'normalized_frequency_matrices': normalized_frequency_matrices,
+        'observations_per_allele': observations_per_allele,
     }
 
 
@@ -114,9 +130,11 @@ def write_logo(
     )
     pyplot.title("%s %d-mer (%s)" % (allele, length, models_label))
     pyplot.xticks(matrix.index.values)
-    name = "%s_%dmer.%s.png" % (allele, length, models_label)
-    pyplot.savefig(join(out_dir, name))
-    print("Wrote: ", name)
+    name = "%s-%dmer.%s.png" % (
+        allele.replace("*", "-").replace(":", "-"), length, models_label)
+    filename = os.path.abspath(join(out_dir, name))
+    pyplot.savefig(filename)
+    print("Wrote: ", filename)
     fig.clear()
     pyplot.close(fig)
     return name
@@ -140,9 +158,12 @@ def write_length_distribution(
     pyplot.xlabel("")
     pyplot.xticks(rotation=0)
     pyplot.gca().get_legend().remove()
-    name = "%s.lengths.%s.png" % (allele, models_label)
-    pyplot.savefig(join(out_dir, name))
-    print("Wrote: ", name)
+    name = "%s.lengths.%s.png" % (
+        allele.replace("*", "-").replace(":", "-"), models_label)
+
+    filename = os.path.abspath(join(out_dir, name))
+    pyplot.savefig(filename)
+    print("Wrote: ", filename)
     fig.clear()
     pyplot.close(fig)
     return name
@@ -208,7 +229,7 @@ def go(argv):
             length_distribution_image_path = write_length_distribution(
                 length_distributions_df=length_distribution,
                 allele=allele,
-                lengths=args.lengths,
+                lengths=args.length_distribution_lengths,
                 cutoff=args.length_cutoff,
                 out_dir=args.out_dir,
                 models_label=label)
@@ -219,9 +240,12 @@ def go(argv):
                 "*" + (
                     "With mass-spec" if label == "with_mass_spec" else "Affinities only")
                 + "*\n")
+            w("Training observations (unique peptides): %d" % (
+                info['observations_per_allele'].get(allele, 0)))
+            w("\n")
             w(image(length_distribution_image_path))
 
-            for length in args.lengths:
+            for length in args.motif_lengths:
                 w(image(write_logo(
                     normalized_frequency_matrices=normalized_frequency_matrices,
                     allele=allele,
-- 
GitLab