From 9e3c4c8b3522a3f4a3143bac3d7f39b001622404 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Fri, 6 May 2016 17:00:01 -0400
Subject: [PATCH] moving encoding to dataset

---
 mhcflurry/common.py          |  15 +++
 mhcflurry/data.py            | 223 +----------------------------------
 mhcflurry/dataset.py         |  27 ++++-
 mhcflurry/dataset_helpers.py | 213 +++++++++++++++++++++++++++++++--
 4 files changed, 243 insertions(+), 235 deletions(-)

diff --git a/mhcflurry/common.py b/mhcflurry/common.py
index 5fd34782..0849f238 100644
--- a/mhcflurry/common.py
+++ b/mhcflurry/common.py
@@ -17,6 +17,7 @@ from __future__ import (
     division,
     absolute_import,
 )
+from math import exp, log
 
 def parse_int_list(s):
     return [int(part.strip() for part in s.split(","))]
@@ -64,3 +65,17 @@ def split_allele_names(s):
         for part
         in s.split(",")
     ]
+
+
+def geometric_mean(xs, weights=None):
+    """
+    Geometric mean of a collection of affinity measurements, with optional
+    sample weights.
+    """
+    if len(xs) == 1:
+        return xs[0]
+    elif weights is None:
+        return exp(sum(log(xi) for xi in xs))
+    sum_weighted_log = sum(log(xi) * wi for (xi, wi) in zip(xs, weights))
+    denom = sum(weights)
+    return exp(sum_weighted_log / denom)
diff --git a/mhcflurry/data.py b/mhcflurry/data.py
index 136449af..4792ce04 100644
--- a/mhcflurry/data.py
+++ b/mhcflurry/data.py
@@ -22,14 +22,13 @@ from collections import namedtuple, defaultdict
 import pandas as pd
 import numpy as np
 
-from .common import normalize_allele_name
 from .amino_acid import common_amino_acids
 from .peptide_encoding import (
     indices_to_hotshot_encoding,
     fixed_length_index_encoding,
     check_valid_index_encoding_array,
 )
-from .regression_target import MAX_IC50, ic50_to_regression_target
+
 
 index_encoding = common_amino_acids.index_encoding
 
@@ -50,225 +49,8 @@ AlleleData = namedtuple(
     ])
 
 
-def _infer_csv_separator(filename):
-    """
-    Determine if file is separated by comma, tab, or whitespace.
-    Default to whitespace if the others are not detected.
-
-    Returns (sep, delim_whitespace)
-    """
-    for candidate in [",", "\t"]:
-        with open(filename, "r") as f:
-            for line in f:
-                if line.startswith("#"):
-                    continue
-                if candidate in line:
-                    return candidate, False
-    return None, True
-
-
-def load_dataframe(
-        filename,
-        max_ic50=MAX_IC50,
-        sep=None,
-        species_column_name="species",
-        allele_column_name="mhc",
-        peptide_column_name=None,
-        filter_peptide_length=None,
-        ic50_column_name="meas",
-        only_human=False):
-    """
-    Load a dataframe of peptide-MHC affinity measurements
-
-    filename : str
-        TSV filename with columns:
-            - 'species'
-            - 'mhc'
-            - 'peptide_length'
-            - 'sequence'
-            - 'meas'
-
-    max_ic50 : float
-        Treat IC50 scores above this value as all equally bad
-        (transform them to 0.0 in the regression output)
-
-    sep : str, optional
-        Separator in CSV file, default is to let Pandas infer
-
-    peptide_column_name : str, optional
-        Default behavior is to try  {"sequence", "peptide", "peptide_sequence"}
-
-    filter_peptide_length : int, optional
-        Which length peptides to use (default=load all lengths)
-
-    only_human : bool
-        Only load entries from human MHC alleles
-
-    Returns DataFrame augmented with extra column:
-        - "regression_output" : 1.0 - log(ic50)/log(max_ic50), limited to [0,1]
-    """
-    if sep is None:
-        sep, delim_whitespace = _infer_csv_separator(filename)
-    else:
-        delim_whitespace = False
-    df = pd.read_csv(
-        filename,
-        sep=sep,
-        delim_whitespace=delim_whitespace,
-        engine="c")
-    # hack: get around variability of column naming by checking if
-    # the peptide_column_name is actually present and if not try "peptide"
-    if peptide_column_name is None:
-        columns = set(df.keys())
-        for candidate in ["sequence", "peptide", "peptide_sequence"]:
-            if candidate in columns:
-                peptide_column_name = candidate
-                break
-        if peptide_column_name is None:
-            raise ValueError(
-                "Couldn't find peptide column name, candidates: %s" % (
-                    columns))
-    if only_human:
-        human_mask = df[species_column_name] == "human"
-        df = df[human_mask]
-
-    if filter_peptide_length:
-        length_mask = df[peptide_column_name].str.len() == filter_peptide_length
-        df = df[length_mask]
-
-    df[allele_column_name] = df[allele_column_name].map(normalize_allele_name)
-    ic50 = np.array(df[ic50_column_name])
-    df["regression_output"] = ic50_to_regression_target(ic50, max_ic50=max_ic50)
-    return df, peptide_column_name
-
-
-def load_allele_dicts(
-        filename,
-        max_ic50=MAX_IC50,
-        regression_output=False,
-        sep=None,
-        species_column_name="species",
-        allele_column_name="mhc",
-        peptide_column_name=None,
-        ic50_column_name="meas",
-        only_human=False,
-        min_allele_size=1):
-    """
-    Parsing CSV of binding data into dictionary of dictionaries.
-    The outer key is an allele name, the inner key is a peptide sequence,
-    and the inner value is an IC50 or log-transformed value between [0,1]
-    """
-    binding_df, peptide_column_name = load_dataframe(
-        filename=filename,
-        max_ic50=max_ic50,
-        sep=sep,
-        species_column_name=species_column_name,
-        allele_column_name=allele_column_name,
-        peptide_column_name=peptide_column_name,
-        ic50_column_name=ic50_column_name,
-        only_human=only_human)
-    # map peptides to either the raw IC50 or rescaled log IC50 depending
-    # on the `regression_output` parameter
-    output_column_name = (
-        "regression_output"
-        if regression_output
-        else ic50_column_name
-    )
-    return {
-        allele_name: {
-            peptide: output
-            for (peptide, output)
-            in zip(group[peptide_column_name], group[output_column_name])
-        }
-        for (allele_name, group)
-        in binding_df.groupby(allele_column_name)
-        if len(group) >= min_allele_size
-    }
-
 
-def encode_peptide_to_affinity_dict(
-        peptide_to_affinity_dict,
-        peptide_length=9,
-        flatten_binary_encoding=True,
-        allow_unknown_amino_acids=True):
-    """
-    Given a dictionary mapping from peptide sequences to affinity values, return
-    both index and binary encodings of fixed length peptides, and
-    a vector of their affinities.
 
-    Parameters
-    ----------
-    peptide_to_affinity_dict : dict
-        Keys are peptide strings (of multiple lengths), each mapping to a
-        continuous affinity value.
-
-    peptide_length : int
-        Length of vector encoding
-
-    flatten_binary_encoding : bool
-        Should the binary encoding of a peptide be two-dimensional (9x20)
-        or a flattened 1d vector
-
-    allow_unknown_amino_acids : bool
-        When extending a short vector to the desired peptide length, should
-        we insert every possible amino acid or a designated character "X"
-        indicating an unknown amino acid.
-
-    Returns tuple with the following fields:
-        - kmer_peptides: fixed length peptide strings
-        - original_peptides: variable length peptide strings
-        - counts: how many fixed length peptides were made from this original
-        - X_index: index encoding of fixed length peptides
-        - X_binary: binary encoding of fixed length peptides
-        - Y: affinity values associated with original peptides
-    """
-    raw_peptides = list(sorted(peptide_to_affinity_dict.keys()))
-    X_index, kmer_peptides, original_peptides, counts = \
-        fixed_length_index_encoding(
-            peptides=raw_peptides,
-            desired_length=peptide_length,
-            start_offset_shorten=0,
-            end_offset_shorten=0,
-            start_offset_extend=0,
-            end_offset_extend=0,
-            allow_unknown_amino_acids=allow_unknown_amino_acids)
-
-    n_samples = len(kmer_peptides)
-
-    assert n_samples == len(original_peptides), \
-        "Mismatch between # of samples (%d) and # of peptides (%d)" % (
-            n_samples, len(original_peptides))
-    assert n_samples == len(counts), \
-        "Mismatch between # of samples (%d) and # of counts (%d)" % (
-            n_samples, len(counts))
-    assert n_samples == len(X_index), \
-        "Mismatch between # of sample (%d) and index feature vectors (%d)" % (
-            n_samples, len(X_index))
-    X_index = check_valid_index_encoding_array(X_index, allow_unknown_amino_acids)
-    n_indices = 20 + allow_unknown_amino_acids
-    X_binary = indices_to_hotshot_encoding(
-        X_index,
-        n_indices=n_indices)
-
-    assert X_binary.shape[0] == X_index.shape[0], \
-        ("Mismatch between number of samples for index encoding (%d)"
-         " vs. binary encoding (%d)") % (
-            X_binary.shape[0],
-            X_index.shape[0])
-
-    if flatten_binary_encoding:
-        # collapse 3D input into 2D matrix
-        n_binary_features = peptide_length * n_indices
-        X_binary = X_binary.reshape((n_samples, n_binary_features))
-
-    # easier to work with counts when they're an array instead of list
-    counts = np.array(counts)
-
-    Y = np.array([peptide_to_affinity_dict[p] for p in original_peptides])
-    assert n_samples == len(Y), \
-        "Mismatch between # peptides %d and # regression outputs %d" % (
-            n_samples, len(Y))
-    return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y)
 
 def create_allele_data_from_peptide_to_ic50_dict(
         peptide_to_ic50_dict,
@@ -337,8 +119,7 @@ def load_allele_datasets(
         peptide_column_name=None,
         peptide_length_column_name="peptide_length",
         ic50_column_name="meas",
-        only_human=False,
-        shuffle=True):
+        only_human=False):
     """
     Loads an IEDB dataset, extracts "hot-shot" encoding of fixed length peptides
     and log-transforms the IC50 measurement. Returns dictionary mapping allele
diff --git a/mhcflurry/dataset.py b/mhcflurry/dataset.py
index b8f08bfb..9201053f 100644
--- a/mhcflurry/dataset.py
+++ b/mhcflurry/dataset.py
@@ -23,7 +23,11 @@ from six import string_types
 import pandas as pd
 import numpy as np
 
-from .dataset_helpers import prepare_pMHC_affinity_arrays, geometric_mean
+from .common import geometric_mean
+from .dataset_helpers import (
+    prepare_pMHC_affinity_arrays,
+    load_dataframe
+)
 
 class Dataset(object):
 
@@ -250,6 +254,27 @@ class Dataset(object):
         return cls.from_allele_to_peptide_to_affinity_dictionary(
             {allele_name: peptide_to_affinity_dict})
 
+    @classmethod
+    def from_csv(
+            cls,
+            filename,
+            sep=None,
+            allele_column_name=None,
+            peptide_column_name=None,
+            affinity_column_name=None):
+        df, allele_column_name, peptide_column_name, affinity_column_name = \
+            load_dataframe(
+                filename=filename,
+                sep=sep,
+                allele_column_name=allele_column_name,
+                peptide_column_name=peptide_column_name,
+                affinity_column_name=affinity_column_name)
+        df = df.rename(columns={
+            allele_column_name: "allele",
+            peptide_column_name: "peptide",
+            affinity_column_name: "affinity"})
+        return cls(df)
+
     def get_allele(self, allele_name):
         """
         Get Dataset for a single allele
diff --git a/mhcflurry/dataset_helpers.py b/mhcflurry/dataset_helpers.py
index a16a61c0..379b62d5 100644
--- a/mhcflurry/dataset_helpers.py
+++ b/mhcflurry/dataset_helpers.py
@@ -20,20 +20,14 @@ from __future__ import (
 
 from typechecks import require_instance
 import numpy as np
-from math import exp, log
+import pandas as pd
 
-def geometric_mean(xs, weights=None):
-    """
-    Geometric mean of a collection of affinity measurements, with optional
-    sample weights.
-    """
-    if len(xs) == 1:
-        return xs[0]
-    elif weights is None:
-        return exp(sum(log(xi) for xi in xs))
-    sum_weighted_log = sum(log(xi) * wi for (xi, wi) in zip(xs, weights))
-    denom = sum(weights)
-    return exp(sum_weighted_log / denom)
+from .common import normalize_allele_name
+from .peptide_encoding import (
+    indices_to_hotshot_encoding,
+    fixed_length_index_encoding,
+    check_valid_index_encoding_array,
+)
 
 def check_pMHC_affinity_arrays(alleles, peptides, affinities, sample_weights):
     """
@@ -82,3 +76,196 @@ def prepare_pMHC_affinity_arrays(alleles, peptides, affinities, sample_weights=N
         affinities=affinities,
         sample_weights=sample_weights)
     return alleles, peptides, affinities, sample_weights
+
+
+def infer_csv_separator(filename):
+    """
+    Determine if file is separated by comma, tab, or whitespace.
+    Default to whitespace if the others are not detected.
+
+    Returns (sep, delim_whitespace)
+    """
+    for candidate in [",", "\t"]:
+        with open(filename, "r") as f:
+            for line in f:
+                if line.startswith("#"):
+                    continue
+                if candidate in line:
+                    return candidate, False
+    return None, True
+
+def load_dataframe(
+        filename,
+        sep=None,
+        allele_column_name=None,
+        peptide_column_name=None,
+        affinity_column_name=None,
+        filter_peptide_length=None,
+        normalize_allele_names=True):
+    """
+    Load a dataframe of peptide-MHC affinity measurements
+
+    filename : str
+        TSV filename with columns:
+            - 'species'
+            - 'mhc'
+            - 'peptide_length'
+            - 'sequence'
+            - 'meas'
+
+    sep : str, optional
+        Separator in CSV file, default is to let Pandas infer
+
+    allele_column_name : str, optional
+        Default behavior is to try {"mhc", "allele", "hla"}
+
+    peptide_column_name : str, optional
+        Default behavior is to try  {"sequence", "peptide", "peptide_sequence"}
+
+    affinity_column_name : str, optional
+        Default behavior is to try {"meas", "ic50", "affinity", "aff"}
+
+    filter_peptide_length : int, optional
+        Which length peptides to use (default=load all lengths)
+
+    normalize_allele_names : bool
+        Normalize MHC names or leave them alone
+
+    Returns:
+        - DataFrame
+        - peptide column name
+        - allele column name
+        - affinity column name
+    """
+    if sep is None:
+        sep, delim_whitespace = infer_csv_separator(filename)
+    else:
+        delim_whitespace = False
+
+    df = pd.read_csv(
+        filename,
+        sep=sep,
+        delim_whitespace=delim_whitespace,
+        engine="c")
+
+    columns = set(df.keys())
+
+    if allele_column_name is None:
+        for candidate in ["mhc", "allele", "hla"]:
+            if candidate in columns:
+                allele_column_name = candidate
+                break
+        if allele_column_name is None:
+            raise ValueError(
+                "Couldn't find alleles, available columns: %s" % (
+                    columns,))
+
+    if peptide_column_name is None:
+        for candidate in ["sequence", "peptide", "peptide_sequence"]:
+            if candidate in columns:
+                peptide_column_name = candidate
+                break
+        if peptide_column_name is None:
+            raise ValueError(
+                "Couldn't find peptides, available columns: %s" % (
+                    columns,))
+
+    if affinity_column_name is None:
+        for candidate in ["meas", "ic50", "affinity"]:
+            if candidate in columns:
+                affinity_column_name = candidate
+                break
+        if affinity_column_name is None:
+            raise ValueError(
+                "Couldn't find affinity values, available columns: %s" % (
+                    columns,))
+    if filter_peptide_length:
+        length_mask = df[peptide_column_name].str.len() == filter_peptide_length
+        df = df[length_mask]
+    df[allele_column_name] = df[allele_column_name].map(normalize_allele_name)
+    return df, allele_column_name, peptide_column_name, affinity_column_name
+
+
+def encode_peptide_to_affinity_dict(
+        peptide_to_affinity_dict,
+        peptide_length=9,
+        flatten_binary_encoding=True,
+        allow_unknown_amino_acids=True):
+    """
+    Given a dictionary mapping from peptide sequences to affinity values, return
+    both index and binary encodings of fixed length peptides, and
+    a vector of their affinities.
+
+    Parameters
+    ----------
+    peptide_to_affinity_dict : dict
+        Keys are peptide strings (of multiple lengths), each mapping to a
+        continuous affinity value.
+
+    peptide_length : int
+        Length of vector encoding
+
+    flatten_binary_encoding : bool
+        Should the binary encoding of a peptide be two-dimensional (9x20)
+        or a flattened 1d vector
+
+    allow_unknown_amino_acids : bool
+        When extending a short vector to the desired peptide length, should
+        we insert every possible amino acid or a designated character "X"
+        indicating an unknown amino acid.
+
+    Returns tuple with the following fields:
+        - kmer_peptides: fixed length peptide strings
+        - original_peptides: variable length peptide strings
+        - counts: how many fixed length peptides were made from this original
+        - X_index: index encoding of fixed length peptides
+        - X_binary: binary encoding of fixed length peptides
+        - Y: affinity values associated with original peptides
+    """
+    raw_peptides = list(sorted(peptide_to_affinity_dict.keys()))
+    X_index, kmer_peptides, original_peptides, counts = \
+        fixed_length_index_encoding(
+            peptides=raw_peptides,
+            desired_length=peptide_length,
+            start_offset_shorten=0,
+            end_offset_shorten=0,
+            start_offset_extend=0,
+            end_offset_extend=0,
+            allow_unknown_amino_acids=allow_unknown_amino_acids)
+
+    n_samples = len(kmer_peptides)
+
+    assert n_samples == len(original_peptides), \
+        "Mismatch between # of samples (%d) and # of peptides (%d)" % (
+            n_samples, len(original_peptides))
+    assert n_samples == len(counts), \
+        "Mismatch between # of samples (%d) and # of counts (%d)" % (
+            n_samples, len(counts))
+    assert n_samples == len(X_index), \
+        "Mismatch between # of sample (%d) and index feature vectors (%d)" % (
+            n_samples, len(X_index))
+    X_index = check_valid_index_encoding_array(X_index, allow_unknown_amino_acids)
+    n_indices = 20 + allow_unknown_amino_acids
+    X_binary = indices_to_hotshot_encoding(
+        X_index,
+        n_indices=n_indices)
+
+    assert X_binary.shape[0] == X_index.shape[0], \
+        ("Mismatch between number of samples for index encoding (%d)"
+         " vs. binary encoding (%d)") % (
+            X_binary.shape[0],
+            X_index.shape[0])
+
+    if flatten_binary_encoding:
+        # collapse 3D input into 2D matrix
+        n_binary_features = peptide_length * n_indices
+        X_binary = X_binary.reshape((n_samples, n_binary_features))
+
+    # easier to work with counts when they're an array instead of list
+    counts = np.array(counts)
+
+    Y = np.array([peptide_to_affinity_dict[p] for p in original_peptides])
+    assert n_samples == len(Y), \
+        "Mismatch between # peptides %d and # regression outputs %d" % (
+            n_samples, len(Y))
+    return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y)
-- 
GitLab