# common.py

# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import (
    print_function,
    division,
    absolute_import,
)
from .amino_acid import amino_acid_letters


def parse_int_list(s):
    """
    Parse a comma-separated string of integers into a list of ints.

    Whitespace around each comma-separated part is ignored, so
    "8, 9,10" yields [8, 9, 10].

    Raises ValueError (from int) if any part is not a valid integer
    literal, including an empty part such as the one in "1,,2".
    """
    # Convert each part on its own; passing the whole generator to int()
    # raises TypeError on every call.
    return [int(part.strip()) for part in s.split(",")]


def split_uppercase_sequences(s):
    """
    Split a comma-separated string into a list of upper-cased entries.

    Each entry has its surrounding whitespace removed before being
    upper-cased. Empty entries are kept as empty strings.
    """
    sequences = []
    for chunk in s.split(","):
        sequences.append(chunk.strip().upper())
    return sequences


def normalize_allele_name(allele_name):
    """
    Reduce an MHC allele name to a compact, upper-cased canonical form.

    The name is upper-cased, a species prefix is chosen ("Mamu-" for names
    starting with "MAMU", "H-2-" for names starting with "H-2" or "H2",
    otherwise no prefix), and then species markers and the separators
    "-", "*" and ":" are stripped from the remainder. For example,
    "HLA-A*02:01" becomes "A0201".

    Only works for mouse, human, and rhesus monkey alleles.

    TODO: use the same logic as mhctools for MHC name parsing.
    Possibly even worth its own small repo called something like "mhcnames"
    """
    name = allele_name.upper()

    # The prefix is decided from the upper-cased input, before any of the
    # species markers are stripped out below.
    prefix = ""
    if name.startswith("MAMU"):
        prefix = "Mamu-"
    elif name.startswith(("H-2", "H2")):
        prefix = "H-2-"

    # Old-style HLA-C serotypes are written "Cw"; note that this rewrites
    # every occurrence of "CW" in the name, not only a leading one.
    name = name.replace("CW", "C")

    # Removals are applied one after another in this order, so "HLA-" and
    # "H-2" are stripped before the bare "-" separator is.
    for token in ("HLA-", "H-2", "H2", "MAMU", "-", "*", ":"):
        name = name.replace(token, "")

    return prefix + name


def split_allele_names(s):
    """
    Split a comma-separated string of allele names and normalize each one.

    Every entry is stripped of surrounding whitespace and then passed
    through normalize_allele_name; the results are returned as a list in
    the same order as the input.
    """
    normalized = []
    for raw_name in s.split(","):
        normalized.append(normalize_allele_name(raw_name.strip()))
    return normalized


def expand_9mer_peptides(peptides, length):
    """
    Convert peptides of the given length into lists of 9mers, following
       Accurate approximation method for prediction of class I MHC
       affinities for peptides of length 8, 10 and 11 using prediction
       tools trained on 9mers.
    by Lundegaard et. al.
    http://bioinformatics.oxfordjournals.org/content/24/11/1397

    Behaviour by length:
      * 9: the input list is returned as-is.
      * 8: every letter of amino_acid_letters is inserted at each of the
        0-based indices 3..7, producing several 9mers per input peptide.
      * above 9: a contiguous run of (length - 9) residues is removed,
        starting at each of the 0-based indices 3..8, producing six
        shortened peptides per input peptide.

    Raises ValueError if length is less than 8. The peptides collection
    is asserted to be non-empty.
    """
    assert len(peptides) > 0

    if length < 8:
        raise ValueError("Invalid peptide length: %d (%s)" % (
            length, peptides[0]))

    if length == 9:
        return peptides

    expanded = []

    if length == 8:
        # grow each 8mer by one residue: try every amino acid at every
        # insertion point between base-1 positions 4-8
        for sequence in peptides:
            for position in range(3, 8):
                for letter in amino_acid_letters:
                    expanded.append(
                        sequence[:position] + letter + sequence[position:])
        return expanded

    # longer peptides: cut out a window of surplus interior residues,
    # sliding the window start from base-1 position 4 onward
    n_surplus = length - 9
    for sequence in peptides:
        for start in range(3, 9):
            expanded.append(sequence[:start] + sequence[start + n_surplus:])
    return expanded