# (web blame-view navigation residue, not part of the module: "Newer" / "Older")
from sklearn.model_selection import StratifiedKFold
import pandas
# (web blame-view residue, not part of the module: "Tim O'Donnell committed")
from .affinity_measurement_dataset import AffinityMeasurementDataset
from .imputation_helpers import imputer_from_name
# Columns that every MeasurementCollection dataframe carries, in the order
# they are selected by the constructor.
COLUMNS = [
    "allele",
    "peptide",
    "measurement_type",    # validated against MEASUREMENT_TYPES
    "measurement_source",  # presumably one of MEASUREMENT_SOURCES; not checked
    "measurement_value",
    "weight",
]

# Recognized values for the "measurement_type" column.
MEASUREMENT_TYPES = ["affinity", "ms_hit"]

# Values used for the "measurement_source" column.
MEASUREMENT_SOURCES = [
    "in_vitro_affinity_assay", "imputed",
    "ms_hit", "ms_decoy",
]
class MeasurementCollection(object):
    """
    A measurement collection is a set of observations for allele/peptide
    pairs. A single measurement collection may have both MS hits and affinity
    measurements.

    This is more general than an AffinityMeasurementDataset since it supports
    MS hits. It is also simpler, as the user is expected to manipulate the
    underlying dataframe (the ``df`` attribute).

    Later we may want to retire AffinityMeasurementDataset or combine it with
    this class.
    """

    def __init__(self, df, check=True):
        """
        Parameters
        -----------
        df : pandas.DataFrame
            Must contain every column listed in COLUMNS. Only those columns
            are kept; any others are dropped.

        check : bool
            If True, assert that all required columns are present and that
            every value of the measurement_type column is one of
            MEASUREMENT_TYPES.
        """
        if check:
            for col in COLUMNS:
                assert col in df.columns, col
            for measurement_type in df.measurement_type.unique():
                assert measurement_type in MEASUREMENT_TYPES, measurement_type
        self.df = df[COLUMNS]

    @staticmethod
    def from_dataset(dataset):
        """
        Given an AffinityMeasurementDataset, return a MeasurementCollection.

        Every observation is labeled with measurement type "affinity" and
        measurement source "in_vitro_affinity_assay".
        """
        dataset_df = dataset.to_dataframe()
        df = dataset_df.reset_index(drop=True)[["allele", "peptide"]].copy()
        df["measurement_type"] = "affinity"
        df["measurement_source"] = "in_vitro_affinity_assay"
        # .values discards the source index, so these assignments are
        # positional and line up with the freshly reset index of df.
        df["measurement_value"] = dataset_df.affinity.values
        df["weight"] = dataset_df.sample_weight.values
        return MeasurementCollection(df)

    def select_measurement_type(self, kind):
        """
        Return a new MeasurementCollection containing only measurements of the
        given type.

        Parameters
        -----------
        kind : string
            "affinity" or "ms_hit"

        Returns
        -----------
        MeasurementCollection instance

        Raises
        -----------
        ValueError if kind is not one of MEASUREMENT_TYPES.
        """
        if kind not in MEASUREMENT_TYPES:
            raise ValueError(
                "Unknown measurement type: %s. Supported types: %s" % (
                    kind, ", ".join(MEASUREMENT_TYPES)))
        # .loc with a boolean mask replaces DataFrame.ix, which no longer
        # exists in current pandas.
        return MeasurementCollection(
            self.df.loc[self.df.measurement_type == kind],
            check=False)

    # NOTE(review): the header of this method was missing from the reviewed
    # text; the name and signature are reconstructed from the body and from
    # the sibling select_measurement_type -- confirm against callers.
    def select_allele(self, allele):
        """
        Return a new MeasurementCollection containing only observations for the
        specified allele.

        The collection must be non-empty and must contain the allele;
        otherwise an AssertionError is raised.
        """
        assert isinstance(allele, str), type(allele)
        assert len(self.df) > 0
        alleles = set(self.df.allele.unique())
        assert allele in alleles, "%s not in %s" % (allele, alleles)
        return MeasurementCollection(
            self.df.loc[self.df.allele == allele],
            check=False)

    def half_splits(self, num, random_state=None):
        """
        Split the MeasurementCollection into disjoint pairs of
        MeasurementCollection instances, each containing half the observations.

        Parameters
        -------------
        num : int
            Number of pairs to return

        random_state : int, optional
            Seed for the shuffled split. When given, it is offset by the
            number of pairs generated so far, so each new split differs.

        Returns
        -------------
        list of (MeasurementCollection, MeasurementCollection) pairs

        Each pair gives a disjoint train and test split.
        """
        assert num > 0
        results = []
        while True:
            cv = StratifiedKFold(
                n_splits=2,
                shuffle=True,
                random_state=(
                    None if random_state is None
                    else random_state + len(results)))
            # Stratify on allele and measurement type jointly.
            stratification_groups = self.df.allele + self.df.measurement_type
            # Only the first of the two folds is needed: its train/test index
            # arrays already partition the rows into two halves.
            indices1, indices2 = next(
                cv.split(self.df.values, stratification_groups))
            assert len(indices1) > 0
            assert len(indices2) > 0
            mc1 = MeasurementCollection(self.df.iloc[indices1], check=False)
            mc2 = MeasurementCollection(self.df.iloc[indices2], check=False)
            # Each split yields two pairs (the halves in both orders); stop
            # as soon as the requested number has been collected.
            for pair in [(mc1, mc2), (mc2, mc1)]:
                results.append(pair)
                if len(results) == num:
                    return results

    def to_dataset(
            self,
            include_ms=False,
            ms_hit_affinity=1.0,
            ms_decoy_affinity=20000):
        """
        Return an AffinityMeasurementDataset containing the observations in
        the collection.

        Mass-spec data are converted to affinities according to
        ms_hit_affinity and ms_decoy_affinity.

        Parameters
        -------------
        include_ms : bool
            If True then mass spec data is included (all rows are kept);
            otherwise only rows with measurement type "affinity" and
            measurement source "in_vitro_affinity_assay" are kept.

        ms_hit_affinity : float
            nM affinity to assign to mass-spec hits (relevant only if
            include_ms=True)

        ms_decoy_affinity : float
            nM affinity to assign to mass-spec decoys (relevant only if
            include_ms=True)

        Returns
        -------------
        AffinityMeasurementDataset instance
        """
        if include_ms:
            df = self.df
            # Affinity rows keep their value. Any other row is treated as
            # mass-spec: a positive measurement_value counts as a hit,
            # anything else as a decoy. (The previous code read a
            # non-existent "value" column here.)
            affinities = [
                value if measurement_type == "affinity"
                else (ms_hit_affinity if value > 0 else ms_decoy_affinity)
                for (measurement_type, value) in zip(
                    df.measurement_type, df.measurement_value)
            ]
        else:
            df = self.df.loc[
                (self.df.measurement_type == "affinity") &
                (self.df.measurement_source == "in_vitro_affinity_assay")
            ]
            affinities = df.measurement_value
        return AffinityMeasurementDataset(pandas.DataFrame({
            "allele": df.allele,
            "peptide": df.peptide,
            "affinity": affinities,
            "sample_weight": df.weight,
        }))

    def impute(
            self,
            impute_method="mice",
            impute_log_transform=True,
            impute_min_observations_per_peptide=1,
            impute_min_observations_per_allele=1,
            imputer_args=None):
        """
        Return a new MeasurementCollection after applying imputation to
        this collection.

        Only in vitro affinity measurements are passed to the imputer
        (mass-spec rows are dropped first). Every row of the returned
        collection has measurement type "affinity" and measurement source
        "imputed".

        Parameters
        -------------
        impute_method : string
            Name passed to imputer_from_name.

        impute_log_transform : bool
            Forwarded as log_transform to the dataset's imputation routine.

        impute_min_observations_per_peptide : int
        impute_min_observations_per_allele : int
            Forwarded to the dataset's imputation routine.

        imputer_args : dict, optional
            Extra keyword arguments for imputer_from_name.

        Returns
        -------------
        MeasurementCollection instance
        """
        assert len(self.df) > 0
        dataset = self.to_dataset(include_ms=False)
        imputer = imputer_from_name(impute_method, **(imputer_args or {}))
        # NOTE(review): this call was truncated in the reviewed text (the
        # imputer was never passed and the call was never closed). Passing
        # the imputer first and converting the result with to_dataframe()
        # is a reconstruction -- confirm against
        # AffinityMeasurementDataset.impute_missing_values.
        result_df = dataset.impute_missing_values(
            imputer,
            log_transform=impute_log_transform,
            min_observations_per_peptide=impute_min_observations_per_peptide,
            min_observations_per_allele=impute_min_observations_per_allele
        ).to_dataframe()
        result_df["measurement_type"] = "affinity"
        result_df["measurement_source"] = "imputed"
        result_df["measurement_value"] = result_df.affinity
        result_df["weight"] = result_df.sample_weight
        return MeasurementCollection(result_df)