Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import numpy
import pandas
class PercentRankTransform(object):
"""
Transform arbitrary values into percent ranks.
"""
def __init__(self):
self.cdf = None
self.bin_edges = None
def fit(self, values, bins):
"""
Fit the transform using the given values, which are used to
establish percentiles.
"""
assert self.cdf is None
assert self.bin_edges is None
assert len(values) > 0
(hist, self.bin_edges) = numpy.histogram(values, bins=bins)
self.cdf = numpy.ones(len(hist) + 3) * numpy.nan
self.cdf[0] = 0.0
self.cdf[1] = 0.0
self.cdf[-1] = 100.0
numpy.cumsum(hist * 100.0 / numpy.sum(hist), out=self.cdf[2:-1])
assert not numpy.isnan(self.cdf).any()
def transform(self, values):
"""
Return percent ranks (range [0, 100]) for the given values.
"""
assert self.cdf is not None
assert self.bin_edges is not None
indices = numpy.searchsorted(self.bin_edges, values)
result = self.cdf[indices]
assert len(result) == len(values)
return result
def to_series(self):
"""
Serialize the fit to a pandas.Series.
The index on the series gives the bin edges and the valeus give the CDF.
Returns
-------
pandas.Series
"""
return pandas.Series(
self.cdf, index=[numpy.nan] + list(self.bin_edges) + [numpy.nan])
@staticmethod
def from_series(series):
"""
Deseralize a PercentRankTransform the given pandas.Series, as returned
by `to_series()`.
Parameters
----------
series : pandas.Series
Returns
-------
PercentRankTransform
"""
result = PercentRankTransform()
result.cdf = series.values
result.bin_edges = series.index.values[1:-1]
return result