Coverage for src/onorm/quantile.py: 100%
48 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-07 20:22 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-07 20:22 +0000
1"""
2Quantile-based normalization using TDigest for online CDF estimation.
3"""
5import json
6from typing import Any, Dict, List
8import numpy as np
9from fastdigest import TDigest
11from .normalization_base import Normalizer
class QuantileTransformer(Normalizer):
    r"""
    Online quantile-based normalization using marginal CDF estimation.

    Transforms features to follow a uniform distribution on [0, 1] by mapping
    each value to its empirical cumulative distribution function (CDF) value.
    Uses TDigest for efficient online quantile estimation without storing all
    historical data.

    For each feature $i$, the transformation is:

    $$x_{\text{norm},i} = F_i(x_i)$$

    where $F_i$ is the estimated cumulative distribution function for feature $i$.

    Parameters
    ----------
    n_dim : int
        Number of dimensions/features to normalize
    max_centroids : int, default=1000
        Maximum number of centroids for TDigest. Higher values increase precision
        but use more memory. Typically 100-1000 is sufficient.
    output_distribution : str, default='uniform'
        Target output distribution:
        - 'uniform': Output in [0, 1] (raw CDF values)
        - 'normal': Apply inverse normal CDF to get standard normal output

    Attributes
    ----------
    digests : List[TDigest]
        List of TDigest objects for tracking marginal distributions per feature.

    Examples
    --------
    ```{python}
    from onorm import QuantileTransformer
    import numpy as np

    # Create transformer
    qt = QuantileTransformer(n_dim=3)

    # Fit on skewed data
    X = np.random.exponential(scale=2.0, size=(1000, 3))
    for x in X:
        qt.partial_fit(x)

    # Transform maps to uniform [0, 1]
    x_new = np.array([0.5, 1.0, 5.0])
    x_uniform = qt.transform(x_new.copy())  # Roughly 0.22, 0.39, 0.92
    ```

    Notes
    -----
    - Transforms arbitrary distributions to uniform [0, 1]
    - Robust to outliers and heavy-tailed distributions
    - TDigest provides approximate quantiles with bounded memory
    - Transformation is monotonic within each feature
    - Features are transformed independently (marginal CDFs)
    - Values outside the observed range are clipped to [0, 1]
    - ``transform`` overwrites floating-point input arrays in place; pass a
      copy if the original values are still needed

    References
    ----------
    [Computing Extremely Accurate Quantiles Using t-Digests](https://arxiv.org/abs/1902.04023)

    See Also
    --------
    Winsorizer : For robust outlier clipping at quantiles
    StandardScaler : For Gaussian-based normalization
    """

    def __init__(
        self,
        n_dim: int,
        max_centroids: int = 1000,
        output_distribution: str = "uniform",
    ) -> None:
        """
        Configure the transformer and create one empty TDigest per feature.

        Raises
        ------
        ValueError
            If ``output_distribution`` is neither 'uniform' nor 'normal'.
        """
        if output_distribution not in ("uniform", "normal"):
            raise ValueError(
                f"output_distribution must be 'uniform' or 'normal', got '{output_distribution}'"
            )

        self.n_dim = n_dim
        self.max_centroids = max_centroids
        self.output_distribution = output_distribution
        self.reset()

    def partial_fit(self, x: np.ndarray) -> None:
        """
        Update CDF estimates for each feature.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array of shape (n_dim,) representing a new observation.

        Raises
        ------
        ValueError
            If ``x`` does not contain exactly ``n_dim`` values.
        """
        # Validate before touching any digest: a short observation would
        # otherwise update only the leading features, and a long one would
        # update every digest and then fail, leaving them out of sync.
        if len(x) != self.n_dim:
            raise ValueError(
                f"Expected an observation with {self.n_dim} features, got {len(x)}"
            )

        for digest, xi in zip(self.digests, x):
            digest.update(xi.item())

    def transform(self, x: np.ndarray) -> np.ndarray:
        """
        Transform data to target distribution using estimated CDFs.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array of shape (n_dim,) to transform. Floating-point arrays
            are overwritten in place; any other dtype is first converted to a
            new float64 array so the results are not truncated.

        Returns
        -------
        np.ndarray
            Transformed array where each value is mapped to its CDF value.
            If output_distribution='uniform': values in [0, 1]
            If output_distribution='normal': values are standard normal,
            capped at -8.0 / 8.0 when the CDF is exactly 0 / 1.
            This is ``x`` itself when ``x`` has a floating-point dtype.

        Notes
        -----
        Values smaller than all observed data receive CDF ≈ 0.
        Values larger than all observed data receive CDF ≈ 1.
        """
        # Storing fractional CDF values into an integer/bool array would
        # silently truncate them to 0 or 1, so work on a float copy instead.
        if not np.issubdtype(x.dtype, np.floating):
            x = x.astype(np.float64)

        to_normal = self.output_distribution == "normal"
        if to_normal:
            # Imported lazily (once per call, not once per feature) so that
            # scipy is only required when normal output is requested.
            from scipy.stats import norm

        for i in range(self.n_dim):
            # CDF of this feature's value, clipped in case of numerical issues
            cdf_value = np.clip(self.digests[i].cdf(x[i].item()), 0.0, 1.0)

            if not to_normal:
                x[i] = cdf_value
            elif cdf_value <= 0.0:
                # The probit of 0 is -inf; cap at roughly norm.ppf(1e-15)
                x[i] = -8.0
            elif cdf_value >= 1.0:
                # The probit of 1 is +inf; cap at roughly norm.ppf(1 - 1e-15)
                x[i] = 8.0
            else:
                # Keep the argument inside a safe range for scipy
                x[i] = norm.ppf(np.clip(cdf_value, 1e-15, 1 - 1e-15))

        return x

    def reset(self) -> None:
        """
        Reset the transformer to initial state.

        Reinitializes TDigest objects for all features, clearing CDF estimates.
        """
        self.digests: List[TDigest] = [
            TDigest(max_centroids=self.max_centroids) for _ in range(self.n_dim)
        ]

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize the transformer state to a dictionary.

        Returns
        -------
        dict
            Dictionary with JSON-serializable metadata and TDigest states.

        Notes
        -----
        Uses TDigest's native to_dict() method - fully JSON-serializable.
        """
        return {
            "version": "1.0",
            "class": "QuantileTransformer",
            "config": {
                "n_dim": self.n_dim,
                "max_centroids": self.max_centroids,
                "output_distribution": self.output_distribution,
            },
            "state": {"digests": [digest.to_dict() for digest in self.digests]},
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "QuantileTransformer":
        """
        Deserialize a transformer from a dictionary.

        Parameters
        ----------
        data : dict
            Dictionary created by to_dict().

        Returns
        -------
        QuantileTransformer
            Deserialized transformer instance.

        Raises
        ------
        ValueError
            If ``data`` describes a different class, or if the number of
            serialized digests does not match ``n_dim``.
        """
        if data.get("class") != "QuantileTransformer":
            raise ValueError(f"Cannot deserialize {data.get('class')} as QuantileTransformer")

        config = data["config"]
        instance = cls(
            n_dim=config["n_dim"],
            max_centroids=config["max_centroids"],
            output_distribution=config["output_distribution"],
        )

        digest_states = data["state"]["digests"]
        # Fail at load time rather than with an IndexError inside transform()
        if len(digest_states) != instance.n_dim:
            raise ValueError(
                f"Serialized state has {len(digest_states)} digests, "
                f"expected n_dim={instance.n_dim}"
            )
        instance.digests = [TDigest.from_dict(digest_dict) for digest_dict in digest_states]

        return instance

    def to_json(self) -> str:
        """Serialize the transformer to a JSON string."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, json_str: str) -> "QuantileTransformer":
        """Deserialize a transformer from a JSON string."""
        return cls.from_dict(json.loads(json_str))