Coverage for src/onorm/quantile.py: 100%

48 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-07 20:22 +0000

1""" 

2Quantile-based normalization using TDigest for online CDF estimation. 

3""" 

4 

5import json 

6from typing import Any, Dict, List 

7 

8import numpy as np 

9from fastdigest import TDigest 

10 

11from .normalization_base import Normalizer 

12 

13 

class QuantileTransformer(Normalizer):
    r"""
    Online quantile-based normalization using marginal CDF estimation.

    Transforms features to follow a uniform distribution on [0, 1] by mapping
    each value to its empirical cumulative distribution function (CDF) value.
    Uses TDigest for efficient online quantile estimation without storing all
    historical data.

    For each feature $i$, the transformation is:

    $$x_{\text{norm},i} = F_i(x_i)$$

    where $F_i$ is the estimated cumulative distribution function for feature $i$.

    Parameters
    ----------
    n_dim : int
        Number of dimensions/features to normalize
    max_centroids : int, default=1000
        Maximum number of centroids for TDigest. Higher values increase precision
        but use more memory. Typically 100-1000 is sufficient.
    output_distribution : str, default='uniform'
        Target output distribution:
        - 'uniform': Output in [0, 1] (raw CDF values)
        - 'normal': Apply inverse normal CDF to get standard normal output

    Attributes
    ----------
    digests : List[TDigest]
        List of TDigest objects for tracking marginal distributions per feature.

    Raises
    ------
    ValueError
        If ``output_distribution`` is neither 'uniform' nor 'normal'.

    Examples
    --------
    ```{python}
    from onorm import QuantileTransformer
    import numpy as np

    # Create transformer
    qt = QuantileTransformer(n_dim=3)

    # Fit on skewed data
    X = np.random.exponential(scale=2.0, size=(1000, 3))
    for x in X:
        qt.partial_fit(x)

    # Transform maps to uniform [0, 1].  ``transform`` writes into float
    # arrays in place, so pass a copy to keep ``x_new`` untouched.
    x_new = np.array([0.5, 1.0, 5.0])
    x_uniform = qt.transform(x_new.copy())  # Approximately 0.22, 0.39, 0.92
    ```

    Notes
    -----
    - Transforms arbitrary distributions to uniform [0, 1]
    - Robust to outliers and heavy-tailed distributions
    - TDigest provides approximate quantiles with bounded memory
    - Transformation is monotonic within each feature
    - Features are transformed independently (marginal CDFs)
    - Values outside the observed range are clipped to [0, 1]

    References
    ----------
    [Computing Extremely Accurate Quantiles Using t-Digests](https://arxiv.org/abs/1902.04023)

    See Also
    --------
    Winsorizer : For robust outlier clipping at quantiles
    StandardScaler : For Gaussian-based normalization
    """

    # Output used in 'normal' mode when the estimated CDF is exactly 0 or 1,
    # where the probit function would otherwise return -inf / +inf.
    _PROBIT_LIMIT = 8.0
    # CDF values strictly inside (0, 1) are kept this far away from the ends
    # before the probit function is applied.
    _CDF_EPS = 1e-15

    def __init__(
        self,
        n_dim: int,
        max_centroids: int = 1000,
        output_distribution: str = "uniform",
    ) -> None:
        if output_distribution not in ("uniform", "normal"):
            raise ValueError(
                f"output_distribution must be 'uniform' or 'normal', got '{output_distribution}'"
            )

        self.n_dim = n_dim
        self.max_centroids = max_centroids
        self.output_distribution = output_distribution
        # reset() builds the per-feature digests from the settings stored above.
        self.reset()

    def _check_observation(self, x: Any) -> np.ndarray:
        """
        Return ``x`` as an ndarray after checking it has ``n_dim`` leading entries.

        Raises
        ------
        ValueError
            If the first axis of ``x`` does not have length ``n_dim``.
        """
        # np.asarray leaves an existing ndarray untouched, so in-place
        # semantics of the callers are preserved for array input.
        arr = np.asarray(x)
        n_features = len(arr)
        if n_features != self.n_dim:
            raise ValueError(
                f"Expected an observation with {self.n_dim} features, got {n_features}"
            )
        return arr

    def _clipped_cdf(self, index: int, value: float) -> float:
        """Return the estimated CDF of ``value`` for feature ``index``, clipped to [0, 1]."""
        # NOTE(review): what TDigest.cdf does for a digest that has not seen
        # any data is decided by fastdigest - confirm before relying on it.
        return np.clip(self.digests[index].cdf(value), 0.0, 1.0)

    def partial_fit(self, x: np.ndarray) -> None:
        """
        Update CDF estimates for each feature.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array of shape (n_dim,) representing a new observation.

        Raises
        ------
        ValueError
            If ``x`` does not contain exactly ``n_dim`` entries.
        """
        observation = self._check_observation(x)
        # Pull every scalar out before touching any digest so that a bad
        # entry cannot leave only part of the digests updated.
        values = [feature.item() for feature in observation]
        for digest, value in zip(self.digests, values):
            digest.update(value)

    def transform(self, x: np.ndarray) -> np.ndarray:
        """
        Transform data to target distribution using estimated CDFs.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array of shape (n_dim,) to transform. A floating-point
            array is overwritten in place; integer or boolean input (and
            non-array sequences) are transformed on a converted copy.

        Returns
        -------
        np.ndarray
            Transformed array where each value is mapped to its CDF value.
            If output_distribution='uniform': values in [0, 1]
            If output_distribution='normal': values are standard normal

        Raises
        ------
        ValueError
            If ``x`` does not contain exactly ``n_dim`` entries.

        Notes
        -----
        Values smaller than all observed data receive CDF ≈ 0.
        Values larger than all observed data receive CDF ≈ 1.
        In 'normal' mode a CDF of exactly 0 or 1 is mapped to -8.0 / 8.0
        instead of -inf / +inf.
        """
        out = self._check_observation(x)
        if out.dtype.kind in "biu":
            # Fractional results cannot be stored in a bool/integer array
            # (they would be truncated to 0 or 1), so work on a float copy.
            out = out.astype(np.float64)

        if self.output_distribution == "uniform":
            for i in range(self.n_dim):
                out[i] = self._clipped_cdf(i, out[i].item())
            return out

        # 'normal' output: scipy is only needed on this path, so it is
        # imported here, once per call.
        from scipy.stats import norm

        lower, upper = self._CDF_EPS, 1 - self._CDF_EPS
        for i in range(self.n_dim):
            p = self._clipped_cdf(i, out[i].item())
            if p <= 0.0:
                out[i] = -self._PROBIT_LIMIT  # Roughly norm.ppf(1e-15)
            elif p >= 1.0:
                out[i] = self._PROBIT_LIMIT  # Roughly norm.ppf(1 - 1e-15)
            else:
                # Keep the argument inside the safe range for scipy.
                out[i] = norm.ppf(np.clip(p, lower, upper))

        return out

    def reset(self) -> None:
        """
        Reset the transformer to initial state.

        Reinitializes TDigest objects for all features, clearing CDF estimates.
        """
        self.digests: List[TDigest] = [
            TDigest(max_centroids=self.max_centroids) for _ in range(self.n_dim)
        ]

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize the transformer state to a dictionary.

        Returns
        -------
        dict
            Dictionary with the keys ``"version"``, ``"class"``, ``"config"``
            (constructor arguments) and ``"state"`` (one ``TDigest.to_dict()``
            result per feature under ``"digests"``).

        Notes
        -----
        Uses TDigest's native to_dict() method - fully JSON-serializable.
        """
        return {
            "version": "1.0",
            "class": "QuantileTransformer",
            "config": {
                "n_dim": self.n_dim,
                "max_centroids": self.max_centroids,
                "output_distribution": self.output_distribution,
            },
            "state": {"digests": [digest.to_dict() for digest in self.digests]},
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "QuantileTransformer":
        """
        Deserialize a transformer from a dictionary.

        Parameters
        ----------
        data : dict
            Dictionary created by to_dict().

        Returns
        -------
        QuantileTransformer
            Deserialized transformer instance.

        Raises
        ------
        ValueError
            If ``data["class"]`` is not ``"QuantileTransformer"``, or if the
            number of stored digests differs from ``config["n_dim"]``.
        KeyError
            If ``"config"``, ``"state"`` or one of their required entries is
            missing.
        """
        if data.get("class") != "QuantileTransformer":
            raise ValueError(f"Cannot deserialize {data.get('class')} as QuantileTransformer")

        config = data["config"]
        instance = cls(
            n_dim=config["n_dim"],
            max_centroids=config["max_centroids"],
            output_distribution=config["output_distribution"],
        )

        digest_dicts = data["state"]["digests"]
        # partial_fit/transform index one digest per feature, so a mismatch
        # is rejected here rather than surfacing later as an IndexError.
        if len(digest_dicts) != instance.n_dim:
            raise ValueError(
                f"Expected {instance.n_dim} digests in state, got {len(digest_dicts)}"
            )
        instance.digests = [TDigest.from_dict(digest_dict) for digest_dict in digest_dicts]

        return instance

    def to_json(self) -> str:
        """Serialize the transformer to a JSON string."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, json_str: str) -> "QuantileTransformer":
        """Deserialize a transformer from a JSON string."""
        return cls.from_dict(json.loads(json_str))