Coverage for src/onorm/standard.py: 100%

61 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-07 20:22 +0000

1import base64 

2import json 

3from typing import Any, Dict 

4 

5import numpy as np 

6 

7from .normalization_base import Normalizer 

8 

9 

class StandardScaler(Normalizer):
    r"""
    Online standard scaler for z-score normalization using Welford's algorithm.

    Transforms features to have zero mean and unit variance using a numerically
    stable and memory-efficient online algorithm. Uses Welford's algorithm to
    compute mean and variance incrementally without storing historical observations.

    For each feature $i$ at time $t$:

    - Mean update: $\mu_{t,i} = \mu_{t-1,i} + \frac{x_{t,i} - \mu_{t-1,i}}{t}$
    - Variance (Welford's M): $M_{t,i} = M_{t-1,i} + (x_{t,i} - \mu_{t-1,i})(x_{t,i} - \mu_{t,i})$
    - Sample variance: $\sigma^2_{t,i} = \frac{M_{t,i}}{t - \text{ddof}}$
    - Standardization: $z_{t,i} = \frac{x_{t,i} - \mu_{t,i}}{\sigma_{t,i}}$

    Parameters
    ----------
    n_dim : int
        Number of dimensions/features to normalize.
    with_mean : bool, default=True
        If True, center the data by subtracting the mean before scaling.
    with_std : bool, default=True
        If True, scale the data to unit standard deviation.
    ddof : int, default=1
        Degrees of freedom for variance calculation (Bessel's correction).
        - ddof=1 (default) uses sample variance (divide by n-1)
        - ddof=0 uses population variance (divide by n)

    Attributes
    ----------
    n : int
        Number of observations seen so far.
    mean : np.ndarray
        Running mean for each feature, shape (n_dim,).
    M : np.ndarray
        Welford's M statistic for variance calculation, shape (n_dim,).
    variance : np.ndarray
        Computed variance for each feature, shape (n_dim,). This is a property
        that calculates variance as `M / (n - ddof)`.

    Examples
    --------
    ```{python}
    from onorm import StandardScaler
    import numpy as np
    scaler = StandardScaler(n_dim=3)
    X = np.random.normal(loc=5, scale=2, size=(100, 3))
    for x in X:
        scaler.partial_fit(x)
    x_new = np.array([5.0, 5.0, 5.0])
    x_normalized = scaler.transform(x_new.copy())
    # x_normalized will be close to [0, 0, 0] since x_new is near the mean

    # Standardize without mean centering
    scaler2 = StandardScaler(n_dim=2, with_mean=False)

    # Use population variance instead of sample variance
    scaler3 = StandardScaler(n_dim=2, ddof=0)
    ```

    References
    ----------
    [Welford's online algorithm](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm)

    Notes
    -----
    - If fewer than (ddof + 1) observations have been seen, transform returns zeros
    - For features with near-zero variance, only centering is applied to avoid
      division by zero
    """

    def __init__(
        self, n_dim: int, with_mean: bool = True, with_std: bool = True, ddof: int = 1
    ) -> None:
        self.n_dim = n_dim
        self.with_mean = with_mean
        self.with_std = with_std
        self.ddof = ddof
        # reset() creates the running statistics (n, mean, M).
        self.reset()

    def _update_mean(self, x: np.ndarray) -> np.ndarray:
        """
        Update running mean using Welford's algorithm.

        Must be called after ``self.n`` has been incremented for the new
        observation, because the update divides by ``self.n``.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array representing a new observation.

        Returns
        -------
        np.ndarray
            The difference between x and the previous mean (delta_old).
        """
        delta = x - self.mean
        self.mean += delta / self.n
        return delta

    def _update_variance(self, x: np.ndarray, delta_old: np.ndarray) -> None:
        """
        Update Welford's M statistic for variance calculation.

        Must be called after ``_update_mean`` so that ``self.mean`` already
        includes the new observation.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array representing a new observation.
        delta_old : np.ndarray
            The difference between x and the previous mean.
        """
        delta_new = x - self.mean
        self.M += delta_old * delta_new

    @property
    def variance(self) -> np.ndarray:
        """
        Calculate the variance from Welford's M statistic.

        Returns
        -------
        np.ndarray
            Variance for each feature, shape (n_dim,). If n <= ddof, returns zeros.

        Notes
        -----
        The variance is computed as `M / (n - ddof)`, where ddof is the degrees
        of freedom correction (Bessel's correction).
        """
        if self.n <= self.ddof:
            return np.zeros(self.n_dim)
        return self.M / (self.n - self.ddof)

    def partial_fit(self, x: np.ndarray) -> None:
        """
        Update mean and variance estimates using Welford's algorithm.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array of shape (n_dim,) representing a new observation.
        """
        # Order matters: the count is bumped first, then the mean, then M.
        self.n += 1
        delta_old = self._update_mean(x)
        self._update_variance(x, delta_old)

    def transform(self, x: np.ndarray) -> np.ndarray:
        """
        Standardize features to zero mean and unit variance.

        The input is never modified; the result is always a new float64 array,
        so integer-valued inputs are scaled without being truncated.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array of shape (n_dim,) to normalize.

        Returns
        -------
        np.ndarray
            Standardized array of shape (n_dim,). If with_mean and with_std are
            both True, features will have approximately mean=0 and std=1.

        Notes
        -----
        - Returns zeros if n <= ddof (insufficient observations)
        - For constant features (zero variance), only centering is applied
        """
        if self.n <= self.ddof:
            # Not enough observations for variance estimate
            return np.zeros_like(x, dtype=np.float64)

        # Float64 working copy: keeping an integer dtype here would make the
        # masked assignment below silently truncate the scaled values.
        result = np.array(x, dtype=np.float64)

        # Center the data
        if self.with_mean:
            result = result - self.mean

        # Scale to unit variance
        if self.with_std:
            # Calculate standard deviation from variance
            std = np.sqrt(self.variance)

            # Avoid division by zero - only scale features with non-zero variance
            mask = std > np.finfo(np.float64).eps
            result[mask] = result[mask] / std[mask]

        return result

    def reset(self) -> None:
        """
        Reset the scaler to initial state.

        Resets observation count to 0 and reinitializes mean and variance
        statistics to zeros.
        """
        self.n = 0
        self.mean = np.zeros(self.n_dim)
        self.M = np.zeros(self.n_dim)  # Welford's M for variance calculation

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize the scaler state to a dictionary.

        Arrays are encoded as base64 strings of their raw float64 bytes, which
        is the layout ``from_dict`` decodes.

        Returns
        -------
        dict
            Dictionary with JSON-serializable metadata and base64-encoded arrays.
        """
        # Encode explicitly as float64 so the payload always matches the dtype
        # that from_dict() uses when decoding.
        mean_bytes = np.asarray(self.mean, dtype=np.float64).tobytes()
        m_bytes = np.asarray(self.M, dtype=np.float64).tobytes()
        return {
            "version": "1.0",
            "class": "StandardScaler",
            "config": {
                "n_dim": self.n_dim,
                "with_mean": self.with_mean,
                "with_std": self.with_std,
                "ddof": self.ddof,
            },
            "state": {
                # Plain int keeps the payload JSON-serializable.
                "n": int(self.n),
                "mean": base64.b64encode(mean_bytes).decode("ascii"),
                "M": base64.b64encode(m_bytes).decode("ascii"),
            },
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "StandardScaler":
        """
        Deserialize a scaler from a dictionary.

        The restored scaler is fully usable: its statistics are writable
        arrays, so ``partial_fit`` can continue updating them.

        Parameters
        ----------
        data : dict
            Dictionary created by to_dict().

        Returns
        -------
        StandardScaler
            Deserialized scaler instance.

        Raises
        ------
        ValueError
            If ``data`` describes a different class, or if a stored array does
            not hold the number of values implied by ``n_dim``.
        KeyError
            If a required "config" or "state" entry is missing.
        """
        if data.get("class") != "StandardScaler":
            raise ValueError(f"Cannot deserialize {data.get('class')} as StandardScaler")

        config = data["config"]
        instance = cls(
            n_dim=config["n_dim"],
            with_mean=config["with_mean"],
            with_std=config["with_std"],
            ddof=config["ddof"],
        )

        state = data["state"]
        instance.n = state["n"]

        # The freshly reset statistics define the expected array shape.
        expected_shape = instance.mean.shape
        for attr in ("mean", "M"):
            raw = base64.b64decode(state[attr])
            # np.frombuffer over bytes yields a read-only view; copy it so the
            # in-place updates in partial_fit work. reshape raises ValueError
            # when the payload size does not match n_dim.
            values = np.frombuffer(raw, dtype=np.float64).reshape(expected_shape).copy()
            setattr(instance, attr, values)

        return instance

    def to_json(self) -> str:
        """Serialize the scaler to a JSON string."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, json_str: str) -> "StandardScaler":
        """Deserialize a scaler from a JSON string."""
        return cls.from_dict(json.loads(json_str))