Coverage for src/onorm/standard.py: 100%
61 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-07 20:22 +0000
1import base64
2import json
3from typing import Any, Dict
5import numpy as np
7from .normalization_base import Normalizer
class StandardScaler(Normalizer):
    r"""
    Online standard scaler for z-score normalization using Welford's algorithm.

    Transforms features to have zero mean and unit variance using a numerically
    stable and memory-efficient online algorithm. Uses Welford's algorithm to
    compute mean and variance incrementally without storing historical
    observations.

    For each feature $i$ at time $t$:

    - Mean update: $\mu_{t,i} = \mu_{t-1,i} + \frac{x_{t,i} - \mu_{t-1,i}}{t}$
    - Variance (Welford's M): $M_{t,i} = M_{t-1,i} + (x_{t,i} - \mu_{t-1,i})(x_{t,i} - \mu_{t,i})$
    - Sample variance: $\sigma^2_{t,i} = \frac{M_{t,i}}{t - \text{ddof}}$
    - Standardization: $z_{t,i} = \frac{x_{t,i} - \mu_{t,i}}{\sigma_{t,i}}$

    Parameters
    ----------
    n_dim : int
        Number of dimensions/features to normalize.
    with_mean : bool, default=True
        If True, center the data by subtracting the mean before scaling.
    with_std : bool, default=True
        If True, scale the data to unit standard deviation.
    ddof : int, default=1
        Degrees of freedom for variance calculation (Bessel's correction).
        - ddof=1 (default) uses sample variance (divide by n-1)
        - ddof=0 uses population variance (divide by n)

    Attributes
    ----------
    n : int
        Number of observations seen so far.
    mean : np.ndarray
        Running mean for each feature, shape (n_dim,).
    M : np.ndarray
        Welford's M statistic for variance calculation, shape (n_dim,).
    variance : np.ndarray
        Computed variance for each feature, shape (n_dim,). This is a property
        that calculates variance as `M / (n - ddof)`.

    Examples
    --------
    ```{python}
    from onorm import StandardScaler
    import numpy as np
    scaler = StandardScaler(n_dim=3)
    X = np.random.normal(loc=5, scale=2, size=(100, 3))
    for x in X:
        scaler.partial_fit(x)
    x_new = np.array([5.0, 5.0, 5.0])
    x_normalized = scaler.transform(x_new)
    # x_normalized will be close to [0, 0, 0] since x_new is near the mean

    # Standardize without mean centering
    scaler2 = StandardScaler(n_dim=2, with_mean=False)

    # Use population variance instead of sample variance
    scaler3 = StandardScaler(n_dim=2, ddof=0)
    ```

    References
    ----------
    [Welford's online algorithm](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm)

    Notes
    -----
    - If fewer than (ddof + 1) observations have been seen, transform returns zeros
    - For features with near-zero variance, only centering is applied to avoid
      division by zero
    """

    def __init__(
        self, n_dim: int, with_mean: bool = True, with_std: bool = True, ddof: int = 1
    ) -> None:
        self.n_dim = n_dim
        self.with_mean = with_mean
        self.with_std = with_std
        self.ddof = ddof
        self.reset()

    def _update_mean(self, x: np.ndarray) -> np.ndarray:
        """
        Update the running mean in place using Welford's algorithm.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array representing a new observation.

        Returns
        -------
        np.ndarray
            The difference between x and the *previous* mean (delta_old),
            which the variance update needs.
        """
        delta = x - self.mean
        self.mean += delta / self.n
        return delta

    def _update_variance(self, x: np.ndarray, delta_old: np.ndarray) -> None:
        """
        Update Welford's M statistic in place.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array representing a new observation.
        delta_old : np.ndarray
            The difference between x and the mean *before* the mean update.
        """
        # M accumulates delta_old * delta_new; this pairing is what makes
        # Welford's algorithm numerically stable.
        delta_new = x - self.mean
        self.M += delta_old * delta_new

    @property
    def variance(self) -> np.ndarray:
        """
        Calculate the variance from Welford's M statistic.

        Returns
        -------
        np.ndarray
            Variance for each feature, shape (n_dim,). If n <= ddof, returns
            zeros (not enough observations for the ddof correction).

        Notes
        -----
        The variance is computed as `M / (n - ddof)`, where ddof is the degrees
        of freedom correction (Bessel's correction).
        """
        if self.n <= self.ddof:
            return np.zeros(self.n_dim)
        return self.M / (self.n - self.ddof)

    def partial_fit(self, x: np.ndarray) -> None:
        """
        Update mean and variance estimates using Welford's algorithm.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array of shape (n_dim,) representing a new observation.
        """
        self.n += 1
        delta_old = self._update_mean(x)
        self._update_variance(x, delta_old)

    def transform(self, x: np.ndarray) -> np.ndarray:
        """
        Standardize features to zero mean and unit variance.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array of shape (n_dim,) to normalize. The input is never
            modified.

        Returns
        -------
        np.ndarray
            Standardized float64 array of shape (n_dim,). If with_mean and
            with_std are both True, features will have approximately mean=0
            and std=1.

        Notes
        -----
        - Returns zeros if n <= ddof (insufficient observations)
        - For constant features (zero variance), only centering is applied
        """
        if self.n <= self.ddof:
            # Not enough observations for a variance estimate
            return np.zeros_like(x, dtype=np.float64)

        # Work in float64: integer inputs would otherwise be silently
        # truncated by the in-place masked division below.
        result = np.asarray(x, dtype=np.float64).copy()

        # Center the data
        if self.with_mean:
            result -= self.mean

        # Scale to unit variance
        if self.with_std:
            std = np.sqrt(self.variance)
            # Avoid division by zero - only scale features with non-zero variance
            mask = std > np.finfo(np.float64).eps
            result[mask] /= std[mask]

        return result

    def reset(self) -> None:
        """
        Reset the scaler to initial state.

        Resets observation count to 0 and reinitializes mean and variance
        statistics to zeros.
        """
        self.n = 0
        self.mean = np.zeros(self.n_dim)
        self.M = np.zeros(self.n_dim)  # Welford's M for variance calculation

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize the scaler state to a dictionary.

        Returns
        -------
        dict
            Dictionary with JSON-serializable metadata and base64-encoded
            float64 arrays.
        """
        return {
            "version": "1.0",
            "class": "StandardScaler",
            "config": {
                "n_dim": self.n_dim,
                "with_mean": self.with_mean,
                "with_std": self.with_std,
                "ddof": self.ddof,
            },
            "state": {
                "n": self.n,
                "mean": base64.b64encode(self.mean.tobytes()).decode("ascii"),
                "M": base64.b64encode(self.M.tobytes()).decode("ascii"),
            },
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "StandardScaler":
        """
        Deserialize a scaler from a dictionary.

        Parameters
        ----------
        data : dict
            Dictionary created by to_dict().

        Returns
        -------
        StandardScaler
            Deserialized scaler instance.

        Raises
        ------
        ValueError
            If the dictionary describes a different class, or the decoded
            state arrays do not match the configured n_dim.
        """
        if data.get("class") != "StandardScaler":
            raise ValueError(f"Cannot deserialize {data.get('class')} as StandardScaler")

        config = data["config"]
        instance = cls(
            n_dim=config["n_dim"],
            with_mean=config["with_mean"],
            with_std=config["with_std"],
            ddof=config["ddof"],
        )

        state = data["state"]
        instance.n = state["n"]
        # np.frombuffer over immutable bytes yields a READ-ONLY array; copy so
        # that partial_fit's in-place updates (mean += ..., M += ...) still work
        # on a deserialized instance.
        mean = np.frombuffer(base64.b64decode(state["mean"]), dtype=np.float64).copy()
        M = np.frombuffer(base64.b64decode(state["M"]), dtype=np.float64).copy()
        if mean.shape != (instance.n_dim,) or M.shape != (instance.n_dim,):
            raise ValueError("Serialized state arrays do not match n_dim")
        instance.mean = mean
        instance.M = M

        return instance

    def to_json(self) -> str:
        """Serialize the scaler to a JSON string."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, json_str: str) -> "StandardScaler":
        """Deserialize a scaler from a JSON string."""
        return cls.from_dict(json.loads(json_str))