Coverage for src/onorm/standard.py: 100%
61 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-07 20:22 +0000
1import base64
2import json
3from typing import Any, Dict
5import numpy as np
7from .normalization_base import Normalizer
class StandardScaler(Normalizer):
    r"""
    Online standard scaler for z-score normalization using Welford's algorithm.

    Transforms features to have zero mean and unit variance using a numerically
    stable and memory-efficient online algorithm. Uses Welford's algorithm to
    compute mean and variance incrementally without storing historical
    observations.

    For each feature $i$ at time $t$:

    - Mean update: $\mu_{t,i} = \mu_{t-1,i} + \frac{x_{t,i} - \mu_{t-1,i}}{t}$
    - Variance (Welford's M): $M_{t,i} = M_{t-1,i} + (x_{t,i} - \mu_{t-1,i})(x_{t,i} - \mu_{t,i})$
    - Sample variance: $\sigma^2_{t,i} = \frac{M_{t,i}}{t - \text{ddof}}$
    - Standardization: $z_{t,i} = \frac{x_{t,i} - \mu_{t,i}}{\sigma_{t,i}}$

    Parameters
    ----------
    n_dim : int
        Number of dimensions/features to normalize.
    with_mean : bool, default=True
        If True, center the data by subtracting the mean before scaling.
    with_std : bool, default=True
        If True, scale the data to unit standard deviation.
    ddof : int, default=1
        Degrees of freedom for variance calculation (Bessel's correction).
        - ddof=1 (default) uses sample variance (divide by n-1)
        - ddof=0 uses population variance (divide by n)

    Attributes
    ----------
    n : int
        Number of observations seen so far.
    mean : np.ndarray
        Running mean for each feature, shape (n_dim,).
    M : np.ndarray
        Welford's M statistic for variance calculation, shape (n_dim,).
    variance : np.ndarray
        Computed variance for each feature, shape (n_dim,). This is a property
        that calculates variance as `M / (n - ddof)`.

    Examples
    --------
    ```{python}
    from onorm import StandardScaler
    import numpy as np
    scaler = StandardScaler(n_dim=3)
    X = np.random.normal(loc=5, scale=2, size=(100, 3))
    for x in X:
        scaler.partial_fit(x)
    x_new = np.array([5.0, 5.0, 5.0])
    x_normalized = scaler.transform(x_new)
    # x_normalized will be close to [0, 0, 0] since x_new is near the mean

    # Standardize without mean centering
    scaler2 = StandardScaler(n_dim=2, with_mean=False)

    # Use population variance instead of sample variance
    scaler3 = StandardScaler(n_dim=2, ddof=0)
    ```

    References
    ----------
    [Welford's online algorithm](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm)

    Notes
    -----
    - If fewer than (ddof + 1) observations have been seen, transform returns zeros
    - For features with near-zero variance, only centering is applied to avoid
      division by zero
    """

    def __init__(
        self, n_dim: int, with_mean: bool = True, with_std: bool = True, ddof: int = 1
    ) -> None:
        self.n_dim = n_dim
        self.with_mean = with_mean
        self.with_std = with_std
        self.ddof = ddof
        self.reset()

    def _update_mean(self, x: np.ndarray) -> np.ndarray:
        """
        Update the running mean in place using Welford's algorithm.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array representing a new observation.

        Returns
        -------
        np.ndarray
            The difference between x and the *previous* mean (delta_old),
            which the variance update needs.
        """
        delta = x - self.mean
        self.mean += delta / self.n
        return delta

    def _update_variance(self, x: np.ndarray, delta_old: np.ndarray) -> None:
        """
        Update Welford's M statistic in place.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array representing a new observation.
        delta_old : np.ndarray
            The difference between x and the mean *before* the mean update.
        """
        # M accumulates delta_old * delta_new; this pairing is what makes
        # Welford's algorithm numerically stable.
        delta_new = x - self.mean
        self.M += delta_old * delta_new

    @property
    def variance(self) -> np.ndarray:
        """
        Calculate the variance from Welford's M statistic.

        Returns
        -------
        np.ndarray
            Variance for each feature, shape (n_dim,). If n <= ddof, returns
            zeros (not enough observations for the ddof correction).

        Notes
        -----
        The variance is computed as `M / (n - ddof)`, where ddof is the degrees
        of freedom correction (Bessel's correction).
        """
        if self.n <= self.ddof:
            return np.zeros(self.n_dim)
        return self.M / (self.n - self.ddof)

    def partial_fit(self, x: np.ndarray) -> None:
        """
        Update mean and variance estimates using Welford's algorithm.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array of shape (n_dim,) representing a new observation.
        """
        self.n += 1
        delta_old = self._update_mean(x)
        self._update_variance(x, delta_old)

    def transform(self, x: np.ndarray) -> np.ndarray:
        """
        Standardize features to zero mean and unit variance.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array of shape (n_dim,) to normalize. The input is never
            modified.

        Returns
        -------
        np.ndarray
            Standardized float64 array of shape (n_dim,). If with_mean and
            with_std are both True, features will have approximately mean=0
            and std=1.

        Notes
        -----
        - Returns zeros if n <= ddof (insufficient observations)
        - For constant features (zero variance), only centering is applied
        """
        if self.n <= self.ddof:
            # Not enough observations for a variance estimate
            return np.zeros_like(x, dtype=np.float64)

        # Work in float64: integer inputs would otherwise be silently
        # truncated by the in-place masked division below.
        result = np.asarray(x, dtype=np.float64).copy()

        # Center the data
        if self.with_mean:
            result -= self.mean

        # Scale to unit variance
        if self.with_std:
            std = np.sqrt(self.variance)
            # Avoid division by zero - only scale features with non-zero variance
            mask = std > np.finfo(np.float64).eps
            result[mask] /= std[mask]

        return result

    def reset(self) -> None:
        """
        Reset the scaler to initial state.

        Resets observation count to 0 and reinitializes mean and variance
        statistics to zeros.
        """
        self.n = 0
        self.mean = np.zeros(self.n_dim)
        self.M = np.zeros(self.n_dim)  # Welford's M for variance calculation

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize the scaler state to a dictionary.

        Returns
        -------
        dict
            Dictionary with JSON-serializable metadata and base64-encoded
            float64 arrays.
        """
        return {
            "version": "1.0",
            "class": "StandardScaler",
            "config": {
                "n_dim": self.n_dim,
                "with_mean": self.with_mean,
                "with_std": self.with_std,
                "ddof": self.ddof,
            },
            "state": {
                "n": self.n,
                "mean": base64.b64encode(self.mean.tobytes()).decode("ascii"),
                "M": base64.b64encode(self.M.tobytes()).decode("ascii"),
            },
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "StandardScaler":
        """
        Deserialize a scaler from a dictionary.

        Parameters
        ----------
        data : dict
            Dictionary created by to_dict().

        Returns
        -------
        StandardScaler
            Deserialized scaler instance.

        Raises
        ------
        ValueError
            If the dictionary describes a different class, or the decoded
            state arrays do not match the configured n_dim.
        """
        if data.get("class") != "StandardScaler":
            raise ValueError(f"Cannot deserialize {data.get('class')} as StandardScaler")

        config = data["config"]
        instance = cls(
            n_dim=config["n_dim"],
            with_mean=config["with_mean"],
            with_std=config["with_std"],
            ddof=config["ddof"],
        )

        state = data["state"]
        instance.n = state["n"]
        # np.frombuffer over immutable bytes yields a READ-ONLY array; copy so
        # that partial_fit's in-place updates (mean += ..., M += ...) still work
        # on a deserialized instance.
        mean = np.frombuffer(base64.b64decode(state["mean"]), dtype=np.float64).copy()
        M = np.frombuffer(base64.b64decode(state["M"]), dtype=np.float64).copy()
        if mean.shape != (instance.n_dim,) or M.shape != (instance.n_dim,):
            raise ValueError("Serialized state arrays do not match n_dim")
        instance.mean = mean
        instance.M = M

        return instance

    def to_json(self) -> str:
        """Serialize the scaler to a JSON string."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, json_str: str) -> "StandardScaler":
        """Deserialize a scaler from a JSON string."""
        return cls.from_dict(json.loads(json_str))