Coverage for src/onorm/minmax.py: 100%

41 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-07 20:22 +0000

1import base64 

2import json 

3from typing import Any, Dict 

4 

5import numpy as np 

6 

7from .normalization_base import Normalizer 

8 

9 

class MinMaxScaler(Normalizer):
    r"""
    Online min-max scaler for feature normalization to [0, 1] range.

    Tracks the running minimum and maximum for each feature and scales values
    to the range [0, 1] based on these statistics. The normalization is updated
    incrementally as new observations arrive.

    For each feature $i$ at time $t$, tracks:

    $$\begin{aligned}\text{min}_i &= \min\{x_{1,i}, \ldots, x_{t,i}\}\\
    \text{max}_i &= \max\{x_{1,i}, \ldots, x_{t,i}\}\end{aligned}$$

    And transforms values as:

    $$x_{\text{norm},i} = \frac{x_i - \text{min}_i}{\text{max}_i - \text{min}_i}$$

    Parameters
    ----------
    n_dim : int
        Number of dimensions/features to normalize.

    Attributes
    ----------
    min : np.ndarray
        Running minimum for each feature, shape (n_dim,).
    max : np.ndarray
        Running maximum for each feature, shape (n_dim,).

    Examples
    --------
    ```{python}
    from onorm import MinMaxScaler
    import numpy as np
    scaler = MinMaxScaler(n_dim=3)
    X = np.random.uniform(-5, 5, size=(100, 3))
    for x in X:
        scaler.partial_fit(x)
    x_new = np.array([2.0, -1.0, 3.0])
    x_normalized = scaler.transform(x_new.copy())
    assert np.all((x_normalized >= 0) & (x_normalized <= 1))
    ```

    Notes
    -----
    - If a feature has constant values (min == max), its range is replaced
      by 1 to avoid division by zero, so the transformed value is
      ``x - min`` (0 for the constant value itself). The check is made
      independently for every feature.
    - This scaler is sensitive to outliers since min/max can be heavily
      influenced by extreme values.
    """

    def __init__(self, n_dim: int) -> None:
        self.n_dim = n_dim
        self.reset()

    def _update_min(self, x: np.ndarray) -> None:
        """Update running minimum for each feature."""
        # np.fmin (unlike np.minimum) prefers the non-NaN operand, so a NaN
        # entry in ``x`` leaves the stored minimum untouched.
        self.min = np.fmin(self.min, x)

    def _update_max(self, x: np.ndarray) -> None:
        """Update running maximum for each feature."""
        # np.fmax (unlike np.maximum) prefers the non-NaN operand, so a NaN
        # entry in ``x`` leaves the stored maximum untouched.
        self.max = np.fmax(self.max, x)

    def partial_fit(self, x: np.ndarray) -> None:
        """
        Update the minimum and maximum for each feature.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array of shape (n_dim,) representing a new observation.
        """
        self._update_min(x)
        self._update_max(x)

    def transform(self, x: np.ndarray) -> np.ndarray:
        """
        Transform features to [0, 1] range using current min/max statistics.

        Parameters
        ----------
        x : np.ndarray
            A 1-D array of shape (n_dim,) to normalize.

        Returns
        -------
        np.ndarray
            Normalized array of shape (n_dim,). Values lie in [0, 1] for
            inputs inside the range observed so far; inputs outside that
            range map outside [0, 1].

        Notes
        -----
        If min == max for a feature (constant feature), that feature's range
        is replaced by 1, so the result is ``x - min`` (0 for the constant
        value itself) instead of a division by zero. Other features are
        unaffected.

        Before any call to ``partial_fit`` the statistics are still
        ``+inf``/``-inf`` and the result is not meaningful (NaN for finite
        inputs).
        """
        span = self.max - self.min
        # Guard each feature on its own. A norm over the whole vector would
        # only trigger when *all* features are constant and would let a
        # single zero-range feature produce inf/NaN.
        degenerate = np.abs(span) <= np.finfo(np.float64).eps
        safe_span = np.where(degenerate, 1.0, span)
        return (x - self.min) / safe_span

    def reset(self) -> None:
        """
        Reset the scaler to initial state.

        Reinitializes min to positive infinity and max to negative infinity
        so that the first observation will set both values.
        """
        self.min = np.full(self.n_dim, np.inf)
        self.max = np.full(self.n_dim, -np.inf)

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize the scaler state to a dictionary.

        Returns a dictionary with JSON-serializable metadata and base64-encoded
        numpy arrays for efficient storage and database compatibility.

        Returns
        -------
        dict
            Dictionary with keys:
            - 'version': str, serialization format version
            - 'class': str, class name
            - 'config': dict, configuration parameters
            - 'state': dict, serialized state arrays (base64-encoded raw
              float64 bytes)

        Examples
        --------
        ```{python}
        from onorm import MinMaxScaler
        import numpy as np

        scaler = MinMaxScaler(n_dim=3)
        X = np.random.uniform(-5, 5, size=(100, 3))
        for x in X:
            scaler.partial_fit(x)

        # Serialize
        data = scaler.to_dict()

        # Could save to database
        # db.execute("INSERT INTO models (config, state) VALUES (%s, %s)",
        #            (json.dumps(data['config']), data['state']))
        ```
        """

        def _encode(values: np.ndarray) -> str:
            # from_dict() always decodes as float64, so pin the dtype here;
            # the raw bytes carry no dtype information of their own.
            raw = np.asarray(values, dtype=np.float64).tobytes()
            return base64.b64encode(raw).decode("ascii")

        return {
            "version": "1.0",
            "class": "MinMaxScaler",
            "config": {"n_dim": self.n_dim},
            "state": {
                "min": _encode(self.min),
                "max": _encode(self.max),
            },
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "MinMaxScaler":
        """
        Deserialize a scaler from a dictionary.

        Parameters
        ----------
        data : dict
            Dictionary created by to_dict() containing:
            - 'version': serialization format version
            - 'class': class name (must be 'MinMaxScaler')
            - 'config': configuration parameters
            - 'state': serialized state arrays

        Returns
        -------
        MinMaxScaler
            Deserialized scaler instance with restored state.

        Raises
        ------
        ValueError
            If the class name doesn't match, or a decoded state array does
            not have shape (n_dim,).
        KeyError
            If 'config', 'n_dim', 'state', 'min' or 'max' is missing.

        Examples
        --------
        ```{python}
        from onorm import MinMaxScaler

        # Round-trip through the dictionary form
        data = MinMaxScaler(n_dim=3).to_dict()
        scaler = MinMaxScaler.from_dict(data)
        ```
        """
        if data.get("class") != "MinMaxScaler":
            raise ValueError(f"Cannot deserialize {data.get('class')} as MinMaxScaler")

        # Create instance with config
        config = data["config"]
        instance = cls(n_dim=config["n_dim"])

        # Restore state arrays
        state = data["state"]
        for name in ("min", "max"):
            # np.frombuffer yields a read-only view over the decoded bytes;
            # copy so the restored scaler owns ordinary writable arrays.
            values = np.frombuffer(base64.b64decode(state[name]), dtype=np.float64).copy()
            if values.shape != (instance.n_dim,):
                raise ValueError(
                    f"State array '{name}' has {values.size} values, "
                    f"expected n_dim={instance.n_dim}"
                )
            setattr(instance, name, values)

        return instance

    def to_json(self) -> str:
        """
        Serialize the scaler to a JSON string.

        Returns
        -------
        str
            JSON string representation of the scaler state (the output of
            to_dict(), indented by two spaces).

        Examples
        --------
        ```{python}
        from onorm import MinMaxScaler

        scaler = MinMaxScaler(n_dim=3)
        # ... train scaler ...
        json_str = scaler.to_json()
        ```
        """
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, json_str: str) -> "MinMaxScaler":
        """
        Deserialize a scaler from a JSON string.

        Parameters
        ----------
        json_str : str
            JSON string created by to_json().

        Returns
        -------
        MinMaxScaler
            Deserialized scaler instance.

        Examples
        --------
        ```{python}
        from onorm import MinMaxScaler

        # Round-trip through the JSON form
        json_str = MinMaxScaler(n_dim=3).to_json()
        scaler = MinMaxScaler.from_json(json_str)
        ```
        """
        return cls.from_dict(json.loads(json_str))

261 """ 

262 return cls.from_dict(json.loads(json_str))