
Deep Learning 32: A hand-written Keras callback that solves the problem of Keras not displaying the learning rate at each epoch in real time
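Keras itself does not print the current learning rate during training, which is inconvenient when `decay` is non-zero and the effective rate changes with every update. The fix is a small `Callback` that queries the optimizer at the end of each epoch. The sketch below is a minimal illustration of the idea (the class name `LearningRateTracker` is illustrative, not part of the Keras API): it assumes the optimizer stores `lr`, `decay` and `iterations` as backend variables, which is exactly how the `optimizers.py` source reproduced further down stores them, and it re-applies the same schedule `lr *= 1. / (1. + decay * iterations)` so the printed value matches what the optimizer actually uses.

from keras.callbacks import Callback
from keras import backend as K


class LearningRateTracker(Callback):
    '''Print the effective learning rate at the end of every epoch.

    Minimal sketch: assumes the optimizer exposes `lr`, `decay` and
    `iterations` as backend variables (true for SGD, RMSprop, Adagrad,
    Adadelta, Adam, Adamax and Nadam in the source reproduced below).
    '''
    def on_epoch_end(self, epoch, logs=None):
        optimizer = self.model.optimizer
        lr = K.get_value(optimizer.lr)
        if hasattr(optimizer, 'decay') and hasattr(optimizer, 'iterations'):
            decay = K.get_value(optimizer.decay)
            # `iterations` counts parameter updates (batches), so this
            # reproduces the schedule lr *= 1. / (1. + decay * iterations)
            # used in get_updates() of the optimizers below.
            iterations = K.get_value(optimizer.iterations)
            lr = lr * (1. / (1. + decay * iterations))
        print('Epoch %d: learning rate = %.6f' % (epoch + 1, lr))

Pass an instance through the `callbacks` argument of `model.fit`, e.g. `model.fit(x, y, nb_epoch=10, callbacks=[LearningRateTracker()])` (`nb_epoch` is the Keras 1.x spelling; use `epochs` on Keras 2). For reference, the Keras `optimizers.py` source in question follows: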

from __future__ import absolute_import
from . import backend as K
from .utils.generic_utils import get_from_module
from six.moves import zip


def clip_norm(g, c, n):
    if c > 0:
        g = K.switch(n >= c, g * c / n, g)
    return g


def optimizer_from_config(config, custom_objects={}):
    all_classes = {
        'sgd': SGD,
        'rmsprop': RMSprop,
        'adagrad': Adagrad,
        'adadelta': Adadelta,
        'adam': Adam,
        'adamax': Adamax,
        'nadam': Nadam,
        'tfoptimizer': TFOptimizer,
    }
    class_name = config['class_name']
    if class_name in custom_objects:
        cls = custom_objects[class_name]
    else:
        if class_name.lower() not in all_classes:
            raise ValueError('Optimizer class not found:', class_name)
        cls = all_classes[class_name.lower()]
    return cls.from_config(config['config'])


class Optimizer(object):
    '''Abstract optimizer base class.

    Note: this is the parent class of all optimizers, not an actual optimizer
    that can be used for training models.

    All Keras optimizers support the following keyword arguments:

        clipnorm: float >= 0. Gradients will be clipped
            when their L2 norm exceeds this value.
        clipvalue: float >= 0. Gradients will be clipped
            when their absolute value exceeds this value.
    '''
    def __init__(self, **kwargs):
        allowed_kwargs = {'clipnorm', 'clipvalue'}
        for k in kwargs:
            if k not in allowed_kwargs:
                raise TypeError('Unexpected keyword argument '
                                'passed to optimizer: ' + str(k))
        self.__dict__.update(kwargs)
        self.updates = []
        self.weights = []

    def get_updates(self, params, constraints, loss):
        raise NotImplementedError

    def get_gradients(self, loss, params):
        grads = K.gradients(loss, params)
        if hasattr(self, 'clipnorm') and self.clipnorm > 0:
            norm = K.sqrt(sum([K.sum(K.square(g)) for g in grads]))
            grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
        if hasattr(self, 'clipvalue') and self.clipvalue > 0:
            grads = [K.clip(g, -self.clipvalue, self.clipvalue) for g in grads]
        return grads

    def set_weights(self, weights):
        '''Sets the weights of the optimizer, from Numpy arrays.

        Should only be called after computing the gradients
        (otherwise the optimizer has no weights).

        # Arguments
            weights: a list of Numpy arrays. The number
                of arrays and their shape must match
                number of the dimensions of the weights
                of the optimizer (i.e. it should match the
                output of `get_weights`).
        '''
        params = self.weights
        weight_value_tuples = []
        param_values = K.batch_get_value(params)
        for pv, p, w in zip(param_values, params, weights):
            if pv.shape != w.shape:
                raise ValueError('Optimizer weight shape ' +
                                 str(pv.shape) +
                                 ' not compatible with '
                                 'provided weight shape ' + str(w.shape))
            weight_value_tuples.append((p, w))
        K.batch_set_value(weight_value_tuples)

    def get_weights(self):
        '''Returns the current weights of the optimizer,
        as a list of numpy arrays.
        '''
        return K.batch_get_value(self.weights)

    def get_config(self):
        config = {}
        if hasattr(self, 'clipnorm'):
            config['clipnorm'] = self.clipnorm
        if hasattr(self, 'clipvalue'):
            config['clipvalue'] = self.clipvalue
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)


class SGD(Optimizer):
    '''Stochastic gradient descent, with support for momentum,
    learning rate decay, and Nesterov momentum.

    # Arguments
        lr: float >= 0. Learning rate.
        momentum: float >= 0. Parameter updates momentum.
        decay: float >= 0. Learning rate decay over each update.
        nesterov: boolean. Whether to apply Nesterov momentum.
    '''
    def __init__(self, lr=0.01, momentum=0., decay=0.,
                 nesterov=False, **kwargs):
        super(SGD, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0.)
        self.lr = K.variable(lr)
        self.momentum = K.variable(momentum)
        self.decay = K.variable(decay)
        self.inital_decay = decay

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = []

        lr = self.lr
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))
            self.updates.append(K.update_add(self.iterations, 1))

        # momentum
        shapes = [K.get_variable_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + moments
        for p, g, m in zip(params, grads, moments):
            v = self.momentum * m - lr * g  # velocity
            self.updates.append(K.update(m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v

            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'momentum': float(K.get_value(self.momentum)),
                  'decay': float(K.get_value(self.decay)),
                  'nesterov': self.nesterov}
        base_config = super(SGD, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class RMSprop(Optimizer):
    '''RMSProp optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values
    (except the learning rate, which can be freely tuned).

    This optimizer is usually a good choice for recurrent
    neural networks.

    # Arguments
        lr: float >= 0. Learning rate.
        rho: float >= 0.
        epsilon: float >= 0. Fuzz factor.
        decay: float >= 0. Learning rate decay over each update.
    '''
    def __init__(self, lr=0.001, rho=0.9, epsilon=1e-8, decay=0.,
                 **kwargs):
        super(RMSprop, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)
        self.rho = K.variable(rho)
        self.decay = K.variable(decay)
        self.inital_decay = decay
        self.iterations = K.variable(0.)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        shapes = [K.get_variable_shape(p) for p in params]
        accumulators = [K.zeros(shape) for shape in shapes]
        self.weights = accumulators
        self.updates = []

        lr = self.lr
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))
            self.updates.append(K.update_add(self.iterations, 1))

        for p, g, a in zip(params, grads, accumulators):
            # update accumulator
            new_a = self.rho * a + (1. - self.rho) * K.square(g)
            self.updates.append(K.update(a, new_a))
            new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'rho': float(K.get_value(self.rho)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(RMSprop, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class Adagrad(Optimizer):
    '''Adagrad optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate.
        epsilon: float >= 0.

    # References
        - [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
    '''
    def __init__(self, lr=0.01, epsilon=1e-8, decay=0., **kwargs):
        super(Adagrad, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)
        self.decay = K.variable(decay)
        self.inital_decay = decay
        self.iterations = K.variable(0.)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        shapes = [K.get_variable_shape(p) for p in params]
        accumulators = [K.zeros(shape) for shape in shapes]
        self.weights = accumulators
        self.updates = []

        lr = self.lr
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))
            self.updates.append(K.update_add(self.iterations, 1))

        for p, g, a in zip(params, grads, accumulators):
            new_a = a + K.square(g)  # update accumulator
            self.updates.append(K.update(a, new_a))
            new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(Adagrad, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class Adadelta(Optimizer):
    '''Adadelta optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate.
            It is recommended to leave it at the default value.
        rho: float >= 0.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701)
    '''
    def __init__(self, lr=1.0, rho=0.95, epsilon=1e-8, decay=0.,
                 **kwargs):
        super(Adadelta, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)
        self.decay = K.variable(decay)
        self.inital_decay = decay
        self.iterations = K.variable(0.)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        shapes = [K.get_variable_shape(p) for p in params]
        accumulators = [K.zeros(shape) for shape in shapes]
        delta_accumulators = [K.zeros(shape) for shape in shapes]
        self.weights = accumulators + delta_accumulators
        self.updates = []

        lr = self.lr
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))
            self.updates.append(K.update_add(self.iterations, 1))

        for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
            # update accumulator
            new_a = self.rho * a + (1. - self.rho) * K.square(g)
            self.updates.append(K.update(a, new_a))

            # use the new accumulator and the *old* delta_accumulator
            update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)

            new_p = p - lr * update
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))

            # update delta_accumulator
            new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
            self.updates.append(K.update(d_a, new_d_a))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'rho': self.rho,
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(Adadelta, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class Adam(Optimizer):
    '''Adam optimizer.

    Default parameters follow those provided in the original paper.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, decay=0., **kwargs):
        super(Adam, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)
        self.decay = K.variable(decay)
        self.inital_decay = decay

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))

        t = self.iterations + 1
        lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t))

        shapes = [K.get_variable_shape(p) for p in params]
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            new_p = p_t
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(Adam, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class Adamax(Optimizer):
    '''Adamax optimizer from Adam paper's Section 7. It is a variant
    of Adam based on the infinity norm.

    Default parameters follow those provided in the paper.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, decay=0., **kwargs):
        super(Adamax, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0.)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)
        self.decay = K.variable(decay)
        self.inital_decay = decay

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))

        t = self.iterations + 1
        lr_t = lr / (1. - K.pow(self.beta_1, t))

        shapes = [K.get_variable_shape(p) for p in params]
        # zero init of 1st moment
        ms = [K.zeros(shape) for shape in shapes]
        # zero init of exponentially weighted infinity norm
        us = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + ms + us

        for p, g, m, u in zip(params, grads, ms, us):

            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            u_t = K.maximum(self.beta_2 * u, K.abs(g))
            p_t = p - lr_t * m_t / (u_t + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(u, u_t))

            new_p = p_t
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(Adamax, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class Nadam(Optimizer):
    '''
    Nesterov Adam optimizer: Much like Adam is essentially RMSprop with momentum,
    Nadam is Adam RMSprop with Nesterov momentum.

    Default parameters follow those provided in the paper.
    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Nadam report](http://cs229.stanford.edu/proj2015/054_report.pdf)
        - [On the importance of initialization and momentum in deep l