
from keras.src.api_export import keras_export
from keras.src.optimizers import adam
from keras.src.optimizers import optimizer


@keras_export(["keras.optimizers.AdamW"])
class AdamW(adam.Adam):
    """Optimizer that implements the AdamW algorithm.

    AdamW optimization is a stochastic gradient descent method that is based on
    adaptive estimation of first-order and second-order moments with an added
    method to decay weights per the techniques discussed in the paper,
    'Decoupled Weight Decay Regularization' by
    [Loshchilov & Hutter, 2019](https://arxiv.org/abs/1711.05101).
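
    Unlike Adam with an L2 penalty added to the loss, the decay here is
    decoupled from the gradient-based update and applied directly to the
    parameters. Schematically, the update of a parameter `w` looks roughly
    like the simplified sketch below, where `m` and `v` are the running
    first- and second-moment estimates and `t` is the step count (the
    learning rate schedule multiplier from the paper is omitted):

    ```python
    m_hat = m / (1 - beta_1**t)  # bias-corrected first moment
    v_hat = v / (1 - beta_2**t)  # bias-corrected second moment
    w = w - learning_rate * weight_decay * w  # decoupled weight decay
    w = w - learning_rate * m_hat / (sqrt(v_hat) + epsilon)  # Adam step
    ```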

    According to
    [Kingma & Ba, 2014](http://arxiv.org/abs/1412.6980),
    the underlying Adam method is "*computationally
    efficient, has little memory requirement, invariant to diagonal rescaling of
    gradients, and is well suited for problems that are large in terms of
    data/parameters*".
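
    Usage example (a minimal sketch; `model`, `x_train`, and `y_train` are
    placeholders assumed to be defined elsewhere):

    ```python
    import keras

    optimizer = keras.optimizers.AdamW(learning_rate=1e-3, weight_decay=4e-3)
    model.compile(optimizer=optimizer, loss="mse")
    model.fit(x_train, y_train, epochs=5)
    ```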

    Args:
        learning_rate: A float, a
            `keras.optimizers.schedules.LearningRateSchedule` instance, or
            a callable that takes no arguments and returns the actual value to
            use. The learning rate. Defaults to `0.001`.
        beta_1: A float value or a constant float tensor, or a callable
            that takes no arguments and returns the actual value to use. The
            exponential decay rate for the 1st moment estimates.
            Defaults to `0.9`.
        beta_2: A float value or a constant float tensor, or a callable
            that takes no arguments and returns the actual value to use. The
            exponential decay rate for the 2nd moment estimates.
            Defaults to `0.999`.
        epsilon: A small constant for numerical stability. This epsilon is
            "epsilon hat" in the Kingma and Ba paper (in the formula just
            before Section 2.1), not the epsilon in Algorithm 1 of the paper.
            Defaults to `1e-7`.
        amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm
            from the paper "On the Convergence of Adam and beyond".
            Defaults to `False`.
        {{base_optimizer_keyword_args}}

    References:

    - [Loshchilov & Hutter, 2019](https://arxiv.org/abs/1711.05101)
    - [Kingma & Ba, 2014](http://arxiv.org/abs/1412.6980) for `adam`
    - [Reddi et al., 2018](
        https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.
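
    Individual variables can be excluded from the decoupled decay before the
    optimizer is built, assuming your Keras version exposes
    `exclude_from_weight_decay` on the base optimizer (a sketch, not part of
    this class):

    ```python
    optimizer = keras.optimizers.AdamW(learning_rate=1e-3, weight_decay=4e-3)
    # Skip decay for any variable whose name contains "bias".
    optimizer.exclude_from_weight_decay(var_names=["bias"])
    ```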
    """

    def __init__(
        self,
        learning_rate=0.001,
        weight_decay=0.004,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7,
        amsgrad=False,
        clipnorm=None,
        clipvalue=None,
        global_clipnorm=None,
        use_ema=False,
        ema_momentum=0.99,
        ema_overwrite_frequency=None,
        loss_scale_factor=None,
        gradient_accumulation_steps=None,
        name="adamw",
        **kwargs,
    ):
        super().__init__(
            learning_rate=learning_rate,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=epsilon,
            amsgrad=amsgrad,
            name=name,
            weight_decay=weight_decay,
            clipnorm=clipnorm,
            clipvalue=clipvalue,
            global_clipnorm=global_clipnorm,
            use_ema=use_ema,
            ema_momentum=ema_momentum,
            ema_overwrite_frequency=ema_overwrite_frequency,
            loss_scale_factor=loss_scale_factor,
            gradient_accumulation_steps=gradient_accumulation_steps,
            **kwargs,
        )

        # Without a concrete decay value AdamW would silently behave like
        # plain Adam, so `weight_decay=None` is rejected explicitly.
        if self.weight_decay is None:
            raise ValueError(
                "Argument `weight_decay` must be a float. "
                "Received: weight_decay=None"
            )


AdamW.__doc__ = AdamW.__doc__.replace(
    "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args
)