Commit 3531187

Author: Flax Authors
Merge pull request #5056 from chapman20j:just_set_mode
PiperOrigin-RevId: 826148879
2 parents 6418755 + fd0b62d

File tree: 10 files changed, +632 -0 lines changed

examples/lm1b_nnx/models_test.py

Lines changed: 96 additions & 0 deletions
@@ -291,6 +291,102 @@ def test_forward_decode(self):
     for output_nnx, output_linen in zip(outputs_nnx, outputs_linen):
       assert jnp.allclose(output_nnx, output_linen, atol=1e-5)

+  def test_forward_eval_set_mode(self):
+    _, config = get_transformer_config(
+      axis_rules=default.MeshRules(
+        embed='model',
+        mlp='data',
+        kv=None,
+        vocab=None,
+      ),
+      deterministic=True,
+      decode=False,
+    )
+    # Set dropout rates to zero to avoid creating dropout states.
+    config.dropout_rate = 0.0
+    config.attention_dropout_rate = 0.0
+
+    model_nnx = nnx.eval_shape(lambda: TransformerLM(config, rngs=nnx.Rngs(0)))
+    _, params_nnx = nnx.split(model_nnx, nnx.Param)
+
+    model_linen = TransformerLinen(config)
+
+    sample_inputs = random.randint(random.PRNGKey(0), (1, 3), 0, 20)
+    params_linen = model_linen.init(random.key(0), sample_inputs)['params']
+
+    self.transfer_params(config, params_nnx, params_linen)
+    nnx.update(model_nnx, params_nnx)
+
+    det_model = nnx.set_mode(model_nnx, deterministic=True, decode=False)
+    output_nnx = det_model(sample_inputs)
+
+    output_linen: jax.Array = model_linen.apply(
+      {'params': params_linen}, sample_inputs
+    )
+
+    assert jnp.allclose(output_nnx, output_linen, atol=1e-5)
+
+  def test_forward_decode_set_mode(self):
+    batch_size = 2
+
+    _, config = get_transformer_config(
+      axis_rules=default.MeshRules(
+        embed='model',
+        mlp='data',
+        kv=None,
+        vocab=None,
+      ),
+      deterministic=True,
+      decode=True,
+    )
+    # Set dropout rates to zero to avoid creating dropout states.
+    config.dropout_rate = 0.0
+    config.attention_dropout_rate = 0.0
+
+    model_nnx = nnx.eval_shape(lambda: TransformerLM(config, rngs=nnx.Rngs(0)))
+    for _path, m in model_nnx.iter_modules():
+      if isinstance(m, HasCache):
+        input_shape = (batch_size, config.max_len, config.emb_dim)
+        m.init_cache(input_shape, dtype=config.dtype)
+
+    _, params_nnx, cache_nnx = nnx.split(model_nnx, nnx.Param, nnx.Cache)
+
+    model_linen = TransformerLinen(config)
+
+    flax_init_inputs = random.randint(
+      random.PRNGKey(0), (batch_size, config.max_len), 0, config.vocab_size
+    )
+    ar_decode_inputs = random.randint(
+      random.PRNGKey(0), (3, batch_size, 1), 0, config.vocab_size
+    )
+    variables = model_linen.init(random.key(0), flax_init_inputs)
+    params_linen = variables['params']
+    cache_linen = variables['cache']
+
+    self.transfer_params(config, params_nnx, params_linen)
+    self.transfer_cache(config, cache_nnx, cache_linen)
+    nnx.update(model_nnx, params_nnx, cache_nnx)
+    det_model = nnx.set_mode(model_nnx, deterministic=True, decode=True)
+
+    outputs_nnx = []
+    outputs_linen = []
+
+    for inputs in ar_decode_inputs:
+      output_nnx = det_model(inputs)
+      outputs_nnx.append(output_nnx)
+
+    output_linen: jax.Array
+    for inputs in ar_decode_inputs:
+      output_linen, updates = model_linen.apply(
+        {'params': params_linen, 'cache': cache_linen},
+        inputs,
+        mutable=['cache'],
+      )
+      cache_linen = updates['cache']
+      outputs_linen.append(output_linen)
+
+    for output_nnx, output_linen in zip(outputs_nnx, outputs_linen):
+      assert jnp.allclose(output_nnx, output_linen, atol=1e-5)

 if __name__ == '__main__':
   absltest.main()
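
Both new tests drive the NNX model through ``nnx.set_mode`` instead of rebuilding it in the target mode. The essential behavior, sketched on a bare ``nnx.Dropout`` (this snippet is illustrative and not part of the test file):

  from flax import nnx

  layer = nnx.Dropout(0.5, deterministic=False)
  eval_layer = nnx.set_mode(layer, deterministic=True)
  # set_mode returns a copy with the static attribute flipped;
  # the original module is left untouched.
  assert eval_layer.deterministic
  assert not layer.deterministic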

flax/nnx/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -47,6 +47,9 @@
 from .helpers import TrainState as TrainState
 from .module import M as M
 from .module import Module as Module
+from .module import set_mode as set_mode
+from .module import train_mode as train_mode
+from .module import eval_mode as eval_mode
 from .module import iter_children as iter_children, iter_modules as iter_modules
 from .graph import merge as merge
 from .graph import UpdateContext as UpdateContext
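
These re-exports make the helpers available at the top level as ``nnx.set_mode``, ``nnx.train_mode``, and ``nnx.eval_mode``. A minimal usage sketch; the ``MLP`` class below is a placeholder, not part of this commit:

  from flax import nnx

  class MLP(nnx.Module):
    def __init__(self, rngs: nnx.Rngs):
      self.linear = nnx.Linear(4, 4, rngs=rngs)
      self.dropout = nnx.Dropout(0.1, rngs=rngs)

    def __call__(self, x):
      return self.dropout(self.linear(x))

  model = MLP(nnx.Rngs(0))
  train_model = nnx.train_mode(model)  # dropout.deterministic -> False
  eval_model = nnx.eval_mode(model)    # dropout.deterministic -> True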

flax/nnx/module.py

Lines changed: 132 additions & 0 deletions
@@ -427,6 +427,138 @@ def eval(self, **attributes):
       raise_if_not_found=False,
     )

+def set_mode(
+  node: A,
+  /,
+  *,
+  only: filterlib.Filter = ...,
+  raise_if_not_found: bool = True,
+  **kwargs,
+) -> A:
+  """Creates a new node with static attributes updated according to ``**kwargs``.
+
+  The new node contains references to the jax arrays in the original node. If
+  a kwarg is not consumed by any module, this function raises a ValueError;
+  to support this, modules' ``set_mode`` methods must return any unused kwargs.
+
+  Example::
+
+    >>> from flax import nnx
+    ...
+    >>> class Block(nnx.Module):
+    ...   def __init__(self, din, dout, *, rngs: nnx.Rngs):
+    ...     self.linear = nnx.Linear(din, dout, rngs=rngs)
+    ...     self.dropout = nnx.Dropout(0.5, deterministic=False)
+    ...     self.batch_norm = nnx.BatchNorm(10, use_running_average=False, rngs=rngs)
+    ...
+    >>> block = Block(2, 5, rngs=nnx.Rngs(0))
+    >>> block.dropout.deterministic, block.batch_norm.use_running_average
+    (False, False)
+    >>> new_block = nnx.set_mode(block, deterministic=True, use_running_average=True)
+    >>> new_block.dropout.deterministic, new_block.batch_norm.use_running_average
+    (True, True)
+
+  ``Filter``s can be used to set the attributes of specific Modules::
+
+    >>> block = Block(2, 5, rngs=nnx.Rngs(0))
+    >>> new_block = nnx.set_mode(block, only=nnx.Dropout, deterministic=True)
+    >>> # Only the dropout will be modified.
+    >>> new_block.dropout.deterministic, new_block.batch_norm.use_running_average
+    (True, False)
+
+  Args:
+    node: the object to create a copy of.
+    only: a Filter selecting the Modules whose attributes are set.
+    raise_if_not_found: if True (the default), raise a ValueError when a kwarg
+      is not consumed by any module.
+    **kwargs: the attributes to set.
+  """
+  predicate = filterlib.to_predicate(only)
+
+  counts = {k: 0 for k in kwargs}
+  counts["_set_mode_calls"] = 0
+
+  def _set_mode_fn(path, node):
+    if hasattr(node, 'set_mode') and predicate(path, node):
+      counts["_set_mode_calls"] += 1
+      unused = node.set_mode(**kwargs)
+      for k in unused:
+        counts[k] += 1
+    return node
+
+  out = graph.recursive_map(_set_mode_fn, node)
+
+  if raise_if_not_found:
+    set_mode_calls = counts.pop("_set_mode_calls")
+    # A kwarg is unused when every set_mode call returned it as unused.
+    unused_keys = [k for k, v in counts.items() if v == set_mode_calls]
+    if unused_keys:
+      raise ValueError(f"Unused keys found in set_mode: {unused_keys}")
+
+  return out
+
+def train_mode(node: A, /, *, only: filterlib.Filter = ..., **kwargs) -> A:
+  """Creates a new node set to training mode.
+
+  ``train_mode`` uses ``set_mode`` to recursively set the attributes
+  ``deterministic=False`` and ``use_running_average=False`` on all nested
+  Modules that have them. It's primarily used to control the runtime behavior
+  of the ``Dropout`` and ``BatchNorm`` Modules.
+
+  Example::
+
+    >>> from flax import nnx
+    ...
+    >>> class Block(nnx.Module):
+    ...   def __init__(self, din, dout, *, rngs: nnx.Rngs):
+    ...     self.linear = nnx.Linear(din, dout, rngs=rngs)
+    ...     # initialize Dropout and BatchNorm in eval mode
+    ...     self.dropout = nnx.Dropout(0.5, deterministic=True)
+    ...     self.batch_norm = nnx.BatchNorm(10, use_running_average=True, rngs=rngs)
+    ...
+    >>> block = Block(2, 5, rngs=nnx.Rngs(0))
+    >>> block.dropout.deterministic, block.batch_norm.use_running_average
+    (True, True)
+    >>> train_block = nnx.train_mode(block)
+    >>> train_block.dropout.deterministic, train_block.batch_norm.use_running_average
+    (False, False)
+
+  Args:
+    node: the object to create a copy of.
+    only: a Filter selecting the Modules whose attributes are set.
+    **kwargs: additional attributes passed to ``set_mode``.
+  """
+  return set_mode(
+    node,
+    only=only,
+    raise_if_not_found=False,
+    deterministic=False,
+    use_running_average=False,
+    **kwargs,
+  )
+
+def eval_mode(node: A, /, *, only: filterlib.Filter = ..., **kwargs) -> A:
+  """Creates a new node set to evaluation mode.
+
+  ``eval_mode`` uses ``set_mode`` to recursively set the attributes
+  ``deterministic=True`` and ``use_running_average=True`` on all nested
+  Modules that have them. It's primarily used to control the runtime behavior
+  of the ``Dropout`` and ``BatchNorm`` Modules.
+
+  Example::
+
+    >>> from flax import nnx
+    ...
+    >>> class Block(nnx.Module):
+    ...   def __init__(self, din, dout, *, rngs: nnx.Rngs):
+    ...     self.linear = nnx.Linear(din, dout, rngs=rngs)
+    ...     self.dropout = nnx.Dropout(0.5)
+    ...     self.batch_norm = nnx.BatchNorm(10, rngs=rngs)
+    ...
+    >>> block = Block(2, 5, rngs=nnx.Rngs(0))
+    >>> block.dropout.deterministic, block.batch_norm.use_running_average
+    (False, False)
+    >>> eval_block = nnx.eval_mode(block)
+    >>> eval_block.dropout.deterministic, eval_block.batch_norm.use_running_average
+    (True, True)
+
+  Args:
+    node: the object to create a copy of.
+    only: a Filter selecting the Modules whose attributes are set.
+    **kwargs: additional attributes passed to ``set_mode``.
+  """
+  return set_mode(
+    node,
+    only=only,
+    raise_if_not_found=False,
+    deterministic=True,
+    use_running_average=True,
+    **kwargs,
+  )
+

 def first_from(*args: tp.Optional[A], error_msg: str) -> A:
   """Return the first non-None argument.

flax/nnx/nn/attention.py

Lines changed: 45 additions & 0 deletions
@@ -638,6 +638,51 @@ def init_cache(self, input_shape: Shape, dtype: Dtype = jnp.float32):
     self.cached_value = nnx.Cache(jnp.zeros(cache_shape, dtype))
     self.cache_index = nnx.Cache(jnp.array(0, dtype=jnp.int32))

+  def set_mode(
+    self,
+    deterministic: bool | None = None,
+    decode: bool | None = None,
+    batch_size: int | Shape | None = None,
+    max_length: int | None = None,
+    **kwargs,
+  ) -> dict:
+    """Method used by ``nnx.set_mode``.
+
+    Args:
+      deterministic: if True, the module is set to deterministic mode.
+      decode: if True, the module is set to decode mode.
+      batch_size: the batch size to use for the cache.
+      max_length: the max length to use for the cache.
+    """
+    if deterministic is not None:
+      self.deterministic = deterministic
+
+    if decode is not None:
+      self.decode = decode
+      if (
+        not hasattr(self, 'cached_key')
+        or not hasattr(self, 'cached_value')
+        or not hasattr(self, 'cache_index')
+      ):
+        if batch_size is None:
+          raise TypeError(
+            "'batch_size' must be provided when initializing cache."
+          )
+        if max_length is None:
+          raise TypeError(
+            "'max_length' must be provided when initializing cache."
+          )
+        if isinstance(batch_size, int):
+          batch_size = (batch_size,)
+
+        # initialize cache
+        cache_shape = (*batch_size, max_length, self.num_heads, self.head_dim)
+        self.cached_key = nnx.Cache(jnp.zeros(cache_shape, self.dtype))
+        self.cached_value = nnx.Cache(jnp.zeros(cache_shape, self.dtype))
+        self.cache_index = nnx.Cache(jnp.array(0, dtype=jnp.int32))
+    return kwargs

 # mask-making utility functions

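Unlike ``init_cache``, which takes the input shape, this hook receives the cache geometry through ``batch_size`` and ``max_length``. A hedged sketch (layer sizes arbitrary, not from this commit):

  import jax.numpy as jnp
  from flax import nnx

  attn = nnx.MultiHeadAttention(
    num_heads=2, in_features=8, decode=False, rngs=nnx.Rngs(0)
  )
  # No cache exists yet, so flipping `decode` requires the cache geometry.
  decoder = nnx.set_mode(attn, decode=True, batch_size=1, max_length=4)
  # Cache shape is (*batch, max_length, num_heads, head_dim).
  assert decoder.cached_key.value.shape == (1, 4, 2, 4)

  y = decoder(jnp.ones((1, 1, 8)))  # one autoregressive step; cache_index advances
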
flax/nnx/nn/normalization.py

Lines changed: 15 additions & 0 deletions
@@ -392,6 +392,21 @@ def __call__(
       self.epsilon,
     )

+  def set_mode(
+    self,
+    use_running_average: bool | None = None,
+    **kwargs,
+  ) -> dict:
+    """Method used by ``nnx.set_mode``.
+
+    Args:
+      use_running_average: if True, the stored batch statistics will be
+        used instead of computing the batch statistics on the input.
+    """
+    if use_running_average is not None:
+      self.use_running_average = use_running_average
+    return kwargs

 class LayerNorm(Module):
   """Layer normalization (https://arxiv.org/abs/1607.06450).

flax/nnx/nn/stochastic.py

Lines changed: 14 additions & 0 deletions
@@ -153,3 +153,17 @@ def __call__(
     mask = random.bernoulli(key, p=keep_prob, shape=broadcast_shape)
     mask = jnp.broadcast_to(mask, inputs.shape)
     return lax.select(mask, inputs / keep_prob, jnp.zeros_like(inputs))
+
+  def set_mode(
+    self,
+    deterministic: bool | None = None,
+    **kwargs,
+  ) -> dict:
+    """Method used by ``nnx.set_mode``.
+
+    Args:
+      deterministic: if True, disables dropout masking.
+    """
+    if deterministic is not None:
+      self.deterministic = deterministic
+    return kwargs
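
The contract shared by the three implementations in this commit: consume the keys you recognize, mutate the static attribute, and return the remaining kwargs so ``nnx.set_mode`` can detect keys nobody consumed. A hypothetical custom module following the same protocol:

  from flax import nnx

  class Sampler(nnx.Module):
    def __init__(self):
      self.greedy = False

    def set_mode(self, greedy: bool | None = None, **kwargs) -> dict:
      if greedy is not None:
        self.greedy = greedy
      return kwargs  # unused keys flow back to nnx.set_mode's bookkeeping

  sampler = nnx.set_mode(Sampler(), greedy=True)
  assert sampler.greedy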
