google
diff --git a/‎docs_nnx/api_reference/flax.nnx/nn/index.rst‎
Lines changed: 1 addition & 0 deletions b/‎docs_nnx/api_reference/flax.nnx/nn/index.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs_nnx/api_reference/flax.nnx/nn/pooling.rst‎
Lines changed: 10 additions & 0 deletions b/‎docs_nnx/api_reference/flax.nnx/nn/pooling.rst‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎docs_nnx/guides/randomness.ipynb‎
Lines changed: 309 additions & 201 deletions b/‎docs_nnx/guides/randomness.ipynb‎
Lines changed: 309 additions & 201 deletions
diff --git a/‎docs_nnx/guides/randomness.md‎
Lines changed: 135 additions & 90 deletions b/‎docs_nnx/guides/randomness.md‎
Lines changed: 135 additions & 90 deletions
diff --git a/‎docs_nnx/index.rst‎
Lines changed: 5 additions & 5 deletions b/‎docs_nnx/index.rst‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎docs_nnx/mnist_tutorial.ipynb‎
Lines changed: 46 additions & 169 deletions b/‎docs_nnx/mnist_tutorial.ipynb‎
Lines changed: 46 additions & 169 deletions
diff --git a/‎docs_nnx/mnist_tutorial.md‎
Lines changed: 28 additions & 25 deletions b/‎docs_nnx/mnist_tutorial.md‎
Lines changed: 28 additions & 25 deletions
diff --git a/‎docs_nnx/nnx_basics.ipynb‎
Lines changed: 67 additions & 86 deletions b/‎docs_nnx/nnx_basics.ipynb‎
Lines changed: 67 additions & 86 deletions
diff --git a/‎docs_nnx/nnx_basics.md‎
Lines changed: 25 additions & 25 deletions b/‎docs_nnx/nnx_basics.md‎
Lines changed: 25 additions & 25 deletions
diff --git a/‎flax/core/nn/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎flax/core/nn/__init__.py‎
Lines changed: 1 addition & 1 deletion
@@ -14,6 +14,7 @@ See the `NNX page <https://flax.readthedocs.io/en/latest/nnx/index.html>`__ for
   linear
   lora
   normalization
+  pooling
   recurrent
   stochastic
 
@@ -0,0 +1,10 @@
+Pooling
+------------------------
+
+.. automodule:: flax.nnx
+.. currentmodule:: flax.nnx
+
+.. autofunction:: avg_pool
+.. autofunction:: max_pool
+.. autofunction:: min_pool
+.. autofunction:: pool
@@ -97,20 +97,20 @@ Basic usage
      def __init__(self, din, dmid, dout, rngs: nnx.Rngs):
        self.linear = nnx.Linear(din, dmid, rngs=rngs)
        self.bn = nnx.BatchNorm(dmid, rngs=rngs)
-       self.dropout = nnx.Dropout(0.2, rngs=rngs)
+       self.dropout = nnx.Dropout(0.2)
        self.linear_out = nnx.Linear(dmid, dout, rngs=rngs)
 
-     def __call__(self, x):
-       x = nnx.relu(self.dropout(self.bn(self.linear(x))))
+     def __call__(self, x, rngs):
+       x = nnx.relu(self.dropout(self.bn(self.linear(x)), rngs=rngs))
        return self.linear_out(x)
 
    model = Model(2, 64, 3, rngs=nnx.Rngs(0))  # eager initialization
    optimizer = nnx.Optimizer(model, optax.adam(1e-3), wrt=nnx.Param)
 
    @nnx.jit  # automatic state management for JAX transforms
-   def train_step(model, optimizer, x, y):
+   def train_step(model, optimizer, x, y, rngs):
      def loss_fn(model):
-       y_pred = model(x)  # call methods directly
+       y_pred = model(x, rngs)  # call methods directly
        return ((y_pred - y) ** 2).mean()
 
      loss, grads = nnx.value_and_grad(loss_fn)(model)
 
@@ -26,15 +26,15 @@ Let’s get started!
 
 If `flax` is not installed in your Python environment, use `pip` to install the package from PyPI (below, just uncomment the code in the cell if you are working from Google Colab/Jupyter Notebook):
 
-```{code-cell}
+```{code-cell} ipython3
 # !pip install flax
 ```
 
 ## 2. Load the MNIST dataset
 
 First, you need to load the MNIST dataset and then prepare the training and testing sets via TensorFlow Datasets (TFDS). You normalize image values, shuffle the data and divide it into batches, and prefetch samples to enhance performance.
 
-```{code-cell}
+```{code-cell} ipython3
 import tensorflow_datasets as tfds  # TFDS to download MNIST.
 import tensorflow as tf  # TensorFlow / `tf.data` operations.
 
@@ -72,29 +72,30 @@ test_ds = test_ds.batch(batch_size, drop_remainder=True).prefetch(1)
 
 Create a CNN for classification with Flax NNX by subclassing `nnx.Module`:
 
-```{code-cell}
+```{code-cell} ipython3
 from flax import nnx  # The Flax NNX API.
 from functools import partial
+from typing import Optional
 
 class CNN(nnx.Module):
   """A simple CNN model."""
 
   def __init__(self, *, rngs: nnx.Rngs):
     self.conv1 = nnx.Conv(1, 32, kernel_size=(3, 3), rngs=rngs)
     self.batch_norm1 = nnx.BatchNorm(32, rngs=rngs)
-    self.dropout1 = nnx.Dropout(rate=0.025, rngs=rngs)
+    self.dropout1 = nnx.Dropout(rate=0.025)
     self.conv2 = nnx.Conv(32, 64, kernel_size=(3, 3), rngs=rngs)
     self.batch_norm2 = nnx.BatchNorm(64, rngs=rngs)
     self.avg_pool = partial(nnx.avg_pool, window_shape=(2, 2), strides=(2, 2))
     self.linear1 = nnx.Linear(3136, 256, rngs=rngs)
-    self.dropout2 = nnx.Dropout(rate=0.025, rngs=rngs)
+    self.dropout2 = nnx.Dropout(rate=0.025)
     self.linear2 = nnx.Linear(256, 10, rngs=rngs)
 
-  def __call__(self, x):
-    x = self.avg_pool(nnx.relu(self.batch_norm1(self.dropout1(self.conv1(x)))))
+  def __call__(self, x, rngs: Optional[nnx.Rngs] = None):
+    x = self.avg_pool(nnx.relu(self.batch_norm1(self.dropout1(self.conv1(x), rngs=rngs))))
     x = self.avg_pool(nnx.relu(self.batch_norm2(self.conv2(x))))
     x = x.reshape(x.shape[0], -1)  # flatten
-    x = nnx.relu(self.dropout2(self.linear1(x)))
+    x = nnx.relu(self.dropout2(self.linear1(x), rngs=rngs))
     x = self.linear2(x)
     return x
 
@@ -108,18 +109,18 @@ nnx.display(model)
 
 Let's put the CNN model to the test!  Here, you’ll perform a forward pass with arbitrary data and print the results.
 
-```{code-cell}
+```{code-cell} ipython3
 import jax.numpy as jnp  # JAX NumPy
 
-y = model(jnp.ones((1, 28, 28, 1)))
+y = model(jnp.ones((1, 28, 28, 1)), nnx.Rngs(0))
 y
 ```
 
 ## 4. Create the optimizer and define some metrics
 
 In Flax NNX, you need to create an `nnx.Optimizer` object to manage the model's parameters and apply gradients during training. `nnx.Optimizer` receives the model's reference, so that it can update its parameters, and an [Optax](https://optax.readthedocs.io/) optimizer to define the update rules. Additionally, you will define an `nnx.MultiMetric` object to keep track of the `Accuracy` and the `Average` loss.
 
-```{code-cell}
+```{code-cell} ipython3
 import optax
 
 learning_rate = 0.005
@@ -144,31 +145,31 @@ In addition to the `loss`, during training and testing you will also get the `lo
 
 During training - the `train_step` - you will use `nnx.value_and_grad` to compute the gradients and update the model's parameters using the `optimizer` you have already defined. And during both training and testing (the `eval_step`), the `loss` and `logits` will be used to calculate the metrics.
 
-```{code-cell}
-def loss_fn(model: CNN, batch):
-  logits = model(batch['image'])
+```{code-cell} ipython3
+def loss_fn(model: CNN, rngs: nnx.Rngs, batch):
+  logits = model(batch['image'], rngs)
   loss = optax.softmax_cross_entropy_with_integer_labels(
     logits=logits, labels=batch['label']
   ).mean()
   return loss, logits
 
 @nnx.jit
-def train_step(model: CNN, optimizer: nnx.Optimizer, metrics: nnx.MultiMetric, batch):
+def train_step(model: CNN, optimizer: nnx.Optimizer, metrics: nnx.MultiMetric, rngs: nnx.Rngs, batch):
   """Train for a single step."""
   grad_fn = nnx.value_and_grad(loss_fn, has_aux=True)
-  (loss, logits), grads = grad_fn(model, batch)
+  (loss, logits), grads = grad_fn(model, rngs, batch)
   metrics.update(loss=loss, logits=logits, labels=batch['label'])  # In-place updates.
-  optimizer.update(grads)  # In-place updates.
+  optimizer.update(model, grads)  # In-place updates.
 
 @nnx.jit
-def eval_step(model: CNN, metrics: nnx.MultiMetric, batch):
-  loss, logits = loss_fn(model, batch)
+def eval_step(model: CNN, metrics: nnx.MultiMetric, rngs: nnx.Rngs, batch):
+  loss, logits = loss_fn(model, rngs, batch)
   metrics.update(loss=loss, logits=logits, labels=batch['label'])  # In-place updates.
 ```
 
 In the code above, the [`nnx.jit`](https://flax.readthedocs.io/en/latest/api_reference/flax.nnx/transforms.html#flax.nnx.jit) transformation decorator traces the `train_step` function for just-in-time compilation with [XLA](https://www.tensorflow.org/xla), optimizing performance on hardware accelerators, such as Google TPUs and GPUs. `nnx.jit` is a "lifted" version of the `jax.jit` transform that allows its function input and outputs to be Flax NNX objects. Similarly, `nnx.value_and_grad` is a lifted version of `jax.value_and_grad`. Check out [the lifted transforms guide](https://flax.readthedocs.io/en/latest/guides/transforms.html) to learn more.
 
-> **Note:** The code shows how to perform several in-place updates to the model, the optimizer, and the metrics, but _state updates_ were not explicitly returned. This is because Flax NNX transformations respect _reference semantics_ for Flax NNX objects, and will propagate the state updates of the objects passed as input arguments. This is a key feature of Flax NNX that allows for a more concise and readable code. You can learn more in [Why Flax NNX](https://flax.readthedocs.io/en/latest/why.html).
+> **Note:** The code shows how to perform several in-place updates to the model, the optimizer, the RNG streams, and the metrics, but _state updates_ were not explicitly returned. This is because Flax NNX transformations respect _reference semantics_ for Flax NNX objects, and will propagate the state updates of the objects passed as input arguments. This is a key feature of Flax NNX that allows for more concise and readable code. You can learn more in [Why Flax NNX](https://flax.readthedocs.io/en/latest/why.html).
 
 
 ## 6. Train and evaluate the model
@@ -177,7 +178,7 @@ Now, you can train the CNN model using batches of data for 10 epochs, evaluate t
 on the test set after each epoch, and log the training and testing metrics (the loss and
 the accuracy) during the process. Typically this leads to the model achieving around 99% accuracy.
 
-```{code-cell}
+```{code-cell} ipython3
 from IPython.display import clear_output
 import matplotlib.pyplot as plt
 
@@ -188,13 +189,15 @@ metrics_history = {
   'test_accuracy': [],
 }
 
+rngs = nnx.Rngs(0)
+
 for step, batch in enumerate(train_ds.as_numpy_iterator()):
   # Run the optimization for one step and make a stateful update to the following:
   # - The train state's model parameters
   # - The optimizer state
   # - The training loss and accuracy batch metrics
   model.train() # Switch to train mode
-  train_step(model, optimizer, metrics, batch)
+  train_step(model, optimizer, metrics, rngs, batch)
 
   if step > 0 and (step % eval_every == 0 or step == train_steps - 1):  # One training epoch has passed.
     # Log the training metrics.
@@ -205,7 +208,7 @@ for step, batch in enumerate(train_ds.as_numpy_iterator()):
     # Compute the metrics on the test set after each training epoch.
     model.eval() # Switch to eval mode
     for test_batch in test_ds.as_numpy_iterator():
-      eval_step(model, metrics, test_batch)
+      eval_step(model, metrics, rngs, test_batch)
 
     # Log the test metrics.
     for metric, value in metrics.compute().items():
@@ -229,7 +232,7 @@ for step, batch in enumerate(train_ds.as_numpy_iterator()):
 
 Create a `jit`-compiled model inference function (with `nnx.jit`) - `pred_step` - to generate predictions on the test set using the learned model parameters. This will enable you to visualize test images alongside their predicted labels for a qualitative assessment of model performance.
 
-```{code-cell}
+```{code-cell} ipython3
 model.eval() # Switch to evaluation mode.
 
 @nnx.jit
@@ -240,7 +243,7 @@ def pred_step(model: CNN, batch):
 
 We call `model.eval()` before inference so that `Dropout` is disabled and `BatchNorm` uses its stored running statistics, which makes the output deterministic. Note that `.eval()` only switches these layers to their inference behavior; it does not by itself disable gradient computation.
 
-```{code-cell}
+```{code-cell} ipython3
 test_batch = test_ds.as_numpy_iterator().next()
 pred = pred_step(model, test_batch)
 
 
@@ -90,31 +90,27 @@ to handle them, as demonstrated in later sections of this guide.
 
 Flax `Module`s can be used to compose other Modules in a nested structure. These can be assigned directly as attributes, or inside an attribute of any (nested) pytree type, such as a `list`, `dict`, `tuple`, and so on.
 
-The example below shows how to define a simple `MLP` by subclassing `Module`. The model consists of two `Linear` layers, a `Dropout` layer, and a `BatchNorm` layer:
+The example below shows how to define a simple `MLP` by subclassing `Module`. The model consists of two `Linear` layers, a `Dropout` layer, and a `BatchNorm` layer. Note that we need to pass the `__call__` method the RNG state that we want the `Dropout` layer to use.
 
 ```{code-cell} ipython3
 class MLP(nnx.Module):
   def __init__(self, din: int, dmid: int, dout: int, *, rngs: nnx.Rngs):
     self.linear1 = Linear(din, dmid, rngs=rngs)
-    self.dropout = nnx.Dropout(rate=0.1, rngs=rngs)
+    self.dropout = nnx.Dropout(rate=0.1)
     self.bn = nnx.BatchNorm(dmid, rngs=rngs)
     self.linear2 = Linear(dmid, dout, rngs=rngs)
 
-  def __call__(self, x: jax.Array):
-    x = nnx.gelu(self.dropout(self.bn(self.linear1(x))))
+  def __call__(self, x: jax.Array, rngs: nnx.Rngs):
+    x = nnx.gelu(self.dropout(self.bn(self.linear1(x)), rngs=rngs))
     return self.linear2(x)
 
 model = MLP(2, 16, 5, rngs=nnx.Rngs(0))
 
-y = model(x=jnp.ones((3, 2)))
+y = model(x=jnp.ones((3, 2)), rngs=nnx.Rngs(1))
 
 nnx.display(model)
 ```
 
-In Flax, `Dropout` is a stateful module that stores an `Rngs` object, so that it can generate new masks during the forward pass without the need for the user to pass a new key each time.
-
-+++
-
 ### Model surgery
 
 Flax `Module`s are mutable by default. This means that their structure can be changed at any time, which makes [model surgery](https://flax.readthedocs.io/en/latest/guides/surgery.html) quite easy, as any sub-Module attribute can be replaced with anything else, such as new Modules, existing shared Modules, Modules of different types, and so on. Moreover, `Variable`s can also be modified or replaced/shared.
@@ -140,7 +136,7 @@ model = MLP(2, 32, 5, rngs=rngs)
 model.linear1 = LoraLinear(model.linear1, 4, rngs=rngs)
 model.linear2 = LoraLinear(model.linear2, 4, rngs=rngs)
 
-y = model(x=jnp.ones((3, 2)))
+y = model(x=jnp.ones((3, 2)), rngs=rngs)
 
 nnx.display(model)
 ```
@@ -161,18 +157,18 @@ model = MLP(2, 16, 10, rngs=nnx.Rngs(0))
 optimizer = nnx.Optimizer(model, optax.adam(1e-3), wrt=nnx.Param)
 
 @nnx.jit  # Automatic state management
-def train_step(model, optimizer, x, y):
-  def loss_fn(model: MLP):
-    y_pred = model(x)
+def train_step(model, optimizer, x, y, rngs):
+  def loss_fn(model: MLP, rngs: nnx.Rngs):
+    y_pred = model(x, rngs)
     return jnp.mean((y_pred - y) ** 2)
 
-  loss, grads = nnx.value_and_grad(loss_fn)(model)
+  loss, grads = nnx.value_and_grad(loss_fn)(model, rngs)
   optimizer.update(model, grads)  # In place updates.
 
   return loss
 
 x, y = jnp.ones((5, 2)), jnp.ones((5, 10))
-loss = train_step(model, optimizer, x, y)
+loss = train_step(model, optimizer, x, y, rngs)
 
 print(f'{loss = }')
 print(f'{optimizer.step.value = }')
@@ -194,23 +190,27 @@ In the code below notice the following:
 1. The custom `create_model` function takes in an `Rngs` object and returns an `MLP` object. Since the `Rngs` object is forked into five streams and `nnx.vmap` is applied over `create_model`, a stack of 5 `MLP` objects is created.
 2. The `nnx.scan` is used to iteratively apply each `MLP` in the stack to the input `x`.
 3. `nnx.scan` (consciously) deviates from `jax.lax.scan` and instead mimics `nnx.vmap`, which is more expressive. `nnx.scan` allows specifying multiple inputs, the scan axes of each input/output, and the position of the carry.
-4. `State` updates for the `BatchNorm` and `Dropout` layers are automatically propagated by nnx.scan.
+4. `State` updates for `BatchNorm` layers are automatically propagated by `nnx.scan`.
+5. The `rngs` object is split into separate streams for each layer using the `fork` method.
 
 ```{code-cell} ipython3
 @nnx.vmap(in_axes=0, out_axes=0)
-def create_model(key: jax.Array):
-  return MLP(10, 32, 10, rngs=nnx.Rngs(key))
+def create_model(rngs):
+  return MLP(10, 32, 10, rngs=rngs)
 
-keys = jax.random.split(jax.random.key(0), 5)
-model = create_model(keys)
-
-@nnx.scan(in_axes=(0, nnx.Carry), out_axes=nnx.Carry)
-def forward(model: MLP, x):
-  x = model(x)
+@nnx.scan(in_axes=(0, 0, nnx.Carry), out_axes=nnx.Carry)
+def forward(model: MLP, rngs: nnx.Rngs, x):
+  x = model(x, rngs)
   return x
+    
+param_rngs = nnx.Rngs(0).fork(split=5)
+model = create_model(param_rngs)
+```
 
+```{code-cell} ipython3
 x = jnp.ones((3, 10))
-y = forward(model, x)
+dropout_rngs = nnx.Rngs(1).fork(split=5)
+y = forward(model, dropout_rngs, x)
 
 print(f'{y.shape = }')
 nnx.display(model)
 
@@ -35,7 +35,7 @@
     swish as swish,
     tanh as tanh,
 )
-from flax.linen.pooling import (avg_pool as avg_pool, max_pool as max_pool)
+from flax.pooling import (avg_pool as avg_pool, max_pool as max_pool)
 from .attention import (
     dot_product_attention as dot_product_attention,
     multi_head_dot_product_attention as multi_head_dot_product_attention,
Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@`
`35`	`35`	`swish as swish,`
`36`	`36`	`tanh as tanh,`
`37`	`37`	`)`
`38`		`-from flax.linen.pooling import (avg_pool as avg_pool, max_pool as max_pool)`
	`38`	`+from flax.pooling import (avg_pool as avg_pool, max_pool as max_pool)`
`39`	`39`	`from .attention import (`
`40`	`40`	`dot_product_attention as dot_product_attention,`
`41`	`41`	`multi_head_dot_product_attention as multi_head_dot_product_attention,`