14 | 14 | |
15 | 15 | import copy |
16 | 16 | |
17 | | -from tensorflow.experimental import dtensor |
18 | | -from tensorflow.experimental.dtensor import Layout |
19 | | -from tensorflow.keras.dtensor.experimental import LayoutMap |
20 | | - |
21 | 17 | from keras_nlp.api_export import keras_nlp_export |
22 | 18 | from keras_nlp.backend import keras |
23 | 19 | from keras_nlp.layers.modeling.position_embedding import PositionEmbedding |
@@ -191,71 +187,3 @@ def get_config(self): |
191 | 187 | @classproperty |
192 | 188 | def presets(cls): |
193 | 189 | return copy.deepcopy(backbone_presets) |
194 | | - |
195 | | - @classmethod |
196 | | - def create_layout_map(cls, mesh): |
197 | | - """Create a DTensor layout map for a GPT2Backbone. |
198 | | - |
199 | | - Given a DTensor mesh describing a list of devices, this method returns a |
200 | | - DTensor layout map for creating a `keras_nlp.models.GPT2Backbone` |
201 | | - instance. This mapping describes how to distribute all model weights |
202 | | - across multiple devices. For an overview of DTensor concepts, see |
203 | | - [this guide](https://www.tensorflow.org/guide/dtensor_overview). |
204 | | - |
205 | | - Args: |
206 | | - mesh: A 2D `tf.experimental.dtensor.Mesh` describing the arrangement |
207 | | - of devices for running distributed computation. The |
208 | | - first dimension in the mesh is expected to be for data parallel |
209 | | - distribution, and the second for model parallel distribution. |
210 | | - |
211 | | - Returns: |
212 | | - A `tf.keras.dtensor.experimental.LayoutMap` which contains the |
213 | | - proper layout to weights mapping for the model parallel setting. |
214 | | - |
215 | | - Examples: |
216 | | - ```python |
217 | | - keras.backend.experimental.enable_tf_random_generator() |
218 | | - keras.utils.set_random_seed(1337) |
219 | | - |
220 | | - # Update both dimensions below for a multi-device setting. |
221 | | - mesh = dtensor.create_mesh([("batch", 1), ("model", 1)]) |
222 | | - layout_map = keras_nlp.models.GPT2Backbone.create_layout_map(mesh) |
223 | | - |
224 | | - with layout_map.scope(): |
225 | | - model = keras_nlp.models.GPT2Backbone.from_preset("gpt2_base_en") |
226 | | - ``` |
227 | | - """ |
228 | | - # We assert the mesh is 2D, and assume the first mesh dim is for data |
229 | | - # parallel and the second dim is for model parallel. |
230 | | - mesh_shape = mesh.shape() |
231 | | - if len(mesh_shape) != 2: |
232 | | - raise ValueError( |
233 | | - f"Expect to create layout based on 2D mesh, received {mesh}" |
234 | | - ) |
235 | | - _, model_dim = mesh.dim_names |
236 | | - unshard_dim = dtensor.UNSHARDED |
237 | | - |
238 | | - layout_map = LayoutMap(mesh=mesh) |
239 | | - # Embedding sharding |
240 | | - layout_map[r".*embeddings"] = Layout([unshard_dim, model_dim], mesh) |
241 | | - |
242 | | - # Transformer block sharding |
243 | | - layout_map[r".*_(query|key|value)_dense.kernel"] = Layout( |
244 | | - [unshard_dim, unshard_dim, model_dim], mesh |
245 | | - ) |
246 | | - layout_map[r".*_(query|key|value)_dense.bias"] = Layout( |
247 | | - [model_dim, unshard_dim], mesh |
248 | | - ) |
249 | | - layout_map[r".*_feedforward_intermediate_dense.kernel"] = Layout( |
250 | | - [unshard_dim, model_dim], mesh |
251 | | - ) |
252 | | - layout_map[r".*_feedforward_intermediate_dense.bias"] = Layout( |
253 | | - [model_dim], mesh |
254 | | - ) |
255 | | - layout_map[r".*_feedforward_output_dense.kernel"] = Layout( |
256 | | - [model_dim, unshard_dim], mesh |
257 | | - ) |
258 | | - layout_map[r".*_feedforward_output_dense.bias"] = Layout( |
259 | | - [unshard_dim], mesh |
260 | | - ) |
261 | | - return layout_map |
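With the helper gone, a downstream user who still wants DTensor-based model parallelism can rebuild the same mapping in their own code. The sketch below re-creates the deleted sharding rules under a hypothetical `gpt2_layout_map` helper; it assumes TensorFlow's `tf.experimental.dtensor` and `tf.keras.dtensor.experimental.LayoutMap` APIs (the imports removed above) are still available in the installed TF/Keras version.

```python
# Minimal sketch, not provided by keras_nlp after this change: rebuild the
# deleted GPT2Backbone layout map in user code. Assumes TF's DTensor APIs
# are still importable.
from tensorflow.experimental import dtensor
from tensorflow.experimental.dtensor import Layout
from tensorflow.keras.dtensor.experimental import LayoutMap

import keras_nlp


def gpt2_layout_map(mesh):
    # Expect a 2D mesh: first dim for data parallel, second for model parallel.
    if len(mesh.shape()) != 2:
        raise ValueError(f"Expected a 2D mesh, received {mesh}")
    _, model_dim = mesh.dim_names
    unsharded = dtensor.UNSHARDED

    layout_map = LayoutMap(mesh=mesh)
    # Shard the embeddings over the model dimension.
    layout_map[r".*embeddings"] = Layout([unsharded, model_dim], mesh)
    # Shard the attention projections over the model dimension.
    layout_map[r".*_(query|key|value)_dense.kernel"] = Layout(
        [unsharded, unsharded, model_dim], mesh
    )
    layout_map[r".*_(query|key|value)_dense.bias"] = Layout(
        [model_dim, unsharded], mesh
    )
    # Shard the feedforward layers over the model dimension.
    layout_map[r".*_feedforward_intermediate_dense.kernel"] = Layout(
        [unsharded, model_dim], mesh
    )
    layout_map[r".*_feedforward_intermediate_dense.bias"] = Layout(
        [model_dim], mesh
    )
    layout_map[r".*_feedforward_output_dense.kernel"] = Layout(
        [model_dim, unsharded], mesh
    )
    layout_map[r".*_feedforward_output_dense.bias"] = Layout(
        [unsharded], mesh
    )
    return layout_map


# Usage mirrors the removed docstring example; bump both mesh dimensions
# for a real multi-device setting.
mesh = dtensor.create_mesh([("batch", 1), ("model", 1)])
with gpt2_layout_map(mesh).scope():
    model = keras_nlp.models.GPT2Backbone.from_preset("gpt2_base_en")
```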