Commit 0b3165c

Add script and readme for hive lineage (#1293)
1 parent 75594a7 commit 0b3165c

File tree

6 files changed, +172 -3 lines changed

BUILD

Lines changed: 22 additions & 3 deletions
```diff
@@ -6,8 +6,8 @@ test_suite(
         ":test_cloud_sql_proxy",
         ":test_dr_elephant",
         ":test_hive_hcatalog",
-        ":test_starburst_presto",
         ":test_spark_rapids",
+        ":test_starburst_presto",
         "//alluxio:test_alluxio",
         "//atlas:test_atlas",
         "//bigtable:test_bigtable",
@@ -60,7 +60,10 @@ py_test(
     name = "test_cloud_sql_proxy",
     size = "enormous",
     srcs = ["cloud-sql-proxy/test_cloud_sql_proxy.py"],
-    data = ["cloud-sql-proxy/cloud-sql-proxy.sh", "cloud-sql-proxy/hivetest.hive"],
+    data = [
+        "cloud-sql-proxy/cloud-sql-proxy.sh",
+        "cloud-sql-proxy/hivetest.hive",
+    ],
     local = True,
     shard_count = 4,
     deps = [
@@ -114,10 +117,10 @@ py_test(
     size = "enormous",
     srcs = ["spark-rapids/test_spark_rapids.py"],
     data = [
+        "spark-rapids/mig.sh",
         "spark-rapids/spark-rapids.sh",
         "spark-rapids/verify_xgboost_spark_rapids.scala",
         "spark-rapids/verify_xgboost_spark_rapids_sql.scala",
-        "spark-rapids/mig.sh",
     ],
     local = True,
     shard_count = 3,
@@ -132,3 +135,19 @@ py_library(
     testonly = True,
     srcs = ["cloud-sql-proxy/pyspark_metastore_test.py"],
 )
+
+py_test(
+    name = "test_hive_lineage",
+    size = "enormous",
+    srcs = ["hive-lineage/test_hive_lineage.py"],
+    data = [
+        "hive-lineage/hive-lineage.sh",
+        "hive-lineage/hivetest.hive",
+    ],
+    local = True,
+    shard_count = 3,
+    deps = [
+        "//integration_tests:dataproc_test_case",
+        "@io_abseil_py//absl/testing:parameterized",
+    ],
+)
```
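
With the `test_hive_lineage` target registered, the new test can presumably be run like the other integration tests in this repository, e.g. (a sketch, assuming a configured Bazel workspace and GCP credentials):

```shell
bazel test :test_hive_lineage --test_output=streamed
```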

hive-lineage/README.md

Lines changed: 32 additions & 0 deletions
# Hive Lineage Initialization Action

## Using the initialization action

**:warning: NOTICE:** See
[best practices](/README.md#how-initialization-actions-are-used) for using
initialization actions in production.

You can use this initialization action to create a new Dataproc cluster with lineage enabled for Hive jobs.
Note that this feature is currently in preview.

```shell
REGION=<region>
CLUSTER_NAME=<cluster_name>
gcloud dataproc clusters create ${CLUSTER_NAME} \
    --region ${REGION} \
    --scopes cloud-platform \
    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/hive-lineage/hive-lineage.sh
```

If you want to run Hive jobs involving BigQuery tables, the Hive-BigQuery connector needs to be installed as well;
see the [connectors](../connectors/README.md) init action.

```shell
REGION=<region>
CLUSTER_NAME=<cluster_name>
gcloud dataproc clusters create ${CLUSTER_NAME} \
    --region ${REGION} \
    --scopes cloud-platform \
    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/connectors/connectors.sh,gs://goog-dataproc-initialization-actions-${REGION}/hive-lineage/hive-lineage.sh \
    --metadata hive-bigquery-connector-version=2.0.3
```
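
Once the cluster is up, Hive jobs are submitted as usual, and the installed hook reports lineage for supported statements (e.g. CTAS queries like the test script in this commit). A minimal sketch of a job submission, reusing the variables above; the table name is hypothetical:

```shell
gcloud dataproc jobs submit hive \
    --cluster ${CLUSTER_NAME} \
    --region ${REGION} \
    --execute "CREATE TABLE lineage_smoke_test AS SELECT 1 AS id;"  # hypothetical table, used only to trigger a lineage event
```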

hive-lineage/__init__.py

Lines changed: 1 addition & 0 deletions
```python
# required for integration tests
```

hive-lineage/hive-lineage.sh

Lines changed: 64 additions & 0 deletions
```bash
#!/bin/bash
#
# Copyright 2025 Google LLC and contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This initialization script installs the required
# jars and sets the hive conf to enable lineage.

set -euxo pipefail

function prepare_env() {
  export HIVE_HOME="/usr/lib/hive"
  export HIVE_CONF_DIR="/etc/hive/conf"
  export HIVE_CONF_FILE="$HIVE_CONF_DIR/hive-site.xml"
  export HIVE_LIB_DIR="$HIVE_HOME/lib"
  export INSTALLATION_SOURCE="gs://hadoop-lib/hive-lineage"
  export HIVE_OL_HOOK_VERSION="1.0.0-preview"
  export HIVE_OL_HOOK="io.openlineage.hive.hooks.HiveOpenLineageHook"
}

function set_hive_lineage_conf() {
  declare -A properties=(
    ["hive.exec.post.hooks"]="$HIVE_OL_HOOK"
    ["hive.exec.failure.hooks"]="$HIVE_OL_HOOK"
    ["hive.openlineage.transport.type"]="gcplineage"
    ["hive.conf.validation"]="false" # to allow custom properties, like hive.openlineage.namespace
  )
  echo "Setting hive conf to enable lineage"
  for key in "${!properties[@]}"; do
    bdconfig set_property \
      --configuration_file="$HIVE_CONF_FILE" \
      --name "$key" \
      --value "${properties[$key]}"
  done
}

function install_jars() {
  echo "Installing openlineage-hive hook"
  gsutil cp -P "$INSTALLATION_SOURCE/hive-openlineage-hook-$HIVE_OL_HOOK_VERSION.jar" "$HIVE_LIB_DIR/hive-openlineage-hook.jar"
}

function restart_hive_server2_master() {
  ROLE=$(curl -f -s -H Metadata-Flavor:Google http://metadata/computeMetadata/v1/instance/attributes/dataproc-role)
  if [[ "${ROLE}" == 'Master' ]]; then
    echo "Restarting hive-server2"
    sudo systemctl restart hive-server2.service
  fi
}

prepare_env
install_jars
set_hive_lineage_conf
restart_hive_server2_master
```
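
A quick way to sanity-check the script on a master node might be to confirm the jar landed and the hook property was written (illustrative commands, not part of this commit):

```shell
# verify the hook jar was copied into Hive's lib directory
ls -l /usr/lib/hive/lib/hive-openlineage-hook.jar
# verify the post-execution hook is registered in hive-site.xml
grep -A1 "hive.exec.post.hooks" /etc/hive/conf/hive-site.xml
```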

hive-lineage/hivetest.hive

Lines changed: 23 additions & 0 deletions
```sql
DROP TABLE IF EXISTS validate_hive_tbl;
DROP TABLE IF EXISTS grouped_tbl;

CREATE EXTERNAL TABLE validate_hive_tbl (
  shell_user STRING,
  dummy STRING,
  uid INT,
  gid INT,
  name STRING,
  home STRING,
  shell STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ':';

CREATE TABLE grouped_tbl
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
AS SELECT shell, COUNT(*) shell_count
FROM validate_hive_tbl
GROUP BY shell
ORDER BY shell_count DESC, shell DESC;

SELECT * FROM grouped_tbl;
```
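
The integration test below submits this file as a Hive job. Done by hand, the submission would look roughly like this (a sketch, assuming the script is staged at the same GCS prefix as the init action):

```shell
gcloud dataproc jobs submit hive \
    --cluster ${CLUSTER_NAME} \
    --region ${REGION} \
    --file gs://goog-dataproc-initialization-actions-${REGION}/hive-lineage/hivetest.hive
```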

hive-lineage/test_hive_lineage.py

Lines changed: 30 additions & 0 deletions
```python
from absl.testing import absltest
from absl.testing import parameterized

from integration_tests.dataproc_test_case import DataprocTestCase


class HiveLineageTestCase(DataprocTestCase):
    COMPONENT = "hive-lineage"
    INIT_ACTIONS = ["hive-lineage/hive-lineage.sh"]
    TEST_SCRIPT_FILE = "hive-lineage/hivetest.hive"

    def __submit_hive_job(self, cluster_name):
        self.assert_dataproc_job(
            cluster_name, 'hive', '--file={}/{}'.format(self.INIT_ACTIONS_REPO,
                                                        self.TEST_SCRIPT_FILE))

    def verify_cluster(self, name):
        self.__submit_hive_job(name)

    @parameterized.parameters(
        'STANDARD',
        'HA',
    )
    def test_hive_job_success(self, configuration):
        self.createCluster(configuration,
                           self.INIT_ACTIONS,
                           scopes='cloud-platform')
        self.verify_cluster(self.getClusterName())


if __name__ == "__main__":
    absltest.main()
```
