Commit 0b3165c

Add script and readme for hive lineage (#1293)
1 parent 75594a7 commit 0b3165c

File tree

6 files changed, +172 -3 lines changed

BUILD

Lines changed: 22 additions & 3 deletions
```diff
@@ -6,8 +6,8 @@ test_suite(
         ":test_cloud_sql_proxy",
         ":test_dr_elephant",
         ":test_hive_hcatalog",
-        ":test_starburst_presto",
         ":test_spark_rapids",
+        ":test_starburst_presto",
         "//alluxio:test_alluxio",
         "//atlas:test_atlas",
         "//bigtable:test_bigtable",
@@ -60,7 +60,10 @@ py_test(
     name = "test_cloud_sql_proxy",
     size = "enormous",
     srcs = ["cloud-sql-proxy/test_cloud_sql_proxy.py"],
-    data = ["cloud-sql-proxy/cloud-sql-proxy.sh", "cloud-sql-proxy/hivetest.hive"],
+    data = [
+        "cloud-sql-proxy/cloud-sql-proxy.sh",
+        "cloud-sql-proxy/hivetest.hive",
+    ],
     local = True,
     shard_count = 4,
     deps = [
@@ -114,10 +117,10 @@ py_test(
     size = "enormous",
     srcs = ["spark-rapids/test_spark_rapids.py"],
     data = [
+        "spark-rapids/mig.sh",
         "spark-rapids/spark-rapids.sh",
         "spark-rapids/verify_xgboost_spark_rapids.scala",
         "spark-rapids/verify_xgboost_spark_rapids_sql.scala",
-        "spark-rapids/mig.sh",
     ],
     local = True,
     shard_count = 3,
@@ -132,3 +135,19 @@ py_library(
     testonly = True,
     srcs = ["cloud-sql-proxy/pyspark_metastore_test.py"],
 )
+
+py_test(
+    name = "test_hive_lineage",
+    size = "enormous",
+    srcs = ["hive-lineage/test_hive_lineage.py"],
+    data = [
+        "hive-lineage/hive-lineage.sh",
+        "hive-lineage/hivetest.hive",
+    ],
+    local = True,
+    shard_count = 3,
+    deps = [
+        "//integration_tests:dataproc_test_case",
+        "@io_abseil_py//absl/testing:parameterized",
+    ],
+)
```
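
With the `test_hive_lineage` target registered, the new test can presumably be run like the other integration tests in this repository, e.g. (a sketch, assuming a configured Bazel workspace and GCP credentials):

```shell
bazel test :test_hive_lineage --test_output=streamed
```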

hive-lineage/README.md

Lines changed: 32 additions & 0 deletions
# Hive Lineage Initialization Action

## Using the initialization action

**:warning: NOTICE:** See
[best practices](/README.md#how-initialization-actions-are-used) for using
initialization actions in production.

You can use this initialization action to create a new Dataproc cluster with lineage enabled for Hive jobs.
Note that this feature is currently in preview.

```shell
REGION=<region>
CLUSTER_NAME=<cluster_name>
gcloud dataproc clusters create ${CLUSTER_NAME} \
    --region ${REGION} \
    --scopes cloud-platform \
    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/hive-lineage/hive-lineage.sh
```

If you want to run Hive jobs involving BigQuery tables, the Hive-BigQuery connector needs to be installed as well;
see the [connectors](../connectors/README.md) init action.

```shell
REGION=<region>
CLUSTER_NAME=<cluster_name>
gcloud dataproc clusters create ${CLUSTER_NAME} \
    --region ${REGION} \
    --scopes cloud-platform \
    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/connectors/connectors.sh,gs://goog-dataproc-initialization-actions-${REGION}/hive-lineage/hive-lineage.sh \
    --metadata hive-bigquery-connector-version=2.0.3
```
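
Once the cluster is up, Hive jobs are submitted as usual, and the installed hook reports lineage for supported statements (e.g. CTAS queries like the test script in this commit). A minimal sketch of a job submission, reusing the variables above; the table name is hypothetical:

```shell
gcloud dataproc jobs submit hive \
    --cluster ${CLUSTER_NAME} \
    --region ${REGION} \
    --execute "CREATE TABLE lineage_smoke_test AS SELECT 1 AS id;"  # hypothetical table, used only to trigger a lineage event
```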

hive-lineage/__init__.py

Lines changed: 1 addition & 0 deletions
```python
# required for integration tests
```

hive-lineage/hive-lineage.sh

Lines changed: 64 additions & 0 deletions
```bash
#!/bin/bash
#
# Copyright 2025 Google LLC and contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This initialization script installs the required
# jars and sets the hive conf to enable lineage.

set -euxo pipefail

function prepare_env() {
  export HIVE_HOME="/usr/lib/hive"
  export HIVE_CONF_DIR="/etc/hive/conf"
  export HIVE_CONF_FILE="$HIVE_CONF_DIR/hive-site.xml"
  export HIVE_LIB_DIR="$HIVE_HOME/lib"
  export INSTALLATION_SOURCE="gs://hadoop-lib/hive-lineage"
  export HIVE_OL_HOOK_VERSION="1.0.0-preview"
  export HIVE_OL_HOOK="io.openlineage.hive.hooks.HiveOpenLineageHook"
}

function set_hive_lineage_conf() {
  declare -A properties=(
    ["hive.exec.post.hooks"]="$HIVE_OL_HOOK"
    ["hive.exec.failure.hooks"]="$HIVE_OL_HOOK"
    ["hive.openlineage.transport.type"]="gcplineage"
    ["hive.conf.validation"]="false" # to allow custom properties, like hive.openlineage.namespace
  )
  echo "Setting hive conf to enable lineage"
  for key in "${!properties[@]}"; do
    bdconfig set_property \
      --configuration_file="$HIVE_CONF_FILE" \
      --name "$key" \
      --value "${properties[$key]}"
  done
}

function install_jars() {
  echo "Installing openlineage-hive hook"
  gsutil cp -P "$INSTALLATION_SOURCE/hive-openlineage-hook-$HIVE_OL_HOOK_VERSION.jar" "$HIVE_LIB_DIR/hive-openlineage-hook.jar"
}

function restart_hive_server2_master() {
  ROLE=$(curl -f -s -H Metadata-Flavor:Google http://metadata/computeMetadata/v1/instance/attributes/dataproc-role)
  if [[ "${ROLE}" == 'Master' ]]; then
    echo "Restarting hive-server2"
    sudo systemctl restart hive-server2.service
  fi
}

prepare_env
install_jars
set_hive_lineage_conf
restart_hive_server2_master
```
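
A quick way to sanity-check the script on a master node might be to confirm the jar landed and the hook property was written (illustrative commands, not part of this commit):

```shell
# verify the hook jar was copied into Hive's lib directory
ls -l /usr/lib/hive/lib/hive-openlineage-hook.jar
# verify the post-execution hook is registered in hive-site.xml
grep -A1 "hive.exec.post.hooks" /etc/hive/conf/hive-site.xml
```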

hive-lineage/hivetest.hive

Lines changed: 23 additions & 0 deletions
```sql
DROP TABLE IF EXISTS validate_hive_tbl;
DROP TABLE IF EXISTS grouped_tbl;

CREATE EXTERNAL TABLE validate_hive_tbl (
  shell_user STRING,
  dummy STRING,
  uid INT,
  gid INT,
  name STRING,
  home STRING,
  shell STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ':';

CREATE TABLE grouped_tbl
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
AS SELECT shell, COUNT(*) shell_count
FROM validate_hive_tbl
GROUP BY shell
ORDER BY shell_count DESC, shell DESC;

SELECT * FROM grouped_tbl;
```
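
The integration test below submits this file as a Hive job. Done by hand, the submission would look roughly like this (a sketch, assuming the script is staged at the same GCS prefix as the init action):

```shell
gcloud dataproc jobs submit hive \
    --cluster ${CLUSTER_NAME} \
    --region ${REGION} \
    --file gs://goog-dataproc-initialization-actions-${REGION}/hive-lineage/hivetest.hive
```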

hive-lineage/test_hive_lineage.py

Lines changed: 30 additions & 0 deletions
```python
from absl.testing import absltest
from absl.testing import parameterized

from integration_tests.dataproc_test_case import DataprocTestCase


class HiveLineageTestCase(DataprocTestCase):
    COMPONENT = "hive-lineage"
    INIT_ACTIONS = ["hive-lineage/hive-lineage.sh"]
    TEST_SCRIPT_FILE = "hive-lineage/hivetest.hive"

    def __submit_hive_job(self, cluster_name):
        self.assert_dataproc_job(
            cluster_name, 'hive', '--file={}/{}'.format(self.INIT_ACTIONS_REPO,
                                                        self.TEST_SCRIPT_FILE))

    def verify_cluster(self, name):
        self.__submit_hive_job(name)

    @parameterized.parameters(
        'STANDARD',
        'HA',
    )
    def test_hive_job_success(self, configuration):
        self.createCluster(configuration,
                           self.INIT_ACTIONS,
                           scopes='cloud-platform')
        self.verify_cluster(self.getClusterName())


if __name__ == "__main__":
    absltest.main()
```
