From cc8e4033490cc57b67b77d3f2517a77e0df068e2 Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Fri, 17 Oct 2025 12:18:40 -0400 Subject: [PATCH 01/99] Starting to work on export part sanity test. --- s3/regression.py | 3 + s3/tests/export_part.py | 139 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 s3/tests/export_part.py diff --git a/s3/regression.py b/s3/regression.py index ac1500533..44d836514 100755 --- a/s3/regression.py +++ b/s3/regression.py @@ -603,6 +603,9 @@ def minio_regression( Feature(test=load("s3.tests.remote_s3_function", "minio"))( uri=uri_bucket_file, bucket_prefix=bucket_prefix ) + Feature(test=load("s3.tests.export_part", "minio"))( + uri=uri_bucket_file, bucket_prefix=bucket_prefix + ) @TestFeature diff --git a/s3/tests/export_part.py b/s3/tests/export_part.py new file mode 100644 index 000000000..7b1601afb --- /dev/null +++ b/s3/tests/export_part.py @@ -0,0 +1,139 @@ +import time +import json + +from testflows.core import * +from testflows.asserts import error +from s3.tests.common import * +from helpers.tables import * + + +@TestStep(When) +def export_events(self): + """Get the number of successful parts exports from the system.events table.""" + node = self.context.node + output = node.query( + "SELECT name, value FROM system.events WHERE name LIKE '%%Export%%' FORMAT JSONEachRow", + exitcode=0, + ).output + return { + row["name"]: int(row["value"]) + for row in [json.loads(row) for row in output.splitlines()] + } + + +@TestStep(When) +def export_part(self, parts, source, destination): + """Alter export of parts.""" + node = self.context.node + for part in parts: + node.query( + f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source.name} EXPORT PART '{part}' TO TABLE {destination.name}", + # settings=[("allow_experimental_export_merge_tree_part", 1)], + exitcode=0, + ) + + +@TestStep(When) +def get_parts(self, table): + """Get all parts for a table.""" + node = self.context.node + output = node.query( + f"SELECT name FROM system.parts WHERE table = '{table.name}'", exitcode=0 + ).output + return [row.strip() for row in output.splitlines()] + + +@TestStep(When) +def stop_merges(self, table): + """Stop merges for a table.""" + node = self.context.node + node.query(f"SYSTEM STOP MERGES {table.name}", exitcode=0) + + +@TestScenario +def sanity(self): + """Check that ClickHouse can export data parts to S3 storage.""" + node = self.context.node + + with Given("I create a source table"): + source = create_table( + name="source_table_" + getuid(), + columns=[ + Column(name="p", datatype=UInt16()), + Column(name="i", datatype=UInt64()), + ], + partition_by="p", + order_by="tuple()", + engine="MergeTree", + ) + + with And("I create a destination table"): + table_name = "destination_table_" + getuid() + destination = create_table( + name=table_name, + columns=[ + Column(name="p", datatype=UInt16()), + Column(name="i", datatype=UInt64()), + ], + partition_by="p", + engine=f""" + S3( + '{self.context.uri}', + '{self.context.access_key_id}', + '{self.context.secret_access_key}', + format='Parquet', + compression='auto', + partition_strategy='hive' + ) + """, + ) + + with And("I turn off merges for source table"): + stop_merges(source) + + with When("I insert data into the source table"): + for i in range(10): + node.query(f"INSERT INTO {source.name} VALUES ({i % 10}, {i})", exitcode=0) + + with And("I get a list of parts for source table"): + parts = get_parts(source) + + with And("I read current export 
events"): + events_before = export_events() + + with When("I export parts to the destination table"): + for _ in range(10): + export_part(parts, source, destination) + + with And("I check that all exports are successful"): + events_after = export_events() + assert ( + events_after["PartsExports"] == events_before["PartsExports"] + 10 + ), error() + + +@TestOutline(Feature) +@Requirements( + # TBD +) +def outline(self): + """Run export part scenarios.""" + + for scenario in loads(current_module(), Scenario): + Scenario(run=scenario, flags=TE) + + +@TestFeature +@Requirements() +@Name("export part") +def minio(self, uri, bucket_prefix): + + with Given("a temporary s3 path"): + temp_s3_path = temporary_bucket_path( + bucket_prefix=f"{bucket_prefix}/export_part" + ) + + self.context.uri = f"{uri}export_part/{temp_s3_path}/" + self.context.bucket_path = f"{bucket_prefix}/export_part/{temp_s3_path}" + + outline() From e1f20862244cfe511f4c67dd8ae2cdd9783ac7ad Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Fri, 17 Oct 2025 12:25:56 -0400 Subject: [PATCH 02/99] Fixing export_events. --- s3/tests/export_part.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/s3/tests/export_part.py b/s3/tests/export_part.py index 7b1601afb..2c309eb11 100644 --- a/s3/tests/export_part.py +++ b/s3/tests/export_part.py @@ -89,26 +89,26 @@ def sanity(self): ) with And("I turn off merges for source table"): - stop_merges(source) + stop_merges(table=source) with When("I insert data into the source table"): for i in range(10): node.query(f"INSERT INTO {source.name} VALUES ({i % 10}, {i})", exitcode=0) with And("I get a list of parts for source table"): - parts = get_parts(source) + parts = get_parts(table=source) with And("I read current export events"): events_before = export_events() with When("I export parts to the destination table"): for _ in range(10): - export_part(parts, source, destination) + export_part(parts=parts, source=source, destination=destination) with And("I check that all exports are successful"): events_after = export_events() assert ( - events_after["PartsExports"] == events_before["PartsExports"] + 10 + events_after["PartsExports"] == events_before.get("PartsExports", 0) + 10 ), error() From 6c6adaa427d8ddd15e6ff72dfbe6cc6c7fdd8ddb Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Fri, 17 Oct 2025 12:49:25 -0400 Subject: [PATCH 03/99] Updates. 
--- helpers/cluster.py | 2 +- helpers/tables.py | 24 +++++- s3/regression.py | 2 +- s3/tests/export_part.py | 139 -------------------------------- s3/tests/export_part/feature.py | 68 ++++++++++++++++ s3/tests/export_part/steps.py | 85 +++++++++++++++++++ 6 files changed, 177 insertions(+), 143 deletions(-) delete mode 100644 s3/tests/export_part.py create mode 100644 s3/tests/export_part/feature.py create mode 100644 s3/tests/export_part/steps.py diff --git a/helpers/cluster.py b/helpers/cluster.py index 434f68bdb..3c70861c8 100755 --- a/helpers/cluster.py +++ b/helpers/cluster.py @@ -1006,7 +1006,7 @@ def query( sql, message=None, exitcode=None, - steps=True, + steps=False, no_checks=False, raise_on_exception=False, ignore_exception=False, diff --git a/helpers/tables.py b/helpers/tables.py index b7c385c87..cc101daff 100644 --- a/helpers/tables.py +++ b/helpers/tables.py @@ -343,9 +343,10 @@ def generate_all_map_column_types(): class Table: - def __init__(self, name, columns, engine): + def __init__(self, name, columns, partition_by, engine): self.name = name self.columns = columns + self.partition_by = partition_by self.engine = engine def insert_test_data( @@ -401,6 +402,23 @@ def insert_test_data( return result, values return result + def get_parts(self, node=None): + """Get all parts for a table.""" + if node is None: + node = current().context.node + + output = node.query( + f"SELECT name FROM system.parts WHERE table = '{self.name}'", exitcode=0 + ).output + return [row.strip() for row in output.splitlines()] + + def stop_merges(self, node=None): + """Stop merges for a table.""" + if node is None: + node = current().context.node + + node.query(f"SYSTEM STOP MERGES {self.name}", exitcode=0) + @TestStep(Given) def create_table( @@ -484,7 +502,9 @@ def create_table( settings=settings, ) - yield Table(name, columns, engine) + yield Table( + name=name, columns=columns, partition_by=partition_by, engine=engine + ) finally: with Finally(f"drop the table {name}"): diff --git a/s3/regression.py b/s3/regression.py index 44d836514..77717e48d 100755 --- a/s3/regression.py +++ b/s3/regression.py @@ -603,7 +603,7 @@ def minio_regression( Feature(test=load("s3.tests.remote_s3_function", "minio"))( uri=uri_bucket_file, bucket_prefix=bucket_prefix ) - Feature(test=load("s3.tests.export_part", "minio"))( + Feature(test=load("s3.tests.export_part.feature", "minio"))( uri=uri_bucket_file, bucket_prefix=bucket_prefix ) diff --git a/s3/tests/export_part.py b/s3/tests/export_part.py deleted file mode 100644 index 2c309eb11..000000000 --- a/s3/tests/export_part.py +++ /dev/null @@ -1,139 +0,0 @@ -import time -import json - -from testflows.core import * -from testflows.asserts import error -from s3.tests.common import * -from helpers.tables import * - - -@TestStep(When) -def export_events(self): - """Get the number of successful parts exports from the system.events table.""" - node = self.context.node - output = node.query( - "SELECT name, value FROM system.events WHERE name LIKE '%%Export%%' FORMAT JSONEachRow", - exitcode=0, - ).output - return { - row["name"]: int(row["value"]) - for row in [json.loads(row) for row in output.splitlines()] - } - - -@TestStep(When) -def export_part(self, parts, source, destination): - """Alter export of parts.""" - node = self.context.node - for part in parts: - node.query( - f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source.name} EXPORT PART '{part}' TO TABLE {destination.name}", - # settings=[("allow_experimental_export_merge_tree_part", 1)], - 
exitcode=0, - ) - - -@TestStep(When) -def get_parts(self, table): - """Get all parts for a table.""" - node = self.context.node - output = node.query( - f"SELECT name FROM system.parts WHERE table = '{table.name}'", exitcode=0 - ).output - return [row.strip() for row in output.splitlines()] - - -@TestStep(When) -def stop_merges(self, table): - """Stop merges for a table.""" - node = self.context.node - node.query(f"SYSTEM STOP MERGES {table.name}", exitcode=0) - - -@TestScenario -def sanity(self): - """Check that ClickHouse can export data parts to S3 storage.""" - node = self.context.node - - with Given("I create a source table"): - source = create_table( - name="source_table_" + getuid(), - columns=[ - Column(name="p", datatype=UInt16()), - Column(name="i", datatype=UInt64()), - ], - partition_by="p", - order_by="tuple()", - engine="MergeTree", - ) - - with And("I create a destination table"): - table_name = "destination_table_" + getuid() - destination = create_table( - name=table_name, - columns=[ - Column(name="p", datatype=UInt16()), - Column(name="i", datatype=UInt64()), - ], - partition_by="p", - engine=f""" - S3( - '{self.context.uri}', - '{self.context.access_key_id}', - '{self.context.secret_access_key}', - format='Parquet', - compression='auto', - partition_strategy='hive' - ) - """, - ) - - with And("I turn off merges for source table"): - stop_merges(table=source) - - with When("I insert data into the source table"): - for i in range(10): - node.query(f"INSERT INTO {source.name} VALUES ({i % 10}, {i})", exitcode=0) - - with And("I get a list of parts for source table"): - parts = get_parts(table=source) - - with And("I read current export events"): - events_before = export_events() - - with When("I export parts to the destination table"): - for _ in range(10): - export_part(parts=parts, source=source, destination=destination) - - with And("I check that all exports are successful"): - events_after = export_events() - assert ( - events_after["PartsExports"] == events_before.get("PartsExports", 0) + 10 - ), error() - - -@TestOutline(Feature) -@Requirements( - # TBD -) -def outline(self): - """Run export part scenarios.""" - - for scenario in loads(current_module(), Scenario): - Scenario(run=scenario, flags=TE) - - -@TestFeature -@Requirements() -@Name("export part") -def minio(self, uri, bucket_prefix): - - with Given("a temporary s3 path"): - temp_s3_path = temporary_bucket_path( - bucket_prefix=f"{bucket_prefix}/export_part" - ) - - self.context.uri = f"{uri}export_part/{temp_s3_path}/" - self.context.bucket_path = f"{bucket_prefix}/export_part/{temp_s3_path}" - - outline() diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py new file mode 100644 index 000000000..248d39163 --- /dev/null +++ b/s3/tests/export_part/feature.py @@ -0,0 +1,68 @@ +from testflows.core import * +from testflows.asserts import error + +from helpers.tables import * +from s3.tests.common import * +from s3.tests.export_part.steps import * + + +@TestScenario +def sanity(self): + """Check that ClickHouse can export data parts to S3 storage.""" + node = self.context.node + + with Given("I create a source table"): + source = create_source_table() + + with And("I create a destination table"): + destination = create_destination_table(source=source) + + with And("I turn off merges for source table"): + source.stop_merges() + + with When("I insert data into the source table"): + for i in range(10): + node.query(f"INSERT INTO {source.name} VALUES ({i % 10}, {i})", exitcode=0) + + with 
And("I get a list of parts for source table"): + parts = source.get_parts() + + with And("I read current export events"): + events_before = export_events() + + with When("I export parts to the destination table"): + for _ in range(10): + export_part(parts=parts, source=source, destination=destination) + + with And("I check that all exports are successful"): + events_after = export_events() + assert ( + events_after["PartsExports"] == events_before.get("PartsExports", 0) + 10 + ), error() + + +@TestOutline(Feature) +@Requirements( + # TBD +) +def outline(self): + """Run export part scenarios.""" + + for scenario in loads(current_module(), Scenario): + Scenario(run=scenario, flags=TE) + + +@TestFeature +@Requirements() +@Name("export part") +def minio(self, uri, bucket_prefix): + + with Given("a temporary s3 path"): + temp_s3_path = temporary_bucket_path( + bucket_prefix=f"{bucket_prefix}/export_part" + ) + + self.context.uri = f"{uri}export_part/{temp_s3_path}/" + self.context.bucket_path = f"{bucket_prefix}/export_part/{temp_s3_path}" + + outline() diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py new file mode 100644 index 000000000..4c00eb446 --- /dev/null +++ b/s3/tests/export_part/steps.py @@ -0,0 +1,85 @@ +import json + +from testflows.core import * +from helpers.common import getuid +from helpers.tables import * + + +@TestStep(When) +def export_events(self): + """Get the number of successful parts exports from the system.events table.""" + node = self.context.node + output = node.query( + "SELECT name, value FROM system.events WHERE name LIKE '%%Export%%' FORMAT JSONEachRow", + exitcode=0, + ).output + return { + row["name"]: int(row["value"]) + for row in [json.loads(row) for row in output.splitlines()] + } + + +@TestStep(When) +def export_part(self, parts, source, destination): + """Alter export of parts.""" + node = self.context.node + for part in parts: + node.query( + f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source.name} EXPORT PART '{part}' TO TABLE {destination.name}", + # settings=[("allow_experimental_export_merge_tree_part", 1)], + exitcode=0, + ) + + +@TestStep(When) +def create_source_table( + self, columns=None, partition_by=None, order_by=None, engine=None +): + """Create a source table.""" + + if columns is None: + columns = [ + Column(name="p", datatype=UInt16()), + Column(name="i", datatype=UInt64()), + ] + if partition_by is None: + partition_by = columns[0].name + if order_by is None: + order_by = "tuple()" + if engine is None: + engine = "MergeTree" + + source = create_table( + name="source_table_" + getuid(), + columns=columns, + partition_by=partition_by, + order_by=order_by, + engine=engine, + ) + + return source + + +@TestStep(When) +def create_destination_table(self, source, engine=None): + """Create a destination table.""" + if engine is None: + engine = f""" + S3( + '{self.context.uri}', + '{self.context.access_key_id}', + '{self.context.secret_access_key}', + format='Parquet', + compression='auto', + partition_strategy='hive' + ) + """ + + destination = create_table( + name="destination_table_" + getuid(), + columns=source.columns, + partition_by=source.partition_by, + engine=engine, + ) + + return destination From ebaf3fc11cb2a4262bd00f2ca47b29975574be0b Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Fri, 17 Oct 2025 13:15:36 -0400 Subject: [PATCH 04/99] Updating sanity test. 
--- s3/tests/export_part/feature.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 248d39163..cfe0aaf75 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -9,7 +9,6 @@ @TestScenario def sanity(self): """Check that ClickHouse can export data parts to S3 storage.""" - node = self.context.node with Given("I create a source table"): source = create_source_table() @@ -20,9 +19,8 @@ def sanity(self): with And("I turn off merges for source table"): source.stop_merges() - with When("I insert data into the source table"): - for i in range(10): - node.query(f"INSERT INTO {source.name} VALUES ({i % 10}, {i})", exitcode=0) + with When("I insert test data into the source table"): + source.insert_test_data(row_count=10, cardinality=1) with And("I get a list of parts for source table"): parts = source.get_parts() @@ -30,15 +28,16 @@ def sanity(self): with And("I read current export events"): events_before = export_events() - with When("I export parts to the destination table"): - for _ in range(10): - export_part(parts=parts, source=source, destination=destination) + with And("I export parts to the destination table"): + export_part(parts=parts, source=source, destination=destination) - with And("I check that all exports are successful"): + with Then("I check system.events that all exports are successful"): events_after = export_events() - assert ( - events_after["PartsExports"] == events_before.get("PartsExports", 0) + 10 - ), error() + assert events_after["PartsExports"] == events_before.get( + "PartsExports", 0 + ) + len(parts), error() + + # FIXME: read back data and assert destination matches source @TestOutline(Feature) From 9ce37c0a428a9fabfc17a55b30e84b89d53cedb3 Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Sun, 19 Oct 2025 15:27:38 -0400 Subject: [PATCH 05/99] Export part sanity test --- s3/tests/export_part/feature.py | 11 +++++++---- s3/tests/export_part/steps.py | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index cfe0aaf75..5fa99003a 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -23,21 +23,24 @@ def sanity(self): source.insert_test_data(row_count=10, cardinality=1) with And("I get a list of parts for source table"): - parts = source.get_parts() + source_parts = source.get_parts() with And("I read current export events"): events_before = export_events() with And("I export parts to the destination table"): - export_part(parts=parts, source=source, destination=destination) + export_part(parts=source_parts, source=source, destination=destination) with Then("I check system.events that all exports are successful"): events_after = export_events() assert events_after["PartsExports"] == events_before.get( "PartsExports", 0 - ) + len(parts), error() + ) + len(source_parts), error() - # FIXME: read back data and assert destination matches source + with And("I read back data and assert destination matches source"): + destination_data = destination.select_ordered_by_partition_and_index() + source_data = source.select_ordered_by_partition_and_index() + assert destination_data == source_data, error() @TestOutline(Feature) diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 4c00eb446..11b835af2 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -82,4 +82,4 @@ def 
create_destination_table(self, source, engine=None): engine=engine, ) - return destination + return destination \ No newline at end of file From 84333c891bf3d66fc6ac1f1cd9b776e172585ee4 Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Sun, 19 Oct 2025 15:29:10 -0400 Subject: [PATCH 06/99] Helpers tables querying common function --- helpers/tables.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/helpers/tables.py b/helpers/tables.py index cc101daff..aed670df5 100644 --- a/helpers/tables.py +++ b/helpers/tables.py @@ -418,6 +418,17 @@ def stop_merges(self, node=None): node = current().context.node node.query(f"SYSTEM STOP MERGES {self.name}", exitcode=0) + + def query(self, query, node=None): + """Query data from a table.""" + if node is None: + node = current().context.node + + return node.query(query, exitcode=0).output + + def select_ordered_by_partition_and_index(self, node=None): + """Select all data from a table ordered by partition and index columns.""" + return self.query(f"SELECT * FROM {self.name} ORDER BY p, i", node=node) @TestStep(Given) From 71828e6bd2ada272b70cba5c2bd885f063c1699c Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Sun, 19 Oct 2025 15:34:11 -0400 Subject: [PATCH 07/99] Clean up export_part sanity test --- s3/tests/export_part/feature.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 5fa99003a..b4aad16a8 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -19,8 +19,8 @@ def sanity(self): with And("I turn off merges for source table"): source.stop_merges() - with When("I insert test data into the source table"): - source.insert_test_data(row_count=10, cardinality=1) + with When("I insert random test data into the source table"): + source.insert_test_data() # default row_count=10, cardinality=1 with And("I get a list of parts for source table"): source_parts = source.get_parts() From dd5e27f2707735911d1399892b9d1ff87637059c Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Sun, 19 Oct 2025 17:20:54 -0400 Subject: [PATCH 08/99] invalid part export test --- s3/tests/export_part/feature.py | 31 +++++++++++++++++++++++++++++++ s3/tests/export_part/steps.py | 14 ++++++++++---- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index b4aad16a8..4bb0aa5fe 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -6,6 +6,10 @@ from s3.tests.export_part.steps import * +# TODO: simplify tests by using the same source and destination tables for all scenarios? +# is this better than using different tables for each scenario? +# regardless, i think the tests can be cleaned up, maybe create a "setup" step for making source/destination tables, turning off merges, inserting test data, etc. 
+ @TestScenario def sanity(self): """Check that ClickHouse can export data parts to S3 storage.""" @@ -43,6 +47,33 @@ def sanity(self): assert destination_data == source_data, error() +@TestScenario +def invalid_part_name(self): + """Check that exporting a non-existent part returns the correct error.""" + + with Given("I create a source table"): + source = create_source_table() + + with And("I create a destination table"): + destination = create_destination_table(source=source) + + with And("I turn off merges for source table"): + source.stop_merges() + + with When("I insert random test data into the source table"): + source.insert_test_data() # default row_count=10, cardinality=1 + + with And("I create an invalid part name"): + invalid_part_name = "in_va_lid_part" + + with Then("I try to export the invalid part and expect an error"): + results = export_part(parts=[invalid_part_name], source=source, destination=destination, exitcode=1) + assert len(results) == 1, error() + # note(f"Result: {results[0].output}") + assert results[0].exitcode == 233, error() + assert f"Unexpected part name: {invalid_part_name}" in results[0].output, error() + + @TestOutline(Feature) @Requirements( # TBD diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 11b835af2..61f4cc90e 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -20,16 +20,22 @@ def export_events(self): @TestStep(When) -def export_part(self, parts, source, destination): +def export_part(self, parts, source, destination, exitcode=0): """Alter export of parts.""" node = self.context.node + + no_checks = exitcode != 0 + results = [] + for part in parts: - node.query( + results.append(node.query( f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source.name} EXPORT PART '{part}' TO TABLE {destination.name}", # settings=[("allow_experimental_export_merge_tree_part", 1)], - exitcode=0, - ) + exitcode=exitcode, + no_checks=no_checks + )) + return results @TestStep(When) def create_source_table( From 67cff9c4b38e3517509d70c11e03bbe36432a350 Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Mon, 20 Oct 2025 08:32:01 -0400 Subject: [PATCH 09/99] Duplicate exports test --- s3/tests/export_part/feature.py | 77 +++++++++++++++++++++++++++++---- s3/tests/export_part/steps.py | 1 + 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 4bb0aa5fe..13aaa32d3 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -1,3 +1,5 @@ +import random + from testflows.core import * from testflows.asserts import error @@ -6,9 +8,13 @@ from s3.tests.export_part.steps import * -# TODO: simplify tests by using the same source and destination tables for all scenarios? -# is this better than using different tables for each scenario? -# regardless, i think the tests can be cleaned up, maybe create a "setup" step for making source/destination tables, turning off merges, inserting test data, etc. +# TODO: Simplify tests by using the same source and destination tables for all scenarios? + # Is this better than using different tables for each scenario? +# Regardless, I think the tests can be cleaned up. Maybe create a "setup" step for making + # source/destination tables, turning off merges, inserting test data, etc. +# Also, the insert_test_data function is not actually random (relies on a seed), so it will insert + # the same data which can cause exports to fail if the part name is the same. 
+ # This can be seen by running sanity after duplicate_exports. @TestScenario def sanity(self): @@ -24,7 +30,7 @@ def sanity(self): source.stop_merges() with When("I insert random test data into the source table"): - source.insert_test_data() # default row_count=10, cardinality=1 + source.insert_test_data(random=random.Random(0)) # default row_count=10, cardinality=1 with And("I get a list of parts for source table"): source_parts = source.get_parts() @@ -33,17 +39,24 @@ def sanity(self): events_before = export_events() with And("I export parts to the destination table"): + note(f"Exporting parts: {source_parts}") + note(f"Source table: {source.name}") + note(f"Destination table: {destination.name}") export_part(parts=source_parts, source=source, destination=destination) with Then("I check system.events that all exports are successful"): events_after = export_events() - assert events_after["PartsExports"] == events_before.get( - "PartsExports", 0 - ) + len(source_parts), error() + note(f"Events before: {events_before}") + note(f"Events after: {events_after}") + total_exports_after = events_after.get("PartsExports", 0) + events_after.get("PartsExportDuplicated", 0) + total_exports_before = events_before.get("PartsExports", 0) + events_before.get("PartsExportDuplicated", 0) + assert total_exports_after == total_exports_before + len(source_parts), error() with And("I read back data and assert destination matches source"): destination_data = destination.select_ordered_by_partition_and_index() source_data = source.select_ordered_by_partition_and_index() + note(f"Destination data: {destination_data}") + note(f"Source data: {source_data}") assert destination_data == source_data, error() @@ -61,7 +74,7 @@ def invalid_part_name(self): source.stop_merges() with When("I insert random test data into the source table"): - source.insert_test_data() # default row_count=10, cardinality=1 + source.insert_test_data(random=random.Random(1)) # default row_count=10, cardinality=1 with And("I create an invalid part name"): invalid_part_name = "in_va_lid_part" @@ -74,6 +87,52 @@ def invalid_part_name(self): assert f"Unexpected part name: {invalid_part_name}" in results[0].output, error() +@TestScenario +def duplicate_exports(self): + """Check that duplicate export attempts are properly tracked in system.events.""" + + with Given("I create a source table"): + source = create_source_table() + + with And("I create a destination table"): + destination = create_destination_table(source=source) + + with And("I turn off merges for source table"): + source.stop_merges() + + with When("I insert random test data into the source table"): + source.insert_test_data(random=random.Random(2)) # default row_count=10, cardinality=1 + + with And("I get a list of parts for source table"): + source_parts = source.get_parts() + test_part = source_parts[1] + + with And("I read initial export events"): + events_initial = export_events() + initial_exports = events_initial.get("PartsExports", 0) + initial_duplicates = events_initial.get("PartsExportDuplicated", 0) + note(f"Initial events - Exports: {initial_exports}, Duplicates: {initial_duplicates}") + + with When("I export the same part twice"): + note(f"Exporting part: {test_part}") + note(f"Source table: {source.name}") + note(f"Destination table: {destination.name}") + # Export same part twice + export_part(parts=[test_part], source=source, destination=destination) + export_part(parts=[test_part], source=source, destination=destination) + + with Then("I check system.events for 
duplicate tracking"): + events_final = export_events() + final_exports = events_final.get("PartsExports", 0) + final_duplicates = events_final.get("PartsExportDuplicated", 0) + + note(f"Final events - Exports: {final_exports}, Duplicates: {final_duplicates}") + # 1 successful export + assert final_exports - initial_exports == 1, error() + # 1 of the exports was counted as a duplicate + assert final_duplicates - initial_duplicates == 1, error() + + @TestOutline(Feature) @Requirements( # TBD @@ -98,4 +157,4 @@ def minio(self, uri, bucket_prefix): self.context.uri = f"{uri}export_part/{temp_s3_path}/" self.context.bucket_path = f"{bucket_prefix}/export_part/{temp_s3_path}" - outline() + outline() \ No newline at end of file diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 61f4cc90e..145085fee 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -27,6 +27,7 @@ def export_part(self, parts, source, destination, exitcode=0): no_checks = exitcode != 0 results = [] + # we should be able to set the settings here instead of using the SET query, but this is a quick workaround for the bug for part in parts: results.append(node.query( f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source.name} EXPORT PART '{part}' TO TABLE {destination.name}", From cf928d0ab99bc8fcf0bc62bda9d3306d1d4f21b0 Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 20 Oct 2025 08:59:00 -0400 Subject: [PATCH 10/99] Updates. --- s3/tests/export_part/feature.py | 11 ++++------- s3/tests/export_part/steps.py | 23 +++++++++++++++++++++-- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 13aaa32d3..22303bc45 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -148,13 +148,10 @@ def outline(self): @Requirements() @Name("export part") def minio(self, uri, bucket_prefix): + self.context.uri_base = uri + self.context.bucket_prefix = bucket_prefix with Given("a temporary s3 path"): - temp_s3_path = temporary_bucket_path( - bucket_prefix=f"{bucket_prefix}/export_part" - ) + create_temp_bucket() - self.context.uri = f"{uri}export_part/{temp_s3_path}/" - self.context.bucket_path = f"{bucket_prefix}/export_part/{temp_s3_path}" - - outline() \ No newline at end of file + outline() diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 145085fee..e0df228dd 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -3,6 +3,23 @@ from testflows.core import * from helpers.common import getuid from helpers.tables import * +from s3.tests.common import temporary_bucket_path + +@TestStep(Given) +def create_temp_bucket(self, uri=None, bucket_prefix=None): + """Create temporary s3 bucket.""" + if uri is None: + uri = self.context.uri_base + + if bucket_prefix is None: + bucket_prefix = self.context.bucket_prefix + + temp_s3_path = temporary_bucket_path( + bucket_prefix=f"{bucket_prefix}/export_part" + ) + + self.context.uri = f"{uri}export_part/{temp_s3_path}/" + self.context.bucket_path = f"{bucket_prefix}/export_part/{temp_s3_path}" @TestStep(When) @@ -70,12 +87,14 @@ def create_source_table( @TestStep(When) def create_destination_table(self, source, engine=None): """Create a destination table.""" + name = "destination_table_" + getuid() if engine is None: engine = f""" S3( '{self.context.uri}', '{self.context.access_key_id}', '{self.context.secret_access_key}', + filename='{name}', format='Parquet', 
compression='auto', partition_strategy='hive' @@ -83,10 +102,10 @@ def create_destination_table(self, source, engine=None): """ destination = create_table( - name="destination_table_" + getuid(), + name=name, columns=source.columns, partition_by=source.partition_by, engine=engine, ) - return destination \ No newline at end of file + return destination From f06697480e3eba9b06891e66b5153df566b55ddb Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Mon, 20 Oct 2025 12:24:58 -0400 Subject: [PATCH 11/99] Clean up test setup, remove some debugging notes, implement unique s3 for each scenario --- s3/tests/export_part/feature.py | 48 +++++++-------------------------- s3/tests/export_part/steps.py | 12 +++++++++ 2 files changed, 21 insertions(+), 39 deletions(-) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 22303bc45..1b6765182 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -12,25 +12,16 @@ # Is this better than using different tables for each scenario? # Regardless, I think the tests can be cleaned up. Maybe create a "setup" step for making # source/destination tables, turning off merges, inserting test data, etc. -# Also, the insert_test_data function is not actually random (relies on a seed), so it will insert - # the same data which can cause exports to fail if the part name is the same. - # This can be seen by running sanity after duplicate_exports. @TestScenario def sanity(self): """Check that ClickHouse can export data parts to S3 storage.""" - with Given("I create a source table"): - source = create_source_table() - - with And("I create a destination table"): - destination = create_destination_table(source=source) - - with And("I turn off merges for source table"): - source.stop_merges() + with Given("I create source and destination tables"): + source, destination = create_source_and_destination_tables() with When("I insert random test data into the source table"): - source.insert_test_data(random=random.Random(0)) # default row_count=10, cardinality=1 + source.insert_test_data() # default row_count=10, cardinality=1 with And("I get a list of parts for source table"): source_parts = source.get_parts() @@ -64,17 +55,11 @@ def sanity(self): def invalid_part_name(self): """Check that exporting a non-existent part returns the correct error.""" - with Given("I create a source table"): - source = create_source_table() - - with And("I create a destination table"): - destination = create_destination_table(source=source) - - with And("I turn off merges for source table"): - source.stop_merges() + with Given("I create source and destination tables"): + source, destination = create_source_and_destination_tables() with When("I insert random test data into the source table"): - source.insert_test_data(random=random.Random(1)) # default row_count=10, cardinality=1 + source.insert_test_data() # default row_count=10, cardinality=1 with And("I create an invalid part name"): invalid_part_name = "in_va_lid_part" @@ -91,17 +76,11 @@ def invalid_part_name(self): def duplicate_exports(self): """Check that duplicate export attempts are properly tracked in system.events.""" - with Given("I create a source table"): - source = create_source_table() - - with And("I create a destination table"): - destination = create_destination_table(source=source) - - with And("I turn off merges for source table"): - source.stop_merges() + with Given("I create source and destination tables"): + source, destination = create_source_and_destination_tables() 
with When("I insert random test data into the source table"): - source.insert_test_data(random=random.Random(2)) # default row_count=10, cardinality=1 + source.insert_test_data() # default row_count=10, cardinality=1 with And("I get a list of parts for source table"): source_parts = source.get_parts() @@ -111,13 +90,8 @@ def duplicate_exports(self): events_initial = export_events() initial_exports = events_initial.get("PartsExports", 0) initial_duplicates = events_initial.get("PartsExportDuplicated", 0) - note(f"Initial events - Exports: {initial_exports}, Duplicates: {initial_duplicates}") with When("I export the same part twice"): - note(f"Exporting part: {test_part}") - note(f"Source table: {source.name}") - note(f"Destination table: {destination.name}") - # Export same part twice export_part(parts=[test_part], source=source, destination=destination) export_part(parts=[test_part], source=source, destination=destination) @@ -126,7 +100,6 @@ def duplicate_exports(self): final_exports = events_final.get("PartsExports", 0) final_duplicates = events_final.get("PartsExportDuplicated", 0) - note(f"Final events - Exports: {final_exports}, Duplicates: {final_duplicates}") # 1 successful export assert final_exports - initial_exports == 1, error() # 1 of the exports was counted as a duplicate @@ -151,7 +124,4 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - with Given("a temporary s3 path"): - create_temp_bucket() - outline() diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index e0df228dd..01c8a6dfc 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -5,6 +5,18 @@ from helpers.tables import * from s3.tests.common import temporary_bucket_path + +@TestStep(Given) +def create_source_and_destination_tables(self, stop_merges=True): + """Create source and destination tables.""" + create_temp_bucket() + source = create_source_table() + destination = create_destination_table(source=source) + if stop_merges: + source.stop_merges() + return source, destination + + @TestStep(Given) def create_temp_bucket(self, uri=None, bucket_prefix=None): """Create temporary s3 bucket.""" From 1f9e43da69afc696c1b58ac19b8e26a2458ecb6e Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Mon, 20 Oct 2025 14:03:15 -0400 Subject: [PATCH 12/99] Clean up notes --- s3/tests/export_part/feature.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 1b6765182..d2ea9e945 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -8,11 +8,6 @@ from s3.tests.export_part.steps import * -# TODO: Simplify tests by using the same source and destination tables for all scenarios? - # Is this better than using different tables for each scenario? -# Regardless, I think the tests can be cleaned up. Maybe create a "setup" step for making - # source/destination tables, turning off merges, inserting test data, etc. 
- @TestScenario def sanity(self): """Check that ClickHouse can export data parts to S3 storage.""" @@ -30,15 +25,10 @@ def sanity(self): events_before = export_events() with And("I export parts to the destination table"): - note(f"Exporting parts: {source_parts}") - note(f"Source table: {source.name}") - note(f"Destination table: {destination.name}") export_part(parts=source_parts, source=source, destination=destination) with Then("I check system.events that all exports are successful"): events_after = export_events() - note(f"Events before: {events_before}") - note(f"Events after: {events_after}") total_exports_after = events_after.get("PartsExports", 0) + events_after.get("PartsExportDuplicated", 0) total_exports_before = events_before.get("PartsExports", 0) + events_before.get("PartsExportDuplicated", 0) assert total_exports_after == total_exports_before + len(source_parts), error() @@ -46,8 +36,6 @@ def sanity(self): with And("I read back data and assert destination matches source"): destination_data = destination.select_ordered_by_partition_and_index() source_data = source.select_ordered_by_partition_and_index() - note(f"Destination data: {destination_data}") - note(f"Source data: {source_data}") assert destination_data == source_data, error() From 33d85e26a474e6c479781a610a5c4f8e5b1c0210 Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Wed, 22 Oct 2025 11:35:01 -0400 Subject: [PATCH 13/99] Initial categorization --- s3/tests/export_part/clusters.py | 15 +++ s3/tests/export_part/error_handling.py | 34 ++++++ s3/tests/export_part/feature.py | 135 ++++------------------ s3/tests/export_part/sanity.py | 44 +++++++ s3/tests/export_part/steps.py | 6 +- s3/tests/export_part/system_monitoring.py | 46 ++++++++ 6 files changed, 163 insertions(+), 117 deletions(-) create mode 100644 s3/tests/export_part/clusters.py create mode 100644 s3/tests/export_part/error_handling.py create mode 100644 s3/tests/export_part/sanity.py create mode 100644 s3/tests/export_part/system_monitoring.py diff --git a/s3/tests/export_part/clusters.py b/s3/tests/export_part/clusters.py new file mode 100644 index 000000000..0f8d54995 --- /dev/null +++ b/s3/tests/export_part/clusters.py @@ -0,0 +1,15 @@ +from testflows.core import * +from testflows.asserts import error + +from s3.tests.export_part.steps import * + + +# @TestScenario + + +@TestFeature +@Name("clusters") +def feature(self): + """Check functionality of exporting data parts to S3 storage from different clusters.""" + pass + # Scenario(run=) \ No newline at end of file diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py new file mode 100644 index 000000000..db09d0d7a --- /dev/null +++ b/s3/tests/export_part/error_handling.py @@ -0,0 +1,34 @@ +from testflows.core import * +from testflows.asserts import error + +from s3.tests.export_part.steps import * +from helpers.tables import * + + +@TestScenario +def invalid_part_name(self): + """Check that exporting a non-existent part returns the correct error.""" + + with Given("I create source and destination tables"): + source, destination = create_source_and_destination_tables() + + with When("I insert random test data into the source table"): + source.insert_test_data() # default row_count=10, cardinality=1 + + with And("I create an invalid part name"): + invalid_part_name = "in_va_lid_part" + + with Then("I try to export the invalid part and expect an error"): + results = export_part(parts=[invalid_part_name], source=source, destination=destination, exitcode=1) + assert 
len(results) == 1, error() + # note(f"Result: {results[0].output}") + assert results[0].exitcode == 233, error() + assert f"Unexpected part name: {invalid_part_name}" in results[0].output, error() + + +@TestFeature +@Name("error handling") +def feature(self): + """Check correct error handling when exporting parts.""" + + Scenario(run=invalid_part_name) \ No newline at end of file diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index d2ea9e945..e27ed3f27 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -1,115 +1,20 @@ -import random - -from testflows.core import * -from testflows.asserts import error - -from helpers.tables import * -from s3.tests.common import * -from s3.tests.export_part.steps import * - - -@TestScenario -def sanity(self): - """Check that ClickHouse can export data parts to S3 storage.""" - - with Given("I create source and destination tables"): - source, destination = create_source_and_destination_tables() - - with When("I insert random test data into the source table"): - source.insert_test_data() # default row_count=10, cardinality=1 - - with And("I get a list of parts for source table"): - source_parts = source.get_parts() - - with And("I read current export events"): - events_before = export_events() - - with And("I export parts to the destination table"): - export_part(parts=source_parts, source=source, destination=destination) - - with Then("I check system.events that all exports are successful"): - events_after = export_events() - total_exports_after = events_after.get("PartsExports", 0) + events_after.get("PartsExportDuplicated", 0) - total_exports_before = events_before.get("PartsExports", 0) + events_before.get("PartsExportDuplicated", 0) - assert total_exports_after == total_exports_before + len(source_parts), error() - - with And("I read back data and assert destination matches source"): - destination_data = destination.select_ordered_by_partition_and_index() - source_data = source.select_ordered_by_partition_and_index() - assert destination_data == source_data, error() - - -@TestScenario -def invalid_part_name(self): - """Check that exporting a non-existent part returns the correct error.""" - - with Given("I create source and destination tables"): - source, destination = create_source_and_destination_tables() - - with When("I insert random test data into the source table"): - source.insert_test_data() # default row_count=10, cardinality=1 - - with And("I create an invalid part name"): - invalid_part_name = "in_va_lid_part" - - with Then("I try to export the invalid part and expect an error"): - results = export_part(parts=[invalid_part_name], source=source, destination=destination, exitcode=1) - assert len(results) == 1, error() - # note(f"Result: {results[0].output}") - assert results[0].exitcode == 233, error() - assert f"Unexpected part name: {invalid_part_name}" in results[0].output, error() - - -@TestScenario -def duplicate_exports(self): - """Check that duplicate export attempts are properly tracked in system.events.""" - - with Given("I create source and destination tables"): - source, destination = create_source_and_destination_tables() - - with When("I insert random test data into the source table"): - source.insert_test_data() # default row_count=10, cardinality=1 - - with And("I get a list of parts for source table"): - source_parts = source.get_parts() - test_part = source_parts[1] - - with And("I read initial export events"): - events_initial = export_events() - initial_exports 
= events_initial.get("PartsExports", 0) - initial_duplicates = events_initial.get("PartsExportDuplicated", 0) - - with When("I export the same part twice"): - export_part(parts=[test_part], source=source, destination=destination) - export_part(parts=[test_part], source=source, destination=destination) - - with Then("I check system.events for duplicate tracking"): - events_final = export_events() - final_exports = events_final.get("PartsExports", 0) - final_duplicates = events_final.get("PartsExportDuplicated", 0) - - # 1 successful export - assert final_exports - initial_exports == 1, error() - # 1 of the exports was counted as a duplicate - assert final_duplicates - initial_duplicates == 1, error() - - -@TestOutline(Feature) -@Requirements( - # TBD -) -def outline(self): - """Run export part scenarios.""" - - for scenario in loads(current_module(), Scenario): - Scenario(run=scenario, flags=TE) - - -@TestFeature -@Requirements() -@Name("export part") -def minio(self, uri, bucket_prefix): - self.context.uri_base = uri - self.context.bucket_prefix = bucket_prefix - - outline() +from testflows.core import * + + +@TestFeature +@Name("export parts") +def minio(self, uri, bucket_prefix): + """Run features from the export parts suite.""" + + self.context.uri_base = uri + self.context.bucket_prefix = bucket_prefix + + self.context.node_1 = self.context.cluster.node("clickhouse1") + self.context.node_2 = self.context.cluster.node("clickhouse2") + self.context.node_3 = self.context.cluster.node("clickhouse3") + self.context.nodes = [self.context.node_1, self.context.node_2, self.context.node_3] + + Feature(run=load("s3.tests.export_part.sanity", "feature")) + Feature(run=load("s3.tests.export_part.error_handling", "feature")) + Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) + Feature(run=load("s3.tests.export_part.clusters", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py new file mode 100644 index 000000000..0cc754f31 --- /dev/null +++ b/s3/tests/export_part/sanity.py @@ -0,0 +1,44 @@ +from testflows.core import * +from testflows.asserts import error + +from s3.tests.export_part.steps import * +from helpers.tables import * + + +@TestScenario +def sanity(self): + """Check that ClickHouse can export data parts to S3 storage.""" + + with Given("I create source and destination tables"): + source, destination = create_source_and_destination_tables() + + with When("I insert random test data into the source table"): + source.insert_test_data() # default row_count=10, cardinality=1 + + with And("I get a list of parts for source table"): + source_parts = source.get_parts() + + with And("I read current export events"): + events_before = export_events() + + with And("I export parts to the destination table"): + export_part(parts=source_parts, source=source, destination=destination) + + with Then("I check system.events that all exports are successful"): + events_after = export_events() + total_exports_after = events_after.get("PartsExports", 0) + events_after.get("PartsExportDuplicated", 0) + total_exports_before = events_before.get("PartsExports", 0) + events_before.get("PartsExportDuplicated", 0) + assert total_exports_after == total_exports_before + len(source_parts), error() + + with And("I read back data and assert destination matches source"): + destination_data = destination.select_ordered_by_partition_and_index() + source_data = source.select_ordered_by_partition_and_index() + assert destination_data == source_data, error() + + 
+@TestFeature +@Name("sanity") +def feature(self): + """Check basic functionality of exporting data parts to S3 storage.""" + + Scenario(run=sanity) \ No newline at end of file diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 01c8a6dfc..277b4504b 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -49,9 +49,10 @@ def export_events(self): @TestStep(When) -def export_part(self, parts, source, destination, exitcode=0): +def export_part(self, parts, source, destination, exitcode=0, node=None): """Alter export of parts.""" - node = self.context.node + if node is None: + node = self.context.node no_checks = exitcode != 0 results = [] @@ -67,6 +68,7 @@ def export_part(self, parts, source, destination, exitcode=0): return results + @TestStep(When) def create_source_table( self, columns=None, partition_by=None, order_by=None, engine=None diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py new file mode 100644 index 000000000..a08293359 --- /dev/null +++ b/s3/tests/export_part/system_monitoring.py @@ -0,0 +1,46 @@ +from testflows.core import * +from testflows.asserts import error + +from s3.tests.export_part.steps import * + + +@TestScenario +def duplicate_exports(self): + """Check duplicate export attempts are properly tracked in system.events.""" + + with Given("I create source and destination tables"): + source, destination = create_source_and_destination_tables() + + with When("I insert random test data into the source table"): + source.insert_test_data() # default row_count=10, cardinality=1 + + with And("I get a list of parts for source table"): + source_parts = source.get_parts() + test_part = source_parts[1] + + with And("I read initial export events"): + events_initial = export_events() + initial_exports = events_initial.get("PartsExports", 0) + initial_duplicates = events_initial.get("PartsExportDuplicated", 0) + + with When("I export the same part twice"): + export_part(parts=[test_part], source=source, destination=destination) + export_part(parts=[test_part], source=source, destination=destination) + + with Then("I check system.events for duplicate tracking"): + events_final = export_events() + final_exports = events_final.get("PartsExports", 0) + final_duplicates = events_final.get("PartsExportDuplicated", 0) + + # 1 successful export + assert final_exports - initial_exports == 1, error() + # 1 of the exports was counted as a duplicate + assert final_duplicates - initial_duplicates == 1, error() + + +@TestFeature +@Name("system monitoring") +def feature(self): + """Check system monitoring of export events.""" + + Scenario(run=duplicate_exports) \ No newline at end of file From 3ef4fff69b44c9ed22f38d877e187927345b5937 Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Thu, 23 Oct 2025 14:45:33 -0400 Subject: [PATCH 14/99] Add tests, clusters broken right now --- s3/regression.py | 22 ++-- s3/tests/export_part/clusters.py | 15 --- s3/tests/export_part/clusters_and_nodes.py | 128 +++++++++++++++++++++ s3/tests/export_part/feature.py | 13 +-- s3/tests/export_part/sanity.py | 73 +++++++++++- s3/tests/export_part/steps.py | 31 +++-- s3/tests/export_part/system_monitoring.py | 5 +- 7 files changed, 237 insertions(+), 50 deletions(-) delete mode 100644 s3/tests/export_part/clusters.py create mode 100644 s3/tests/export_part/clusters_and_nodes.py diff --git a/s3/regression.py b/s3/regression.py index 77717e48d..cc4fe9500 100755 --- a/s3/regression.py +++ b/s3/regression.py @@ -539,6 +539,8 
@@ def minio_regression( self.context.cluster = cluster self.context.node = cluster.node("clickhouse1") + self.context.node2 = cluster.node("clickhouse2") + self.context.node3 = cluster.node("clickhouse3") with And("I have a minio client"): start_minio(access_key=root_user, secret_key=root_password) @@ -549,18 +551,18 @@ def minio_regression( for node in nodes["clickhouse"]: experimental_analyzer(node=cluster.node(node), with_analyzer=with_analyzer) - with And("allow higher cpu_wait_ratio "): - if check_clickhouse_version(">=25.4")(self): - allow_higher_cpu_wait_ratio( - min_os_cpu_wait_time_ratio_to_throw=15, - max_os_cpu_wait_time_ratio_to_throw=25, - ) + # with And("allow higher cpu_wait_ratio "): + # if check_clickhouse_version(">=25.4")(self): + # allow_higher_cpu_wait_ratio( + # min_os_cpu_wait_time_ratio_to_throw=15, + # max_os_cpu_wait_time_ratio_to_throw=25, + # ) - with And("I add all possible clusters for nodes"): - add_clusters_for_nodes(nodes=nodes["clickhouse"], modify=True) + # with And("I add all possible clusters for nodes"): + # add_clusters_for_nodes(nodes=nodes["clickhouse"], modify=True) - with And("I get all possible clusters for nodes"): - self.context.clusters = get_clusters_for_nodes(nodes=nodes["clickhouse"]) + # with And("I get all possible clusters for nodes"): + # self.context.clusters = get_clusters_for_nodes(nodes=nodes["clickhouse"]) with Feature("part 1"): Feature(test=load("s3.tests.sanity", "minio"))(uri=uri_bucket_file) diff --git a/s3/tests/export_part/clusters.py b/s3/tests/export_part/clusters.py deleted file mode 100644 index 0f8d54995..000000000 --- a/s3/tests/export_part/clusters.py +++ /dev/null @@ -1,15 +0,0 @@ -from testflows.core import * -from testflows.asserts import error - -from s3.tests.export_part.steps import * - - -# @TestScenario - - -@TestFeature -@Name("clusters") -def feature(self): - """Check functionality of exporting data parts to S3 storage from different clusters.""" - pass - # Scenario(run=) \ No newline at end of file diff --git a/s3/tests/export_part/clusters_and_nodes.py b/s3/tests/export_part/clusters_and_nodes.py new file mode 100644 index 000000000..ff7fef1e3 --- /dev/null +++ b/s3/tests/export_part/clusters_and_nodes.py @@ -0,0 +1,128 @@ +import random +from itertools import combinations + +from testflows.core import * +from testflows.asserts import error + +from s3.tests.export_part.steps import * + + +@TestScenario +def different_nodes_same_destination(self, cluster, node1, node2): + """Test export part from different nodes to same S3 destination in a given cluster.""" + + with Given("I create tables on different nodes"): + source1, shared_destination = create_source_and_destination_tables(cluster=cluster, node=node1) + source2, _ = create_source_and_destination_tables(cluster=cluster, node=node2) + + with When("I insert test data into the source tables"): + source1.insert_test_data(random=random.Random(1), node=node1) + source2.insert_test_data(random=random.Random(2), node=node2) + + with And("I export parts from both nodes"): + parts1 = source1.get_parts(node=node1) + parts2 = source2.get_parts(node=node2) + events_before_node1 = export_events(node=node1) + events_before_node2 = export_events(node=node2) + export_part(parts=parts1, source=source1, destination=shared_destination) + export_part(parts=parts2, source=source2, destination=shared_destination) + + with Then("I check system.events that all exports are successful"): + events_after_node1 = export_events(node=node1) + events_after_node2 = 
export_events(node=node2)
+        total_exports_after = sum(
+            e.get("PartsExports", 0) + e.get("PartsExportDuplicated", 0)
+            for e in (events_after_node1, events_after_node2)
+        )
+        total_exports_before = sum(
+            e.get("PartsExports", 0) + e.get("PartsExportDuplicated", 0)
+            for e in (events_before_node1, events_before_node2)
+        )
+        assert total_exports_after == total_exports_before + len(parts1) + len(parts2), error()
+
+    with And("I verify data from both nodes appear in S3"):
+        destination_data = shared_destination.select_ordered_by_partition_and_index()
+        for part in parts1:
+            assert part.split("_")[0] in destination_data, error()
+        for part in parts2:
+            assert part.split("_")[0] in destination_data, error()
+
+
+@TestScenario
+def different_nodes_different_destinations(self, cluster, node1, node2):
+    """Test export part from different nodes to different S3 destinations."""
+
+    with Given("I create tables on different nodes with same part names"):
+        source1, destination1 = create_source_and_destination_tables(cluster=cluster, node=node1)
+        source2, destination2 = create_source_and_destination_tables(cluster=cluster, node=node2)
+
+    with When("I insert test data into the source tables"):
+        source1.insert_test_data(random=random.Random(1))
+        source2.insert_test_data(random=random.Random(2))
+
+    with And("I export parts from both nodes to separate destinations"):
+        parts1 = source1.get_parts()
+        parts2 = source2.get_parts()
+        events_before = export_events()
+        export_part(parts=parts1, source=source1, destination=destination1)
+        export_part(parts=parts2, source=source2, destination=destination2)
+
+    with Then("I check system.events that all exports are successful"):
+        events_after = export_events()
+        total_exports_after = events_after.get("PartsExports", 0) + events_after.get("PartsExportDuplicated", 0)
+        total_exports_before = events_before.get("PartsExports", 0) + events_before.get("PartsExportDuplicated", 0)
+        assert total_exports_after == total_exports_before + len(parts1) + len(parts2), error()
+
+    with And("I verify data from both nodes appear in separate destinations"):
+        data1 = destination1.select_ordered_by_partition_and_index()
+        data2 = destination2.select_ordered_by_partition_and_index()
+
+        with By("Checking data from both nodes appear in the right destinations"):
+            for part in parts1:
+                assert part.split("_")[0] in data1, error()
+            for part in parts2:
+                assert part.split("_")[0] in data2, error()
+
+        with And("Checking data from both nodes do not appear in the wrong destinations"):
+            unique_parts1 = list(set(parts1) - set(parts2))
+            unique_parts2 = list(set(parts2) - set(parts1))
+            for part in unique_parts1:
+                assert part.split("_")[0] not in data2, error()
+            for part in unique_parts2:
+                assert part.split("_")[0] not in data1, error()
+
+
+# I need to get the nodes from a cluster; is this the right way to do it?
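[Editor's sketch, not part of the patch] As a cross-check for the helper that follows, the same `system.clusters` lookup can also return shard and replica numbers, which becomes useful once the replicated-cluster scenarios are re-enabled. The sketch below assumes the surrounding steps' conventions (a `TestStep` defaulting to `self.context.node`, tab-separated query output); the step name is hypothetical.

@TestStep(When)
def get_cluster_topology(self, cluster, node=None):
    """Return (shard_num, replica_num, host_name) for every node of a cluster (illustrative sketch)."""
    if node is None:
        node = self.context.node

    # system.clusters lists one row per replica of the named cluster
    output = node.query(
        f"SELECT shard_num, replica_num, host_name FROM system.clusters WHERE cluster = '{cluster}'",
        exitcode=0,
    ).output

    # default TabSeparated output: one row per line, columns separated by tabs
    rows = [line.split("\t") for line in output.splitlines() if line.strip()]
    return [(int(shard), int(replica), host) for shard, replica, host in rows]

Filtering on the `is_local` column of `system.clusters` in the same query is one way to tell the executing node apart from its peers when asserting per-node event counters.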
+def get_cluster_nodes(cluster, node=None): + """Get all nodes in a cluster.""" + + if node is None: + node = current().context.node + + result = node.query( + f"SELECT host_name FROM system.clusters WHERE cluster = '{cluster}'", + exitcode=0 + ) + + nodes = [line.strip() for line in result.output.splitlines() if line.strip()] + return nodes + + +@TestFeature +@Name("clusters and nodes") +def feature(self): + """Check functionality of exporting data parts to S3 storage from different clusters and nodes.""" + + clusters = [ + # "sharded_cluster", + # "replicated_cluster", + # "one_shard_cluster", + # "sharded_cluster12", + # "one_shard_cluster12", + "sharded_cluster23", + # "one_shard_cluster23", + ] + + for cluster in clusters: + node_names = get_cluster_nodes(cluster=cluster) + + for node1_name, node2_name in combinations(node_names, 2): + node1 = self.context.cluster.node(node1_name) + node2 = self.context.cluster.node(node2_name) + note(f"Testing {cluster} with nodes {node1_name} and {node2_name}") + # different_nodes_same_destination(cluster=cluster, node1=node1, node2=node2) + # different_nodes_different_destinations(cluster=cluster, node1=node1, node2=node2) \ No newline at end of file diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index e27ed3f27..130e2282b 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -9,12 +9,7 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - self.context.node_1 = self.context.cluster.node("clickhouse1") - self.context.node_2 = self.context.cluster.node("clickhouse2") - self.context.node_3 = self.context.cluster.node("clickhouse3") - self.context.nodes = [self.context.node_1, self.context.node_2, self.context.node_3] - - Feature(run=load("s3.tests.export_part.sanity", "feature")) - Feature(run=load("s3.tests.export_part.error_handling", "feature")) - Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - Feature(run=load("s3.tests.export_part.clusters", "feature")) + # Feature(run=load("s3.tests.export_part.sanity", "feature")) + # Feature(run=load("s3.tests.export_part.error_handling", "feature")) + # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) + Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 0cc754f31..818265914 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -5,15 +5,19 @@ from helpers.tables import * +# TODO: Large data export? 
+# But if I add too many rows, there'll be too many partitions given the current implementation -> DB ERROR + + @TestScenario -def sanity(self): +def source_matches_destination(self, engine=None, row_count=10, cardinality=1): """Check that ClickHouse can export data parts to S3 storage.""" with Given("I create source and destination tables"): - source, destination = create_source_and_destination_tables() + source, destination = create_source_and_destination_tables(engine=engine) with When("I insert random test data into the source table"): - source.insert_test_data() # default row_count=10, cardinality=1 + source.insert_test_data(row_count=row_count, cardinality=cardinality) with And("I get a list of parts for source table"): source_parts = source.get_parts() @@ -36,9 +40,70 @@ def sanity(self): assert destination_data == source_data, error() +@TestSketch(Scenario) +@Flags(TE) +def combinations(self): + """Test different combinations of engines, row counts, and cardinalities.""" + + engines = [ + "MergeTree", + "ReplicatedMergeTree", + "ReplacingMergeTree", + "SummingMergeTree", + "AggregatingMergeTree", + ] + row_counts = [1, 10] + cardinalities = [1, 10] + + source_matches_destination( + engine=either(*engines), + row_count=either(*row_counts), + cardinality=either(*cardinalities) + ) + + +@TestScenario +def multiple_parts(self): + """Test exporting multiple parts in a single operation.""" + + with Given("I create source and destination tables"): + source, destination = create_source_and_destination_tables() + + with When("I insert data to create multiple parts"): + for i in range(5): + source.insert_test_data() + + with And("I get all parts and export them"): + source_parts = source.get_parts() + export_part(parts=source_parts, source=source, destination=destination) + + with Then("I verify all data was exported correctly"): + source_data = source.select_ordered_by_partition_and_index() + destination_data = destination.select_ordered_by_partition_and_index() + assert source_data == destination_data, error() + + +@TestScenario +def empty_table(self): + """Test exporting from an empty table.""" + + with Given("I create empty source and destination tables"): + source, destination = create_source_and_destination_tables() + + with When("I check for parts in empty table"): + source_parts = source.get_parts() + assert len(source_parts) == 0, error() + + with Then("I verify destination is also empty"): + dest_count = destination.query("SELECT count() FROM " + destination.name) + assert dest_count == "0", error() + + @TestFeature @Name("sanity") def feature(self): """Check basic functionality of exporting data parts to S3 storage.""" - Scenario(run=sanity) \ No newline at end of file + Scenario(run=empty_table) + Scenario(run=multiple_parts) + Scenario(run=combinations) \ No newline at end of file diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 277b4504b..9d193cfbe 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -7,19 +7,22 @@ @TestStep(Given) -def create_source_and_destination_tables(self, stop_merges=True): +def create_source_and_destination_tables(self, engine=None, columns=None, partition_by=None, order_by=None, node=None, cluster=None, stop_merges=True): """Create source and destination tables.""" + create_temp_bucket() - source = create_source_table() + source = create_source_table(engine=engine, columns=columns, partition_by=partition_by, order_by=order_by, node=node, cluster=cluster) destination = 
create_destination_table(source=source) if stop_merges: source.stop_merges() + return source, destination @TestStep(Given) def create_temp_bucket(self, uri=None, bucket_prefix=None): """Create temporary s3 bucket.""" + if uri is None: uri = self.context.uri_base @@ -35,9 +38,12 @@ def create_temp_bucket(self, uri=None, bucket_prefix=None): @TestStep(When) -def export_events(self): +def export_events(self, node=None): """Get the number of successful parts exports from the system.events table.""" - node = self.context.node + + if node is None: + node = self.context.node + output = node.query( "SELECT name, value FROM system.events WHERE name LIKE '%%Export%%' FORMAT JSONEachRow", exitcode=0, @@ -51,6 +57,7 @@ def export_events(self): @TestStep(When) def export_part(self, parts, source, destination, exitcode=0, node=None): """Alter export of parts.""" + if node is None: node = self.context.node @@ -70,11 +77,11 @@ def export_part(self, parts, source, destination, exitcode=0, node=None): @TestStep(When) -def create_source_table( - self, columns=None, partition_by=None, order_by=None, engine=None -): +def create_source_table(self, engine=None, columns=None, partition_by=None, order_by=None, node=None, cluster=None): """Create a source table.""" + if engine is None: + engine = "MergeTree" if columns is None: columns = [ Column(name="p", datatype=UInt16()), @@ -84,15 +91,17 @@ def create_source_table( partition_by = columns[0].name if order_by is None: order_by = "tuple()" - if engine is None: - engine = "MergeTree" + if cluster is None: + cluster = "one_shard_cluster" source = create_table( name="source_table_" + getuid(), + engine=engine, columns=columns, partition_by=partition_by, order_by=order_by, - engine=engine, + cluster=cluster, + node=node, ) return source @@ -101,7 +110,9 @@ def create_source_table( @TestStep(When) def create_destination_table(self, source, engine=None): """Create a destination table.""" + name = "destination_table_" + getuid() + if engine is None: engine = f""" S3( diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index a08293359..b8f92786c 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -32,9 +32,10 @@ def duplicate_exports(self): final_exports = events_final.get("PartsExports", 0) final_duplicates = events_final.get("PartsExportDuplicated", 0) - # 1 successful export + with By("Checking we have 1 successful export"): assert final_exports - initial_exports == 1, error() - # 1 of the exports was counted as a duplicate + + with And("Checking we have 1 duplicate export"): assert final_duplicates - initial_duplicates == 1, error() From 03597fdaa9f0a295434c2a13fb2d056bdee29fff Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Fri, 24 Oct 2025 16:14:37 -0400 Subject: [PATCH 15/99] clean tables.py --- helpers/tables.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/helpers/tables.py b/helpers/tables.py index aed670df5..94c6a9582 100644 --- a/helpers/tables.py +++ b/helpers/tables.py @@ -402,34 +402,6 @@ def insert_test_data( return result, values return result - def get_parts(self, node=None): - """Get all parts for a table.""" - if node is None: - node = current().context.node - - output = node.query( - f"SELECT name FROM system.parts WHERE table = '{self.name}'", exitcode=0 - ).output - return [row.strip() for row in output.splitlines()] - - def stop_merges(self, node=None): - """Stop merges for a table.""" - if node is None: - node = 
current().context.node - - node.query(f"SYSTEM STOP MERGES {self.name}", exitcode=0) - - def query(self, query, node=None): - """Query data from a table.""" - if node is None: - node = current().context.node - - return node.query(query, exitcode=0).output - - def select_ordered_by_partition_and_index(self, node=None): - """Select all data from a table ordered by partition and index columns.""" - return self.query(f"SELECT * FROM {self.name} ORDER BY p, i", node=node) - @TestStep(Given) def create_table( From 52d7aa6b4ae8e93bbaa6ca3b043acb753f9983e6 Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Fri, 24 Oct 2025 16:15:53 -0400 Subject: [PATCH 16/99] add arguments to create.py functions --- helpers/create.py | 69 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/helpers/create.py b/helpers/create.py index 8472a9019..df2309283 100644 --- a/helpers/create.py +++ b/helpers/create.py @@ -21,6 +21,7 @@ def create_table( comment=None, settings=None, partition_by=None, + stop_merges=False, ): """ Generates a query to create a table in ClickHouse. @@ -107,8 +108,12 @@ def create_table( query += ";" + if stop_merges: + query += f" SYSTEM STOP MERGES {table_name};" + node.query(query) yield + finally: with Finally(f"drop the table {table_name}"): node.query(f"DROP TABLE IF EXISTS {table_name}") @@ -127,6 +132,8 @@ def create_merge_tree_table( primary_key=None, order_by: str = "tuple()", partition_by: str = None, + cluster: str = None, + stop_merges: bool = False, ): """Create a table with the MergeTree engine.""" create_table( @@ -139,6 +146,8 @@ def create_merge_tree_table( db=db, comment=comment, partition_by=partition_by, + cluster=cluster, + stop_merges=stop_merges, ) @@ -153,6 +162,8 @@ def create_replacing_merge_tree_table( primary_key=None, order_by: str = "tuple()", partition_by: str = None, + cluster: str = None, + stop_merges: bool = False, ): """Create a table with the ReplacingMergeTree engine.""" create_table( @@ -165,6 +176,8 @@ def create_replacing_merge_tree_table( db=db, comment=comment, partition_by=partition_by, + cluster=cluster, + stop_merges=stop_merges, ) @@ -179,6 +192,8 @@ def create_summing_merge_tree_table( primary_key=None, order_by: str = "tuple()", partition_by: str = None, + cluster: str = None, + stop_merges: bool = False, ): """Create a table with the SummingMergeTree engine.""" create_table( @@ -191,6 +206,8 @@ def create_summing_merge_tree_table( db=db, comment=comment, partition_by=partition_by, + cluster=cluster, + stop_merges=stop_merges, ) @@ -205,6 +222,8 @@ def create_aggregating_merge_tree_table( primary_key=None, order_by: str = "tuple()", partition_by: str = None, + cluster: str = None, + stop_merges: bool = False, ): """Create a table with the AggregatingMergeTree engine.""" create_table( @@ -217,6 +236,8 @@ def create_aggregating_merge_tree_table( db=db, comment=comment, partition_by=partition_by, + cluster=cluster, + stop_merges=stop_merges, ) @@ -232,6 +253,8 @@ def create_collapsing_merge_tree_table( order_by: str = "tuple()", partition_by: str = None, sign: str = "Sign", + cluster: str = None, + stop_merges: bool = False, ): """Create a table with the CollapsingMergeTree engine. 
@@ -248,6 +271,8 @@ def create_collapsing_merge_tree_table( db=db, comment=comment, partition_by=partition_by, + cluster=cluster, + stop_merges=stop_merges, ) @@ -264,6 +289,8 @@ def create_versioned_collapsing_merge_tree_table( partition_by: str = None, sign: str = "Sign", version: str = "Version", + cluster: str = None, + stop_merges: bool = False, ): """Create a table with the VersionedCollapsingMergeTree engine. @@ -281,6 +308,8 @@ def create_versioned_collapsing_merge_tree_table( db=db, comment=comment, partition_by=partition_by, + cluster=cluster, + stop_merges=stop_merges, ) @@ -296,6 +325,8 @@ def create_graphite_merge_tree_table( primary_key=None, order_by: str = "tuple()", partition_by: str = None, + cluster: str = None, + stop_merges: bool = False, ): """Create a table with the GraphiteMergeTree engine. @@ -312,6 +343,8 @@ def create_graphite_merge_tree_table( db=db, comment=comment, partition_by=partition_by, + cluster=cluster, + stop_merges=stop_merges, ) @@ -326,6 +359,8 @@ def create_replicated_merge_tree_table( primary_key=None, order_by: str = "tuple()", partition_by: str = "p", + cluster: str = None, + stop_merges: bool = False, ): """Create a table with the MergeTree engine.""" if columns is None: @@ -345,15 +380,17 @@ def create_replicated_merge_tree_table( db=db, comment=comment, partition_by=partition_by, + cluster=cluster, + stop_merges=stop_merges, ) @TestStep(Given) -def partitioned_merge_tree_table(self, table_name, partition_by, columns): +def partitioned_merge_tree_table(self, table_name, partition_by, columns, cluster=None, stop_merges=False): """Create a MergeTree table partitioned by a specific column.""" with By(f"creating a partitioned {table_name} table with a MergeTree engine"): create_merge_tree_table( - table_name=table_name, columns=columns, partition_by=partition_by + table_name=table_name, columns=columns, partition_by=partition_by, cluster=cluster, stop_merges=stop_merges ) with And("populating it with the data needed to create multiple partitions"): @@ -362,14 +399,14 @@ def partitioned_merge_tree_table(self, table_name, partition_by, columns): @TestStep(Given) def partitioned_replicated_merge_tree_table( - self, table_name, partition_by, columns=None + self, table_name, partition_by, columns=None, cluster=None, stop_merges=False ): """Create a ReplicatedMergeTree table partitioned by a specific column.""" with By( f"creating a partitioned {table_name} table with a ReplicatedMergeTree engine" ): create_replicated_merge_tree_table( - table_name=table_name, columns=columns, partition_by=partition_by + table_name=table_name, columns=columns, partition_by=partition_by, cluster=cluster, stop_merges=stop_merges ) with And("populating it with the data needed to create multiple partitions"): @@ -377,13 +414,13 @@ def partitioned_replicated_merge_tree_table( @TestStep(Given) -def partitioned_replacing_merge_tree_table(self, table_name, partition_by, columns): +def partitioned_replacing_merge_tree_table(self, table_name, partition_by, columns, cluster=None, stop_merges=False): """Create a ReplacingMergeTree table partitioned by a specific column.""" with By( f"creating a partitioned {table_name} table with a ReplacingMergeTree engine" ): create_replacing_merge_tree_table( - table_name=table_name, columns=columns, partition_by=partition_by + table_name=table_name, columns=columns, partition_by=partition_by, cluster=cluster, stop_merges=stop_merges ) with And("populating it with the data needed to create multiple partitions"): @@ -391,13 +428,13 @@ def 
partitioned_replacing_merge_tree_table(self, table_name, partition_by, colum @TestStep(Given) -def partitioned_summing_merge_tree_table(self, table_name, partition_by, columns): +def partitioned_summing_merge_tree_table(self, table_name, partition_by, columns, cluster=None, stop_merges=False): """Create a SummingMergeTree table partitioned by a specific column.""" with By( f"creating a partitioned {table_name} table with a SummingMergeTree engine" ): create_aggregating_merge_tree_table( - table_name=table_name, columns=columns, partition_by=partition_by + table_name=table_name, columns=columns, partition_by=partition_by, cluster=cluster, stop_merges=stop_merges ) with And("populating it with the data needed to create multiple partitions"): @@ -405,13 +442,13 @@ def partitioned_summing_merge_tree_table(self, table_name, partition_by, columns @TestStep(Given) -def partitioned_collapsing_merge_tree_table(self, table_name, partition_by, columns): +def partitioned_collapsing_merge_tree_table(self, table_name, partition_by, columns, cluster=None, stop_merges=False): """Create a CollapsingMergeTree table partitioned by a specific column.""" with By( f"creating a partitioned {table_name} table with a CollapsingMergeTree engine" ): create_collapsing_merge_tree_table( - table_name=table_name, columns=columns, partition_by=partition_by, sign="p" + table_name=table_name, columns=columns, partition_by=partition_by, sign="p", cluster=cluster, stop_merges=stop_merges ) with And("populating it with the data needed to create multiple partitions"): @@ -422,7 +459,7 @@ def partitioned_collapsing_merge_tree_table(self, table_name, partition_by, colu @TestStep(Given) def partitioned_versioned_collapsing_merge_tree_table( - self, table_name, partition_by, columns + self, table_name, partition_by, columns, cluster=None, stop_merges=False ): """Create a VersionedCollapsingMergeTree table partitioned by a specific column.""" with By( @@ -434,6 +471,8 @@ def partitioned_versioned_collapsing_merge_tree_table( partition_by=partition_by, sign="p", version="i", + cluster=cluster, + stop_merges=stop_merges, ) with And("populating it with the data needed to create multiple partitions"): @@ -443,13 +482,13 @@ def partitioned_versioned_collapsing_merge_tree_table( @TestStep(Given) -def partitioned_aggregating_merge_tree_table(self, table_name, partition_by, columns): +def partitioned_aggregating_merge_tree_table(self, table_name, partition_by, columns, cluster=None, stop_merges=False): """Create a AggregatingMergeTree table partitioned by a specific column.""" with By( f"creating a partitioned {table_name} table with a AggregatingMergeTree engine" ): create_summing_merge_tree_table( - table_name=table_name, columns=columns, partition_by=partition_by + table_name=table_name, columns=columns, partition_by=partition_by, cluster=cluster, stop_merges=stop_merges ) with And("populating it with the data needed to create multiple partitions"): @@ -457,7 +496,7 @@ def partitioned_aggregating_merge_tree_table(self, table_name, partition_by, col @TestStep(Given) -def partitioned_graphite_merge_tree_table(self, table_name, partition_by, columns): +def partitioned_graphite_merge_tree_table(self, table_name, partition_by, columns, cluster=None, stop_merges=False): """Create a GraphiteMergeTree table partitioned by a specific column.""" with By( f"creating a partitioned {table_name} table with a GraphiteMergeTree engine" @@ -467,6 +506,8 @@ def partitioned_graphite_merge_tree_table(self, table_name, partition_by, column columns=columns, 
partition_by=partition_by, config="graphite_rollup_example", + cluster=cluster, + stop_merges=stop_merges, ) with And("populating it with the data needed to create multiple partitions"): From 4d1a02512b475b039bbfd807226f84909005c334 Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Fri, 24 Oct 2025 16:16:39 -0400 Subject: [PATCH 17/99] steps.py rewrite --- s3/tests/export_part/steps.py | 166 +++++++++++++--------------------- 1 file changed, 65 insertions(+), 101 deletions(-) diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 9d193cfbe..3c531bf48 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -1,136 +1,100 @@ import json from testflows.core import * +from testflows.asserts import error from helpers.common import getuid -from helpers.tables import * +from helpers.create import * from s3.tests.common import temporary_bucket_path @TestStep(Given) -def create_source_and_destination_tables(self, engine=None, columns=None, partition_by=None, order_by=None, node=None, cluster=None, stop_merges=True): - """Create source and destination tables.""" - - create_temp_bucket() - source = create_source_table(engine=engine, columns=columns, partition_by=partition_by, order_by=order_by, node=node, cluster=cluster) - destination = create_destination_table(source=source) - if stop_merges: - source.stop_merges() - - return source, destination +def create_temp_bucket(self): + """Create temporary S3 bucket.""" + temp_s3_path = temporary_bucket_path( + bucket_prefix=f"{self.context.bucket_prefix}/export_part" + ) -@TestStep(Given) -def create_temp_bucket(self, uri=None, bucket_prefix=None): - """Create temporary s3 bucket.""" + self.context.uri = f"{self.context.uri_base}export_part/{temp_s3_path}/" + # Delete the next line if the context var is never used + # self.context.bucket_path = f"{self.context.bucket_prefix}/export_part/{temp_s3_path}" - if uri is None: - uri = self.context.uri_base - - if bucket_prefix is None: - bucket_prefix = self.context.bucket_prefix - temp_s3_path = temporary_bucket_path( - bucket_prefix=f"{bucket_prefix}/export_part" - ) +@TestStep(Given) +def create_s3_table(self, table_name, cluster): + """Create a destination S3 table.""" + + table_name = f"{table_name}_{getuid()}" + engine = f""" + S3( + '{self.context.uri}', + '{self.context.access_key_id}', + '{self.context.secret_access_key}', + filename='{table_name}', + format='Parquet', + compression='auto', + partition_strategy='hive' + ) + """ - self.context.uri = f"{uri}export_part/{temp_s3_path}/" - self.context.bucket_path = f"{bucket_prefix}/export_part/{temp_s3_path}" + # TODO columns and partition_by are hardcoded for now, but i should make them configurable + create_table( + table_name=table_name, + columns=[{"name": "p", "type": "Int8"}, {"name": "i", "type": "UInt64"}], + partition_by="p", + engine=engine, + cluster=cluster, + ) @TestStep(When) -def export_events(self, node=None): - """Get the number of successful parts exports from the system.events table.""" +def get_parts(self, table_name, node): + """Get all parts for a table on a given node.""" - if node is None: - node = self.context.node - - output = node.query( - "SELECT name, value FROM system.events WHERE name LIKE '%%Export%%' FORMAT JSONEachRow", - exitcode=0, - ).output - return { - row["name"]: int(row["value"]) - for row in [json.loads(row) for row in output.splitlines()] - } + output = node.query(f"SELECT name FROM system.parts WHERE table = '{table_name}'", exitcode=0).output + return 
[row.strip() for row in output.splitlines()] @TestStep(When) -def export_part(self, parts, source, destination, exitcode=0, node=None): - """Alter export of parts.""" +def select_all_ordered(self, table_name, node): + """Select all data from a table ordered by partition and index columns.""" + + output = node.query(f"SELECT * FROM {table_name} ORDER BY p, i", exitcode=0).output + return [row.strip() for row in output.splitlines()] - if node is None: - node = self.context.node +@TestStep(When) +def export_parts(self, source_table, destination_table, node, parts=None, exitcode=0): + """Export parts from a source table to a destination table on the same node. If parts are not provided, all parts will be exported.""" + + if parts is None: + parts = get_parts(source_table, node) no_checks = exitcode != 0 - results = [] - # we should be able to set the settings here instead of using the SET query, but this is a quick workaround for the bug for part in parts: - results.append(node.query( - f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source.name} EXPORT PART '{part}' TO TABLE {destination.name}", + node.query(# we should be able to set the settings here instead of using the SET query, but this is a quick workaround for the bug + f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", # settings=[("allow_experimental_export_merge_tree_part", 1)], exitcode=exitcode, no_checks=no_checks - )) - - return results + ) +# TODO find the simplest way to parse the output @TestStep(When) -def create_source_table(self, engine=None, columns=None, partition_by=None, order_by=None, node=None, cluster=None): - """Create a source table.""" - - if engine is None: - engine = "MergeTree" - if columns is None: - columns = [ - Column(name="p", datatype=UInt16()), - Column(name="i", datatype=UInt64()), - ] - if partition_by is None: - partition_by = columns[0].name - if order_by is None: - order_by = "tuple()" - if cluster is None: - cluster = "one_shard_cluster" - - source = create_table( - name="source_table_" + getuid(), - engine=engine, - columns=columns, - partition_by=partition_by, - order_by=order_by, - cluster=cluster, - node=node, - ) - - return source +def get_export_events(self, node): + """Get the export data from the system.events table of a given node.""" + output = node.query("SELECT name, value FROM system.events WHERE name LIKE '%%Export%%' FORMAT JSONEachRow", exitcode=0).output + # return {row.name: int(row.value) for row in json.loads(output)} + # return [json.loads(row) for row in output.splitlines()] + return output -@TestStep(When) -def create_destination_table(self, source, engine=None): - """Create a destination table.""" - - name = "destination_table_" + getuid() - - if engine is None: - engine = f""" - S3( - '{self.context.uri}', - '{self.context.access_key_id}', - '{self.context.secret_access_key}', - filename='{name}', - format='Parquet', - compression='auto', - partition_strategy='hive' - ) - """ - destination = create_table( - name=name, - columns=source.columns, - partition_by=source.partition_by, - engine=engine, - ) +@TestStep(Then) +def source_matches_destination(self, source_table, destination_table, source_node, destination_node): + """Check that source and destination table data matches.""" - return destination + source_data = select_all_ordered(source_table, source_node) + destination_data = select_all_ordered(destination_table, destination_node) + assert source_data == 
destination_data, error() \ No newline at end of file From 98d15398aefd42d463a166ce1cd1242fc3c1b62b Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Fri, 24 Oct 2025 22:31:37 -0400 Subject: [PATCH 18/99] configurable number of partitions and parts in create.py --- helpers/create.py | 76 ++++++++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/helpers/create.py b/helpers/create.py index df2309283..9e5adf322 100644 --- a/helpers/create.py +++ b/helpers/create.py @@ -386,20 +386,23 @@ def create_replicated_merge_tree_table( @TestStep(Given) -def partitioned_merge_tree_table(self, table_name, partition_by, columns, cluster=None, stop_merges=False): +def partitioned_merge_tree_table( + self, table_name, partition_by, columns, cluster=None, stop_merges=False, populate=True, number_of_partitions=5, number_of_parts=1 +): """Create a MergeTree table partitioned by a specific column.""" with By(f"creating a partitioned {table_name} table with a MergeTree engine"): create_merge_tree_table( table_name=table_name, columns=columns, partition_by=partition_by, cluster=cluster, stop_merges=stop_merges ) - with And("populating it with the data needed to create multiple partitions"): - create_partitions_with_random_uint64(table_name=table_name) + if populate: + with And("populating it with the data needed to create multiple partitions"): + create_partitions_with_random_uint64(table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts) @TestStep(Given) def partitioned_replicated_merge_tree_table( - self, table_name, partition_by, columns=None, cluster=None, stop_merges=False + self, table_name, partition_by, columns=None, cluster=None, stop_merges=False, populate=True, number_of_partitions=5, number_of_parts=1 ): """Create a ReplicatedMergeTree table partitioned by a specific column.""" with By( @@ -409,12 +412,15 @@ def partitioned_replicated_merge_tree_table( table_name=table_name, columns=columns, partition_by=partition_by, cluster=cluster, stop_merges=stop_merges ) - with And("populating it with the data needed to create multiple partitions"): - create_partitions_with_random_uint64(table_name=table_name) + if populate: + with And("populating it with the data needed to create multiple partitions"): + create_partitions_with_random_uint64(table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts) @TestStep(Given) -def partitioned_replacing_merge_tree_table(self, table_name, partition_by, columns, cluster=None, stop_merges=False): +def partitioned_replacing_merge_tree_table( + self, table_name, partition_by, columns, cluster=None, stop_merges=False, populate=True, number_of_partitions=5, number_of_parts=1 +): """Create a ReplacingMergeTree table partitioned by a specific column.""" with By( f"creating a partitioned {table_name} table with a ReplacingMergeTree engine" @@ -423,12 +429,15 @@ def partitioned_replacing_merge_tree_table(self, table_name, partition_by, colum table_name=table_name, columns=columns, partition_by=partition_by, cluster=cluster, stop_merges=stop_merges ) - with And("populating it with the data needed to create multiple partitions"): - create_partitions_with_random_uint64(table_name=table_name) + if populate: + with And("populating it with the data needed to create multiple partitions"): + create_partitions_with_random_uint64(table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts) @TestStep(Given) -def 
partitioned_summing_merge_tree_table(self, table_name, partition_by, columns, cluster=None, stop_merges=False): +def partitioned_summing_merge_tree_table( + self, table_name, partition_by, columns, cluster=None, stop_merges=False, populate=True, number_of_partitions=5, number_of_parts=1 +): """Create a SummingMergeTree table partitioned by a specific column.""" with By( f"creating a partitioned {table_name} table with a SummingMergeTree engine" @@ -437,12 +446,15 @@ def partitioned_summing_merge_tree_table(self, table_name, partition_by, columns table_name=table_name, columns=columns, partition_by=partition_by, cluster=cluster, stop_merges=stop_merges ) - with And("populating it with the data needed to create multiple partitions"): - create_partitions_with_random_uint64(table_name=table_name) + if populate: + with And("populating it with the data needed to create multiple partitions"): + create_partitions_with_random_uint64(table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts) @TestStep(Given) -def partitioned_collapsing_merge_tree_table(self, table_name, partition_by, columns, cluster=None, stop_merges=False): +def partitioned_collapsing_merge_tree_table( + self, table_name, partition_by, columns, cluster=None, stop_merges=False, populate=True, number_of_partitions=1, number_of_parts=1 +): """Create a CollapsingMergeTree table partitioned by a specific column.""" with By( f"creating a partitioned {table_name} table with a CollapsingMergeTree engine" @@ -451,15 +463,16 @@ def partitioned_collapsing_merge_tree_table(self, table_name, partition_by, colu table_name=table_name, columns=columns, partition_by=partition_by, sign="p", cluster=cluster, stop_merges=stop_merges ) - with And("populating it with the data needed to create multiple partitions"): - create_partitions_with_random_uint64( - table_name=table_name, number_of_partitions=1 - ) + if populate: + with And("populating it with the data needed to create multiple partitions"): + create_partitions_with_random_uint64( + table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts + ) @TestStep(Given) def partitioned_versioned_collapsing_merge_tree_table( - self, table_name, partition_by, columns, cluster=None, stop_merges=False + self, table_name, partition_by, columns, cluster=None, stop_merges=False, populate=True, number_of_partitions=1, number_of_parts=1 ): """Create a VersionedCollapsingMergeTree table partitioned by a specific column.""" with By( @@ -475,14 +488,17 @@ def partitioned_versioned_collapsing_merge_tree_table( stop_merges=stop_merges, ) - with And("populating it with the data needed to create multiple partitions"): - create_partitions_with_random_uint64( - table_name=table_name, number_of_partitions=1 - ) + if populate: + with And("populating it with the data needed to create multiple partitions"): + create_partitions_with_random_uint64( + table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts + ) @TestStep(Given) -def partitioned_aggregating_merge_tree_table(self, table_name, partition_by, columns, cluster=None, stop_merges=False): +def partitioned_aggregating_merge_tree_table( + self, table_name, partition_by, columns, cluster=None, stop_merges=False, populate=True, number_of_partitions=5, number_of_parts=1 +): """Create a AggregatingMergeTree table partitioned by a specific column.""" with By( f"creating a partitioned {table_name} table with a AggregatingMergeTree engine" @@ -491,12 +507,15 @@ 
def partitioned_aggregating_merge_tree_table(self, table_name, partition_by, col table_name=table_name, columns=columns, partition_by=partition_by, cluster=cluster, stop_merges=stop_merges ) - with And("populating it with the data needed to create multiple partitions"): - create_partitions_with_random_uint64(table_name=table_name) + if populate: + with And("populating it with the data needed to create multiple partitions"): + create_partitions_with_random_uint64(table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts) @TestStep(Given) -def partitioned_graphite_merge_tree_table(self, table_name, partition_by, columns, cluster=None, stop_merges=False): +def partitioned_graphite_merge_tree_table( + self, table_name, partition_by, columns, cluster=None, stop_merges=False, populate=True, number_of_partitions=5, number_of_parts=1 +): """Create a GraphiteMergeTree table partitioned by a specific column.""" with By( f"creating a partitioned {table_name} table with a GraphiteMergeTree engine" @@ -510,5 +529,6 @@ def partitioned_graphite_merge_tree_table(self, table_name, partition_by, column stop_merges=stop_merges, ) - with And("populating it with the data needed to create multiple partitions"): - create_partitions_with_random_uint64(table_name=table_name) + if populate: + with And("populating it with the data needed to create multiple partitions"): + create_partitions_with_random_uint64(table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts) From b6e28f8de574d1e787cf49b06a04459ee2466ce5 Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Fri, 24 Oct 2025 22:33:57 -0400 Subject: [PATCH 19/99] clean steps, sanity almost working --- s3/tests/export_part/feature.py | 8 +- s3/tests/export_part/sanity.py | 147 +++++++++++----------- s3/tests/export_part/steps.py | 14 ++- s3/tests/export_part/system_monitoring.py | 2 + 4 files changed, 90 insertions(+), 81 deletions(-) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 130e2282b..10dca1849 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -8,8 +8,12 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix + self.context.default_columns = [ + {"name": "p", "type": "Int8"}, + {"name": "i", "type": "UInt64"}, + ] - # Feature(run=load("s3.tests.export_part.sanity", "feature")) + Feature(run=load("s3.tests.export_part.sanity", "feature")) # Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) + # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) \ No newline at end of file diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 818265914..ea36a73e0 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -2,101 +2,100 @@ from testflows.asserts import error from s3.tests.export_part.steps import * -from helpers.tables import * +from helpers.create import * -# TODO: Large data export? -# But if I add too many rows, there'll be too many partitions given the current implementation -> DB ERROR - +# TODO large data export? 
or maybe that should be in a different file @TestScenario -def source_matches_destination(self, engine=None, row_count=10, cardinality=1): - """Check that ClickHouse can export data parts to S3 storage.""" - - with Given("I create source and destination tables"): - source, destination = create_source_and_destination_tables(engine=engine) - - with When("I insert random test data into the source table"): - source.insert_test_data(row_count=row_count, cardinality=cardinality) - - with And("I get a list of parts for source table"): - source_parts = source.get_parts() - - with And("I read current export events"): - events_before = export_events() - - with And("I export parts to the destination table"): - export_part(parts=source_parts, source=source, destination=destination) - - with Then("I check system.events that all exports are successful"): - events_after = export_events() - total_exports_after = events_after.get("PartsExports", 0) + events_after.get("PartsExportDuplicated", 0) - total_exports_before = events_before.get("PartsExports", 0) + events_before.get("PartsExportDuplicated", 0) - assert total_exports_after == total_exports_before + len(source_parts), error() - - with And("I read back data and assert destination matches source"): - destination_data = destination.select_ordered_by_partition_and_index() - source_data = source.select_ordered_by_partition_and_index() - assert destination_data == source_data, error() +def configured_table(self, table_engine, number_of_partitions, number_of_parts): + """Test a specific combination of table engine, number of partitions, and number of parts.""" + + with Given("I create a populated source table and empty S3 table"): + table_engine( + table_name="source", + partition_by="p", + columns=self.context.default_columns, + stop_merges=True, + populate=True, + number_of_partitions=number_of_partitions, + number_of_parts=number_of_parts, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I export parts to the S3 table"): + export_parts(source_table="source", destination_table=s3_table_name, node=self.context.node) + + with And("I read data from both tables"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered(table_name=s3_table_name, node=self.context.node) + + with Then("They should be the same"): + assert source_data == destination_data, error() @TestSketch(Scenario) @Flags(TE) -def combinations(self): - """Test different combinations of engines, row counts, and cardinalities.""" - - engines = [ - "MergeTree", - "ReplicatedMergeTree", - "ReplacingMergeTree", - "SummingMergeTree", - "AggregatingMergeTree", +def table_combos(self): + """Test various combinations of table engines, number of partitions, and number of parts.""" + + tables = [ + partitioned_merge_tree_table, + partitioned_replacing_merge_tree_table, + partitioned_summing_merge_tree_table, + # partitioned_collapsing_merge_tree_table, # Ask David if failing here is expected behaviour + partitioned_versioned_collapsing_merge_tree_table, + partitioned_aggregating_merge_tree_table, + # partitioned_graphite_merge_tree_table, # Ask David about "age and precision should only grow up" error ] - row_counts = [1, 10] - cardinalities = [1, 10] - - source_matches_destination( - engine=either(*engines), - row_count=either(*row_counts), - cardinality=either(*cardinalities) + # TODO expand combos + number_of_partitions = [5] + number_of_parts = [1] + + configured_table( + 
table_engine=either(*tables), + number_of_partitions=either(*number_of_partitions), + number_of_parts=either(*number_of_parts), ) @TestScenario -def multiple_parts(self): - """Test exporting multiple parts in a single operation.""" +def basic_table(self): + """Test exporting parts of a basic table.""" - with Given("I create source and destination tables"): - source, destination = create_source_and_destination_tables() + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table(table_name="source", partition_by="p", columns=self.context.default_columns, stop_merges=True, populate=True) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) - with When("I insert data to create multiple parts"): - for i in range(5): - source.insert_test_data() + with When("I export parts to the S3 table"): + export_parts(source_table="source", destination_table=s3_table_name, node=self.context.node) - with And("I get all parts and export them"): - source_parts = source.get_parts() - export_part(parts=source_parts, source=source, destination=destination) + with And("I read data from both tables"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered(table_name=s3_table_name, node=self.context.node) - with Then("I verify all data was exported correctly"): - source_data = source.select_ordered_by_partition_and_index() - destination_data = destination.select_ordered_by_partition_and_index() + with Then("They should be the same"): assert source_data == destination_data, error() @TestScenario def empty_table(self): - """Test exporting from an empty table.""" - - with Given("I create empty source and destination tables"): - source, destination = create_source_and_destination_tables() - - with When("I check for parts in empty table"): - source_parts = source.get_parts() - assert len(source_parts) == 0, error() + """Test exporting parts from an empty table.""" + + with Given("I create empty source and S3 tables"): + partitioned_merge_tree_table(table_name="empty_source", partition_by="p", columns=self.context.default_columns, stop_merges=True, populate=False) + s3_table_name = create_s3_table(table_name="empty_s3", create_new_bucket=True) + + with When("I export parts to the S3 table"): + export_parts(source_table="empty_source", destination_table=s3_table_name, node=self.context.node) - with Then("I verify destination is also empty"): - dest_count = destination.query("SELECT count() FROM " + destination.name) - assert dest_count == "0", error() + with And("I read data from both tables"): + source_data = select_all_ordered(table_name="empty_source", node=self.context.node) + destination_data = select_all_ordered(table_name=s3_table_name, node=self.context.node) + + with Then("They should be empty"): + assert source_data == "", error() + assert destination_data == "", error() @TestFeature @@ -105,5 +104,5 @@ def feature(self): """Check basic functionality of exporting data parts to S3 storage.""" Scenario(run=empty_table) - Scenario(run=multiple_parts) - Scenario(run=combinations) \ No newline at end of file + Scenario(run=basic_table) + Scenario(run=table_combos) \ No newline at end of file diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 3c531bf48..9d41210d3 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -21,9 +21,12 @@ def create_temp_bucket(self): @TestStep(Given) -def create_s3_table(self, table_name, cluster): +def 
create_s3_table(self, table_name, cluster=None, create_new_bucket=False): """Create a destination S3 table.""" + if create_new_bucket: + create_temp_bucket() + table_name = f"{table_name}_{getuid()}" engine = f""" S3( @@ -40,12 +43,14 @@ def create_s3_table(self, table_name, cluster): # TODO columns and partition_by are hardcoded for now, but i should make them configurable create_table( table_name=table_name, - columns=[{"name": "p", "type": "Int8"}, {"name": "i", "type": "UInt64"}], + columns=self.context.default_columns, partition_by="p", engine=engine, cluster=cluster, ) + return table_name + @TestStep(When) def get_parts(self, table_name, node): @@ -59,8 +64,7 @@ def get_parts(self, table_name, node): def select_all_ordered(self, table_name, node): """Select all data from a table ordered by partition and index columns.""" - output = node.query(f"SELECT * FROM {table_name} ORDER BY p, i", exitcode=0).output - return [row.strip() for row in output.splitlines()] + return node.query(f"SELECT * FROM {table_name} ORDER BY p, i", exitcode=0).output @TestStep(When) @@ -68,7 +72,7 @@ def export_parts(self, source_table, destination_table, node, parts=None, exitco """Export parts from a source table to a destination table on the same node. If parts are not provided, all parts will be exported.""" if parts is None: - parts = get_parts(source_table, node) + parts = get_parts(table_name=source_table, node=node) no_checks = exitcode != 0 for part in parts: diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index b8f92786c..c089f5be8 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -4,6 +4,8 @@ from s3.tests.export_part.steps import * +# TODO checks on export_events should go here, not in sanity.py + @TestScenario def duplicate_exports(self): """Check duplicate export attempts are properly tracked in system.events.""" From ccb4cf7bfe31cad9e66f265e8047dc4631c33182 Mon Sep 17 00:00:00 2001 From: Selfeer Date: Mon, 27 Oct 2025 17:25:21 +0400 Subject: [PATCH 20/99] fix collapsing merge tree --- alter/table/replace_partition/common.py | 20 ++++++++++++++++++++ helpers/create.py | 5 +++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/alter/table/replace_partition/common.py b/alter/table/replace_partition/common.py index 59bad00a3..7bb86b8cb 100644 --- a/alter/table/replace_partition/common.py +++ b/alter/table/replace_partition/common.py @@ -9,6 +9,26 @@ from helpers.tables import create_table_partitioned_by_column, create_table, Column +@TestStep(Given) +def create_partitions_for_collapsing_merge_tree( + self, + table_name, + number_of_values=3, + number_of_partitions=5, + number_of_parts=1, + node=None, +): + """Insert random UInt64 values into a column and create multiple partitions based on the value of number_of_partitions.""" + if node is None: + node = self.context.node + + with By("Inserting random values into a column with uint64 datatype"): + for i in range(1, number_of_partitions + 1): + for parts in range(1, number_of_parts + 1): + node.query( + f"INSERT INTO {table_name} (p, i) SELECT {random.choice([-1, 1])}, rand64() FROM numbers({number_of_values})" + ) + @TestStep(Given) def create_partitions_with_random_uint64( self, diff --git a/helpers/create.py b/helpers/create.py index 9e5adf322..3ae32478b 100644 --- a/helpers/create.py +++ b/helpers/create.py @@ -1,6 +1,7 @@ from testflows.core import * -from alter.table.replace_partition.common import 
create_partitions_with_random_uint64 +from alter.table.replace_partition.common import create_partitions_with_random_uint64, \ + create_partitions_for_collapsing_merge_tree @TestStep(Given) @@ -465,7 +466,7 @@ def partitioned_collapsing_merge_tree_table( if populate: with And("populating it with the data needed to create multiple partitions"): - create_partitions_with_random_uint64( + create_partitions_for_collapsing_merge_tree( table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts ) From 79ded584eefa0b6ea95190fde04a18fde862deb0 Mon Sep 17 00:00:00 2001 From: Selfeer Date: Mon, 27 Oct 2025 17:41:54 +0400 Subject: [PATCH 21/99] fix other engines --- s3/tests/export_part/sanity.py | 90 ++++++++++++++++++++++++++-------- s3/tests/export_part/steps.py | 30 ++++++++---- 2 files changed, 90 insertions(+), 30 deletions(-) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index ea36a73e0..fb80b3952 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -7,6 +7,20 @@ # TODO large data export? or maybe that should be in a different file + +def columns(): + partition_columns = [ + {"name": "p", "type": "Int8"}, + {"name": "i", "type": "UInt64"}, + {"name": "Path", "type": "String"}, + {"name": "Time", "type": "DateTime"}, + {"name": "Value", "type": "Float64"}, + {"name": "Timestamp", "type": "Int64"}, + ] + + return partition_columns + + @TestScenario def configured_table(self, table_engine, number_of_partitions, number_of_parts): """Test a specific combination of table engine, number of partitions, and number of parts.""" @@ -15,21 +29,29 @@ def configured_table(self, table_engine, number_of_partitions, number_of_parts): table_engine( table_name="source", partition_by="p", - columns=self.context.default_columns, stop_merges=True, populate=True, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts, + columns=columns(), + ) + s3_table_name = create_s3_table( + table_name="s3", create_new_bucket=True, columns=columns() ) - s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) with When("I export parts to the S3 table"): - export_parts(source_table="source", destination_table=s3_table_name, node=self.context.node) - + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + with And("I read data from both tables"): source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered(table_name=s3_table_name, node=self.context.node) - + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + with Then("They should be the same"): assert source_data == destination_data, error() @@ -43,10 +65,10 @@ def table_combos(self): partitioned_merge_tree_table, partitioned_replacing_merge_tree_table, partitioned_summing_merge_tree_table, - # partitioned_collapsing_merge_tree_table, # Ask David if failing here is expected behaviour + partitioned_collapsing_merge_tree_table, partitioned_versioned_collapsing_merge_tree_table, partitioned_aggregating_merge_tree_table, - # partitioned_graphite_merge_tree_table, # Ask David about "age and precision should only grow up" error + partitioned_graphite_merge_tree_table, ] # TODO expand combos number_of_partitions = [5] @@ -62,18 +84,30 @@ def table_combos(self): @TestScenario def basic_table(self): """Test exporting parts of a basic table.""" - + with Given("I create a populated source table and empty 
S3 table"): - partitioned_merge_tree_table(table_name="source", partition_by="p", columns=self.context.default_columns, stop_merges=True, populate=True) + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=self.context.default_columns, + stop_merges=True, + populate=True, + ) s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) - + with When("I export parts to the S3 table"): - export_parts(source_table="source", destination_table=s3_table_name, node=self.context.node) - + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + with And("I read data from both tables"): source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered(table_name=s3_table_name, node=self.context.node) - + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + with Then("They should be the same"): assert source_data == destination_data, error() @@ -83,15 +117,29 @@ def empty_table(self): """Test exporting parts from an empty table.""" with Given("I create empty source and S3 tables"): - partitioned_merge_tree_table(table_name="empty_source", partition_by="p", columns=self.context.default_columns, stop_merges=True, populate=False) + partitioned_merge_tree_table( + table_name="empty_source", + partition_by="p", + columns=self.context.default_columns, + stop_merges=True, + populate=False, + ) s3_table_name = create_s3_table(table_name="empty_s3", create_new_bucket=True) with When("I export parts to the S3 table"): - export_parts(source_table="empty_source", destination_table=s3_table_name, node=self.context.node) - + export_parts( + source_table="empty_source", + destination_table=s3_table_name, + node=self.context.node, + ) + with And("I read data from both tables"): - source_data = select_all_ordered(table_name="empty_source", node=self.context.node) - destination_data = select_all_ordered(table_name=s3_table_name, node=self.context.node) + source_data = select_all_ordered( + table_name="empty_source", node=self.context.node + ) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) with Then("They should be empty"): assert source_data == "", error() @@ -105,4 +153,4 @@ def feature(self): Scenario(run=empty_table) Scenario(run=basic_table) - Scenario(run=table_combos) \ No newline at end of file + Scenario(run=table_combos) diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 9d41210d3..54729cecc 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -21,12 +21,17 @@ def create_temp_bucket(self): @TestStep(Given) -def create_s3_table(self, table_name, cluster=None, create_new_bucket=False): +def create_s3_table( + self, table_name, cluster=None, create_new_bucket=False, columns=None +): """Create a destination S3 table.""" - + if create_new_bucket: create_temp_bucket() + if columns is None: + columns = self.context.default_columns + table_name = f"{table_name}_{getuid()}" engine = f""" S3( @@ -43,7 +48,7 @@ def create_s3_table(self, table_name, cluster=None, create_new_bucket=False): # TODO columns and partition_by are hardcoded for now, but i should make them configurable create_table( table_name=table_name, - columns=self.context.default_columns, + columns=columns, partition_by="p", engine=engine, cluster=cluster, @@ -56,7 +61,9 @@ def create_s3_table(self, table_name, cluster=None, create_new_bucket=False): def 
get_parts(self, table_name, node): """Get all parts for a table on a given node.""" - output = node.query(f"SELECT name FROM system.parts WHERE table = '{table_name}'", exitcode=0).output + output = node.query( + f"SELECT name FROM system.parts WHERE table = '{table_name}'", exitcode=0 + ).output return [row.strip() for row in output.splitlines()] @@ -76,11 +83,11 @@ def export_parts(self, source_table, destination_table, node, parts=None, exitco no_checks = exitcode != 0 for part in parts: - node.query(# we should be able to set the settings here instead of using the SET query, but this is a quick workaround for the bug + node.query( # we should be able to set the settings here instead of using the SET query, but this is a quick workaround for the bug f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", # settings=[("allow_experimental_export_merge_tree_part", 1)], exitcode=exitcode, - no_checks=no_checks + no_checks=no_checks, ) @@ -89,16 +96,21 @@ def export_parts(self, source_table, destination_table, node, parts=None, exitco def get_export_events(self, node): """Get the export data from the system.events table of a given node.""" - output = node.query("SELECT name, value FROM system.events WHERE name LIKE '%%Export%%' FORMAT JSONEachRow", exitcode=0).output + output = node.query( + "SELECT name, value FROM system.events WHERE name LIKE '%%Export%%' FORMAT JSONEachRow", + exitcode=0, + ).output # return {row.name: int(row.value) for row in json.loads(output)} # return [json.loads(row) for row in output.splitlines()] return output @TestStep(Then) -def source_matches_destination(self, source_table, destination_table, source_node, destination_node): +def source_matches_destination( + self, source_table, destination_table, source_node, destination_node +): """Check that source and destination table data matches.""" source_data = select_all_ordered(source_table, source_node) destination_data = select_all_ordered(destination_table, destination_node) - assert source_data == destination_data, error() \ No newline at end of file + assert source_data == destination_data, error() From 7471a8fa1fd7bae04e3dc155123e8f238e8ef4f1 Mon Sep 17 00:00:00 2001 From: Selfeer Date: Mon, 27 Oct 2025 17:42:25 +0400 Subject: [PATCH 22/99] update graphite rollup example in s3 --- s3/configs/clickhouse/config.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/s3/configs/clickhouse/config.xml b/s3/configs/clickhouse/config.xml index 0fee8bc4b..c40c589a9 100644 --- a/s3/configs/clickhouse/config.xml +++ b/s3/configs/clickhouse/config.xml @@ -416,7 +416,7 @@ 86400 - 60 + 7200 From ff2fc53980566450f4199467d6ce19a06ae81324 Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Mon, 27 Oct 2025 12:39:01 -0400 Subject: [PATCH 23/99] basic clusters tests --- helpers/create.py | 7 +- s3/tests/export_part/clusters_and_nodes.py | 144 +++++++-------------- s3/tests/export_part/feature.py | 2 +- s3/tests/export_part/steps.py | 16 +++ 4 files changed, 70 insertions(+), 99 deletions(-) diff --git a/helpers/create.py b/helpers/create.py index 3ae32478b..7fa1d4c77 100644 --- a/helpers/create.py +++ b/helpers/create.py @@ -64,7 +64,7 @@ def create_table( query += f"{table_name}" else: query += f" {table_name}" - + if cluster: query += f" ON CLUSTER {cluster}" @@ -117,7 +117,10 @@ def create_table( finally: with Finally(f"drop the table {table_name}"): - node.query(f"DROP TABLE IF EXISTS {table_name}") + query = f"DROP TABLE IF EXISTS 
{table_name}" + if cluster: + query += f" ON CLUSTER {cluster}" + node.query(query) return query diff --git a/s3/tests/export_part/clusters_and_nodes.py b/s3/tests/export_part/clusters_and_nodes.py index ff7fef1e3..05ecd0310 100644 --- a/s3/tests/export_part/clusters_and_nodes.py +++ b/s3/tests/export_part/clusters_and_nodes.py @@ -5,101 +5,53 @@ from testflows.asserts import error from s3.tests.export_part.steps import * +from alter.table.replace_partition.common import create_partitions_with_random_uint64 @TestScenario def different_nodes_same_destination(self, cluster, node1, node2): """Test export part from different nodes to same S3 destination in a given cluster.""" - with Given("I create tables on different nodes"): - source1, shared_destination = create_source_and_destination_tables(cluster=cluster, node=node1) - source2, _ = create_source_and_destination_tables(cluster=cluster, node=node2) - - with When("I insert test data into the source tables"): - source1.insert_test_data(random=random.Random(1), node=node1) - source2.insert_test_data(random=random.Random(2), node=node2) - - with And("I export parts from both nodes"): - parts1 = source1.get_parts(node=node1) - parts2 = source2.get_parts(node=node2) - events_before_node1 = export_events(node=node1) - events_before_node2 = export_events(node=node2) - export_part(parts=parts1, source=source1, destination=shared_destination) - export_part(parts=parts2, source=source2, destination=shared_destination) - - with Then("I check system.events that all exports are successful"): - events_after_node1 = export_events(node=node1) - events_after_node2 = export_events(node=node2) - total_exports_after = events_after.get("PartsExports", 0) + events_after.get("PartsExportDuplicated", 0) - total_exports_before = events_before.get("PartsExports", 0) + events_before.get("PartsExportDuplicated", 0) - assert total_exports_after == total_exports_before + len(parts1) + len(parts2), error() - - with And("I verify data from both nodes appear in S3"): - destination_data = shared_destination.select_ordered_by_partition_and_index() - for part in parts1: - assert part.split("_")[0] in destination_data, error() - for part in parts2: - assert part.split("_")[0] in destination_data, error() - - -@TestScenario -def different_nodes_different_destinations(self, cluster, node1, node2): - """Test export part from different nodes to different S3 destinations.""" - - with Given("I create tables on different nodes with same part names"): - source1, destination1 = create_source_and_destination_tables(cluster=cluster, node=node1) - source2, destination2 = create_source_and_destination_tables(cluster=cluster, node=node2) - - with When("I insert test data into the source tables"): - source1.insert_test_data(random=random.Random(1)) - source2.insert_test_data(random=random.Random(2)) - - with And("I export parts from both nodes to separate destinations"): - parts1 = source1.get_parts() - parts2 = source2.get_parts() - events_before = export_events() - export_part(parts=parts1, source=source1, destination=destination1) - export_part(parts=parts2, source=source2, destination=destination2) - - with Then("I check system.events that all exports are successful"): - events_after = export_events() - total_exports_after = events_after.get("PartsExports", 0) + events_after.get("PartsExportDuplicated", 0) - total_exports_before = events_before.get("PartsExports", 0) + events_before.get("PartsExportDuplicated", 0) - assert total_exports_after == total_exports_before + len(parts1) + 
len(parts2), error() + with Given("I create an empty source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=self.context.default_columns, + stop_merges=True, + populate=False, + cluster=cluster, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True, cluster=cluster) + + with And("I populate the source tables on both nodes"): + create_partitions_with_random_uint64(table_name="source", node=node1) + create_partitions_with_random_uint64(table_name="source", node=node2) - with And("I verify data from both nodes appear in separate destinations"): - data1 = destination1.select_ordered_by_partition_and_index() - data2 = destination2.select_ordered_by_partition_and_index() - - with By("Checking data from both nodes appear in the right destinations"): - for part in parts1: - assert part.split("_")[0] in data1, error() - for part in parts2: - assert part.split("_")[0] in data2, error() - - with And("Checking data from both nodes do not appear in the wrong destinations"): - unique_parts1 = list(set(parts1) - set(parts2)) - unique_parts2 = list(set(parts2) - set(parts1)) - for part in unique_parts1: - assert part.split("_")[0] not in data2, error() - for part in unique_parts2: - assert part.split("_")[0] not in data1, error() + with When("I export parts to the S3 table from both nodes"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=node1, + ) + export_parts( + source_table="source", + destination_table=s3_table_name, + node=node2, + ) + with And("I read data from all tables on both nodes"): + source_data1 = select_all_ordered(table_name="source", node=node1) + source_data2 = select_all_ordered(table_name="source", node=node2) + destination_data1 = select_all_ordered( + table_name=s3_table_name, node=node1 + ) + destination_data2 = select_all_ordered( + table_name=s3_table_name, node=node2 + ) -# I need to get the nodes from a cluster; is this the right way to do it? 
-def get_cluster_nodes(cluster, node=None): - """Get all nodes in a cluster.""" - - if node is None: - node = current().context.node - - result = node.query( - f"SELECT host_name FROM system.clusters WHERE cluster = '{cluster}'", - exitcode=0 - ) - - nodes = [line.strip() for line in result.output.splitlines() if line.strip()] - return nodes + with Then("Destination data should be comprised of data from both sources, and identical on both nodes"): + assert set(destination_data1) == set(source_data1) | set(source_data2), error() + assert set(destination_data2) == set(source_data1) | set(source_data2), error() @TestFeature @@ -108,21 +60,21 @@ def feature(self): """Check functionality of exporting data parts to S3 storage from different clusters and nodes.""" clusters = [ - # "sharded_cluster", - # "replicated_cluster", - # "one_shard_cluster", - # "sharded_cluster12", - # "one_shard_cluster12", + "sharded_cluster", + "replicated_cluster", + "one_shard_cluster", + "sharded_cluster12", + "one_shard_cluster12", "sharded_cluster23", - # "one_shard_cluster23", + "one_shard_cluster23", ] for cluster in clusters: - node_names = get_cluster_nodes(cluster=cluster) + with Given(f"I get nodes for cluster {cluster}"): + node_names = get_cluster_nodes(cluster=cluster) for node1_name, node2_name in combinations(node_names, 2): node1 = self.context.cluster.node(node1_name) node2 = self.context.cluster.node(node2_name) - note(f"Testing {cluster} with nodes {node1_name} and {node2_name}") - # different_nodes_same_destination(cluster=cluster, node1=node1, node2=node2) - # different_nodes_different_destinations(cluster=cluster, node1=node1, node2=node2) \ No newline at end of file + # with When(f"I export parts from nodes {node1_name} and {node2_name} to S3 in cluster {cluster}"): + different_nodes_same_destination(cluster=cluster, node1=node1, node2=node2) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 10dca1849..6e25c725d 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -16,4 +16,4 @@ def minio(self, uri, bucket_prefix): Feature(run=load("s3.tests.export_part.sanity", "feature")) # Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) \ No newline at end of file + Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) \ No newline at end of file diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 54729cecc..746aada36 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -57,6 +57,22 @@ def create_s3_table( return table_name +@TestStep(Given) +def get_cluster_nodes(self, cluster, node=None): + """Get all nodes in a cluster.""" + + if node is None: + node = self.context.node + + result = node.query( + f"SELECT host_name FROM system.clusters WHERE cluster = '{cluster}'", + exitcode=0 + ) + + nodes = [line.strip() for line in result.output.splitlines() if line.strip()] + return nodes + + @TestStep(When) def get_parts(self, table_name, node): """Get all parts for a table on a given node.""" From e5596729eb83e9c6971bfcd25ffcc417f60e28be Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Mon, 27 Oct 2025 13:01:38 -0400 Subject: [PATCH 24/99] Move engine tests, move default_columns function --- s3/tests/export_part/clusters_and_nodes.py | 6 +- s3/tests/export_part/engines.py | 71 ++++++++++++++++++++ 
s3/tests/export_part/feature.py | 7 +- s3/tests/export_part/sanity.py | 78 +--------------------- s3/tests/export_part/steps.py | 17 ++++- 5 files changed, 91 insertions(+), 88 deletions(-) create mode 100644 s3/tests/export_part/engines.py diff --git a/s3/tests/export_part/clusters_and_nodes.py b/s3/tests/export_part/clusters_and_nodes.py index 05ecd0310..de379478b 100644 --- a/s3/tests/export_part/clusters_and_nodes.py +++ b/s3/tests/export_part/clusters_and_nodes.py @@ -1,9 +1,8 @@ import random -from itertools import combinations +from itertools import combinations from testflows.core import * from testflows.asserts import error - from s3.tests.export_part.steps import * from alter.table.replace_partition.common import create_partitions_with_random_uint64 @@ -16,7 +15,7 @@ def different_nodes_same_destination(self, cluster, node1, node2): partitioned_merge_tree_table( table_name="source", partition_by="p", - columns=self.context.default_columns, + columns=default_columns(), stop_merges=True, populate=False, cluster=cluster, @@ -76,5 +75,4 @@ def feature(self): for node1_name, node2_name in combinations(node_names, 2): node1 = self.context.cluster.node(node1_name) node2 = self.context.cluster.node(node2_name) - # with When(f"I export parts from nodes {node1_name} and {node2_name} to S3 in cluster {cluster}"): different_nodes_same_destination(cluster=cluster, node1=node1, node2=node2) diff --git a/s3/tests/export_part/engines.py b/s3/tests/export_part/engines.py new file mode 100644 index 000000000..6086c8acb --- /dev/null +++ b/s3/tests/export_part/engines.py @@ -0,0 +1,71 @@ +from testflows.core import * +from testflows.asserts import error +from s3.tests.export_part.steps import * + + +@TestScenario +def configured_table(self, table_engine, number_of_partitions, number_of_parts): + """Test a specific combination of table engine, number of partitions, and number of parts.""" + + with Given("I create a populated source table and empty S3 table"): + table_engine( + table_name="source", + partition_by="p", + stop_merges=True, + populate=True, + number_of_partitions=number_of_partitions, + number_of_parts=number_of_parts, + columns=default_columns(), + ) + s3_table_name = create_s3_table( + table_name="s3", create_new_bucket=True, columns=default_columns() + ) + + with When("I export parts to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + with And("I read data from both tables"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + + with Then("They should be the same"): + assert source_data == destination_data, error() + + +@TestSketch(Scenario) +@Flags(TE) +def table_combos(self): + """Test various combinations of table engines, number of partitions, and number of parts.""" + + tables = [ + partitioned_merge_tree_table, + partitioned_replacing_merge_tree_table, + partitioned_summing_merge_tree_table, + partitioned_collapsing_merge_tree_table, + partitioned_versioned_collapsing_merge_tree_table, + partitioned_aggregating_merge_tree_table, + partitioned_graphite_merge_tree_table, + ] + # TODO expand combos + number_of_partitions = [5] + number_of_parts = [1] + + configured_table( + table_engine=either(*tables), + number_of_partitions=either(*number_of_partitions), + number_of_parts=either(*number_of_parts), + ) + + +@TestFeature +@Name("engines") +def feature(self): + """Check exporting 
parts to S3 storage with different table engines.""" + + Scenario(run=table_combos) \ No newline at end of file diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 6e25c725d..e674c4b6b 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -8,12 +8,9 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - self.context.default_columns = [ - {"name": "p", "type": "Int8"}, - {"name": "i", "type": "UInt64"}, - ] Feature(run=load("s3.tests.export_part.sanity", "feature")) # Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) \ No newline at end of file + Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) + Feature(run=load("s3.tests.export_part.engines", "feature")) \ No newline at end of file diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index fb80b3952..a710ea936 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -1,6 +1,5 @@ from testflows.core import * from testflows.asserts import error - from s3.tests.export_part.steps import * from helpers.create import * @@ -8,79 +7,6 @@ # TODO large data export? or maybe that should be in a different file -def columns(): - partition_columns = [ - {"name": "p", "type": "Int8"}, - {"name": "i", "type": "UInt64"}, - {"name": "Path", "type": "String"}, - {"name": "Time", "type": "DateTime"}, - {"name": "Value", "type": "Float64"}, - {"name": "Timestamp", "type": "Int64"}, - ] - - return partition_columns - - -@TestScenario -def configured_table(self, table_engine, number_of_partitions, number_of_parts): - """Test a specific combination of table engine, number of partitions, and number of parts.""" - - with Given("I create a populated source table and empty S3 table"): - table_engine( - table_name="source", - partition_by="p", - stop_merges=True, - populate=True, - number_of_partitions=number_of_partitions, - number_of_parts=number_of_parts, - columns=columns(), - ) - s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True, columns=columns() - ) - - with When("I export parts to the S3 table"): - export_parts( - source_table="source", - destination_table=s3_table_name, - node=self.context.node, - ) - - with And("I read data from both tables"): - source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered( - table_name=s3_table_name, node=self.context.node - ) - - with Then("They should be the same"): - assert source_data == destination_data, error() - - -@TestSketch(Scenario) -@Flags(TE) -def table_combos(self): - """Test various combinations of table engines, number of partitions, and number of parts.""" - - tables = [ - partitioned_merge_tree_table, - partitioned_replacing_merge_tree_table, - partitioned_summing_merge_tree_table, - partitioned_collapsing_merge_tree_table, - partitioned_versioned_collapsing_merge_tree_table, - partitioned_aggregating_merge_tree_table, - partitioned_graphite_merge_tree_table, - ] - # TODO expand combos - number_of_partitions = [5] - number_of_parts = [1] - - configured_table( - table_engine=either(*tables), - number_of_partitions=either(*number_of_partitions), - number_of_parts=either(*number_of_parts), - ) - - @TestScenario def basic_table(self): """Test exporting parts of a basic 
table.""" @@ -89,7 +15,7 @@ def basic_table(self): partitioned_merge_tree_table( table_name="source", partition_by="p", - columns=self.context.default_columns, + columns=default_columns(), stop_merges=True, populate=True, ) @@ -120,7 +46,7 @@ def empty_table(self): partitioned_merge_tree_table( table_name="empty_source", partition_by="p", - columns=self.context.default_columns, + columns=default_columns(), stop_merges=True, populate=False, ) diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 746aada36..8b1c7d5d8 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -7,6 +7,19 @@ from s3.tests.common import temporary_bucket_path +def default_columns(): + partition_columns = [ + {"name": "p", "type": "Int8"}, + {"name": "i", "type": "UInt64"}, + {"name": "Path", "type": "String"}, + {"name": "Time", "type": "DateTime"}, + {"name": "Value", "type": "Float64"}, + {"name": "Timestamp", "type": "Int64"}, + ] + + return partition_columns + + @TestStep(Given) def create_temp_bucket(self): """Create temporary S3 bucket.""" @@ -16,8 +29,6 @@ def create_temp_bucket(self): ) self.context.uri = f"{self.context.uri_base}export_part/{temp_s3_path}/" - # Delete the next line if the context var is never used - # self.context.bucket_path = f"{self.context.bucket_prefix}/export_part/{temp_s3_path}" @TestStep(Given) @@ -30,7 +41,7 @@ def create_s3_table( create_temp_bucket() if columns is None: - columns = self.context.default_columns + columns = default_columns() table_name = f"{table_name}_{getuid()}" engine = f""" From 9ad790d86bb2e720319529f9bc16ceb077cffd6d Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Mon, 27 Oct 2025 13:05:20 -0400 Subject: [PATCH 25/99] Sanity fix --- s3/tests/export_part/sanity.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index a710ea936..2ee07ffb5 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -78,5 +78,4 @@ def feature(self): """Check basic functionality of exporting data parts to S3 storage.""" Scenario(run=empty_table) - Scenario(run=basic_table) - Scenario(run=table_combos) + Scenario(run=basic_table) \ No newline at end of file From 5312d752b77bce0bb00418f9b5dd5ee1703df0a5 Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Mon, 27 Oct 2025 13:14:45 -0400 Subject: [PATCH 26/99] black everything --- helpers/create.py | 171 +++++++++++++++++---- s3/tests/export_part/clusters_and_nodes.py | 24 +-- s3/tests/export_part/engines.py | 2 +- s3/tests/export_part/error_handling.py | 15 +- s3/tests/export_part/feature.py | 8 +- s3/tests/export_part/sanity.py | 13 +- s3/tests/export_part/steps.py | 18 ++- s3/tests/export_part/system_monitoring.py | 9 +- 8 files changed, 198 insertions(+), 62 deletions(-) diff --git a/helpers/create.py b/helpers/create.py index 7fa1d4c77..b4cfe605c 100644 --- a/helpers/create.py +++ b/helpers/create.py @@ -1,7 +1,9 @@ from testflows.core import * -from alter.table.replace_partition.common import create_partitions_with_random_uint64, \ - create_partitions_for_collapsing_merge_tree +from alter.table.replace_partition.common import ( + create_partitions_with_random_uint64, + create_partitions_for_collapsing_merge_tree, +) @TestStep(Given) @@ -64,7 +66,7 @@ def create_table( query += f"{table_name}" else: query += f" {table_name}" - + if cluster: query += f" ON CLUSTER {cluster}" @@ -111,7 +113,7 @@ def create_table( if stop_merges: query += f" SYSTEM STOP MERGES {table_name};" 
- + node.query(query) yield @@ -391,92 +393,179 @@ def create_replicated_merge_tree_table( @TestStep(Given) def partitioned_merge_tree_table( - self, table_name, partition_by, columns, cluster=None, stop_merges=False, populate=True, number_of_partitions=5, number_of_parts=1 + self, + table_name, + partition_by, + columns, + cluster=None, + stop_merges=False, + populate=True, + number_of_partitions=5, + number_of_parts=1, ): """Create a MergeTree table partitioned by a specific column.""" with By(f"creating a partitioned {table_name} table with a MergeTree engine"): create_merge_tree_table( - table_name=table_name, columns=columns, partition_by=partition_by, cluster=cluster, stop_merges=stop_merges + table_name=table_name, + columns=columns, + partition_by=partition_by, + cluster=cluster, + stop_merges=stop_merges, ) if populate: with And("populating it with the data needed to create multiple partitions"): - create_partitions_with_random_uint64(table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts) + create_partitions_with_random_uint64( + table_name=table_name, + number_of_partitions=number_of_partitions, + number_of_parts=number_of_parts, + ) @TestStep(Given) def partitioned_replicated_merge_tree_table( - self, table_name, partition_by, columns=None, cluster=None, stop_merges=False, populate=True, number_of_partitions=5, number_of_parts=1 + self, + table_name, + partition_by, + columns=None, + cluster=None, + stop_merges=False, + populate=True, + number_of_partitions=5, + number_of_parts=1, ): """Create a ReplicatedMergeTree table partitioned by a specific column.""" with By( f"creating a partitioned {table_name} table with a ReplicatedMergeTree engine" ): create_replicated_merge_tree_table( - table_name=table_name, columns=columns, partition_by=partition_by, cluster=cluster, stop_merges=stop_merges + table_name=table_name, + columns=columns, + partition_by=partition_by, + cluster=cluster, + stop_merges=stop_merges, ) if populate: with And("populating it with the data needed to create multiple partitions"): - create_partitions_with_random_uint64(table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts) + create_partitions_with_random_uint64( + table_name=table_name, + number_of_partitions=number_of_partitions, + number_of_parts=number_of_parts, + ) @TestStep(Given) def partitioned_replacing_merge_tree_table( - self, table_name, partition_by, columns, cluster=None, stop_merges=False, populate=True, number_of_partitions=5, number_of_parts=1 + self, + table_name, + partition_by, + columns, + cluster=None, + stop_merges=False, + populate=True, + number_of_partitions=5, + number_of_parts=1, ): """Create a ReplacingMergeTree table partitioned by a specific column.""" with By( f"creating a partitioned {table_name} table with a ReplacingMergeTree engine" ): create_replacing_merge_tree_table( - table_name=table_name, columns=columns, partition_by=partition_by, cluster=cluster, stop_merges=stop_merges + table_name=table_name, + columns=columns, + partition_by=partition_by, + cluster=cluster, + stop_merges=stop_merges, ) if populate: with And("populating it with the data needed to create multiple partitions"): - create_partitions_with_random_uint64(table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts) + create_partitions_with_random_uint64( + table_name=table_name, + number_of_partitions=number_of_partitions, + number_of_parts=number_of_parts, + ) @TestStep(Given) def 
partitioned_summing_merge_tree_table( - self, table_name, partition_by, columns, cluster=None, stop_merges=False, populate=True, number_of_partitions=5, number_of_parts=1 + self, + table_name, + partition_by, + columns, + cluster=None, + stop_merges=False, + populate=True, + number_of_partitions=5, + number_of_parts=1, ): """Create a SummingMergeTree table partitioned by a specific column.""" with By( f"creating a partitioned {table_name} table with a SummingMergeTree engine" ): create_aggregating_merge_tree_table( - table_name=table_name, columns=columns, partition_by=partition_by, cluster=cluster, stop_merges=stop_merges + table_name=table_name, + columns=columns, + partition_by=partition_by, + cluster=cluster, + stop_merges=stop_merges, ) if populate: with And("populating it with the data needed to create multiple partitions"): - create_partitions_with_random_uint64(table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts) + create_partitions_with_random_uint64( + table_name=table_name, + number_of_partitions=number_of_partitions, + number_of_parts=number_of_parts, + ) @TestStep(Given) def partitioned_collapsing_merge_tree_table( - self, table_name, partition_by, columns, cluster=None, stop_merges=False, populate=True, number_of_partitions=1, number_of_parts=1 + self, + table_name, + partition_by, + columns, + cluster=None, + stop_merges=False, + populate=True, + number_of_partitions=1, + number_of_parts=1, ): """Create a CollapsingMergeTree table partitioned by a specific column.""" with By( f"creating a partitioned {table_name} table with a CollapsingMergeTree engine" ): create_collapsing_merge_tree_table( - table_name=table_name, columns=columns, partition_by=partition_by, sign="p", cluster=cluster, stop_merges=stop_merges + table_name=table_name, + columns=columns, + partition_by=partition_by, + sign="p", + cluster=cluster, + stop_merges=stop_merges, ) if populate: with And("populating it with the data needed to create multiple partitions"): create_partitions_for_collapsing_merge_tree( - table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts + table_name=table_name, + number_of_partitions=number_of_partitions, + number_of_parts=number_of_parts, ) @TestStep(Given) def partitioned_versioned_collapsing_merge_tree_table( - self, table_name, partition_by, columns, cluster=None, stop_merges=False, populate=True, number_of_partitions=1, number_of_parts=1 + self, + table_name, + partition_by, + columns, + cluster=None, + stop_merges=False, + populate=True, + number_of_partitions=1, + number_of_parts=1, ): """Create a VersionedCollapsingMergeTree table partitioned by a specific column.""" with By( @@ -495,30 +584,56 @@ def partitioned_versioned_collapsing_merge_tree_table( if populate: with And("populating it with the data needed to create multiple partitions"): create_partitions_with_random_uint64( - table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts + table_name=table_name, + number_of_partitions=number_of_partitions, + number_of_parts=number_of_parts, ) @TestStep(Given) def partitioned_aggregating_merge_tree_table( - self, table_name, partition_by, columns, cluster=None, stop_merges=False, populate=True, number_of_partitions=5, number_of_parts=1 + self, + table_name, + partition_by, + columns, + cluster=None, + stop_merges=False, + populate=True, + number_of_partitions=5, + number_of_parts=1, ): """Create a AggregatingMergeTree table partitioned by a specific 
column.""" with By( f"creating a partitioned {table_name} table with a AggregatingMergeTree engine" ): create_summing_merge_tree_table( - table_name=table_name, columns=columns, partition_by=partition_by, cluster=cluster, stop_merges=stop_merges + table_name=table_name, + columns=columns, + partition_by=partition_by, + cluster=cluster, + stop_merges=stop_merges, ) if populate: with And("populating it with the data needed to create multiple partitions"): - create_partitions_with_random_uint64(table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts) + create_partitions_with_random_uint64( + table_name=table_name, + number_of_partitions=number_of_partitions, + number_of_parts=number_of_parts, + ) @TestStep(Given) def partitioned_graphite_merge_tree_table( - self, table_name, partition_by, columns, cluster=None, stop_merges=False, populate=True, number_of_partitions=5, number_of_parts=1 + self, + table_name, + partition_by, + columns, + cluster=None, + stop_merges=False, + populate=True, + number_of_partitions=5, + number_of_parts=1, ): """Create a GraphiteMergeTree table partitioned by a specific column.""" with By( @@ -535,4 +650,8 @@ def partitioned_graphite_merge_tree_table( if populate: with And("populating it with the data needed to create multiple partitions"): - create_partitions_with_random_uint64(table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts) + create_partitions_with_random_uint64( + table_name=table_name, + number_of_partitions=number_of_partitions, + number_of_parts=number_of_parts, + ) diff --git a/s3/tests/export_part/clusters_and_nodes.py b/s3/tests/export_part/clusters_and_nodes.py index de379478b..cba836011 100644 --- a/s3/tests/export_part/clusters_and_nodes.py +++ b/s3/tests/export_part/clusters_and_nodes.py @@ -10,7 +10,7 @@ @TestScenario def different_nodes_same_destination(self, cluster, node1, node2): """Test export part from different nodes to same S3 destination in a given cluster.""" - + with Given("I create an empty source table and empty S3 table"): partitioned_merge_tree_table( table_name="source", @@ -20,8 +20,10 @@ def different_nodes_same_destination(self, cluster, node1, node2): populate=False, cluster=cluster, ) - s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True, cluster=cluster) - + s3_table_name = create_s3_table( + table_name="s3", create_new_bucket=True, cluster=cluster + ) + with And("I populate the source tables on both nodes"): create_partitions_with_random_uint64(table_name="source", node=node1) create_partitions_with_random_uint64(table_name="source", node=node2) @@ -41,14 +43,12 @@ def different_nodes_same_destination(self, cluster, node1, node2): with And("I read data from all tables on both nodes"): source_data1 = select_all_ordered(table_name="source", node=node1) source_data2 = select_all_ordered(table_name="source", node=node2) - destination_data1 = select_all_ordered( - table_name=s3_table_name, node=node1 - ) - destination_data2 = select_all_ordered( - table_name=s3_table_name, node=node2 - ) + destination_data1 = select_all_ordered(table_name=s3_table_name, node=node1) + destination_data2 = select_all_ordered(table_name=s3_table_name, node=node2) - with Then("Destination data should be comprised of data from both sources, and identical on both nodes"): + with Then( + "Destination data should be comprised of data from both sources, and identical on both nodes" + ): assert set(destination_data1) == set(source_data1) | 
set(source_data2), error() assert set(destination_data2) == set(source_data1) | set(source_data2), error() @@ -57,7 +57,7 @@ def different_nodes_same_destination(self, cluster, node1, node2): @Name("clusters and nodes") def feature(self): """Check functionality of exporting data parts to S3 storage from different clusters and nodes.""" - + clusters = [ "sharded_cluster", "replicated_cluster", @@ -71,7 +71,7 @@ def feature(self): for cluster in clusters: with Given(f"I get nodes for cluster {cluster}"): node_names = get_cluster_nodes(cluster=cluster) - + for node1_name, node2_name in combinations(node_names, 2): node1 = self.context.cluster.node(node1_name) node2 = self.context.cluster.node(node2_name) diff --git a/s3/tests/export_part/engines.py b/s3/tests/export_part/engines.py index 6086c8acb..731f52929 100644 --- a/s3/tests/export_part/engines.py +++ b/s3/tests/export_part/engines.py @@ -68,4 +68,4 @@ def table_combos(self): def feature(self): """Check exporting parts to S3 storage with different table engines.""" - Scenario(run=table_combos) \ No newline at end of file + Scenario(run=table_combos) diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index db09d0d7a..70f7ebcf1 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -13,17 +13,24 @@ def invalid_part_name(self): source, destination = create_source_and_destination_tables() with When("I insert random test data into the source table"): - source.insert_test_data() # default row_count=10, cardinality=1 + source.insert_test_data() # default row_count=10, cardinality=1 with And("I create an invalid part name"): invalid_part_name = "in_va_lid_part" with Then("I try to export the invalid part and expect an error"): - results = export_part(parts=[invalid_part_name], source=source, destination=destination, exitcode=1) + results = export_part( + parts=[invalid_part_name], + source=source, + destination=destination, + exitcode=1, + ) assert len(results) == 1, error() # note(f"Result: {results[0].output}") assert results[0].exitcode == 233, error() - assert f"Unexpected part name: {invalid_part_name}" in results[0].output, error() + assert ( + f"Unexpected part name: {invalid_part_name}" in results[0].output + ), error() @TestFeature @@ -31,4 +38,4 @@ def invalid_part_name(self): def feature(self): """Check correct error handling when exporting parts.""" - Scenario(run=invalid_part_name) \ No newline at end of file + Scenario(run=invalid_part_name) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index e674c4b6b..3659af444 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -9,8 +9,8 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - Feature(run=load("s3.tests.export_part.sanity", "feature")) - # Feature(run=load("s3.tests.export_part.error_handling", "feature")) + # Feature(run=load("s3.tests.export_part.sanity", "feature")) + Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) - Feature(run=load("s3.tests.export_part.engines", "feature")) \ No newline at end of file + # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) + # Feature(run=load("s3.tests.export_part.engines", "feature")) diff --git a/s3/tests/export_part/sanity.py 
b/s3/tests/export_part/sanity.py index 2ee07ffb5..85e8155a8 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -7,6 +7,13 @@ # TODO large data export? or maybe that should be in a different file +@TestScenario +def mismatched_columns(self): + """Test exporting parts when source and destination tables have mismatched columns.""" + + # with Given() + + @TestScenario def basic_table(self): """Test exporting parts of a basic table.""" @@ -15,7 +22,7 @@ def basic_table(self): partitioned_merge_tree_table( table_name="source", partition_by="p", - columns=default_columns(), + columns=default_columns(simple=True), stop_merges=True, populate=True, ) @@ -46,7 +53,7 @@ def empty_table(self): partitioned_merge_tree_table( table_name="empty_source", partition_by="p", - columns=default_columns(), + columns=default_columns(simple=True), stop_merges=True, populate=False, ) @@ -78,4 +85,4 @@ def feature(self): """Check basic functionality of exporting data parts to S3 storage.""" Scenario(run=empty_table) - Scenario(run=basic_table) \ No newline at end of file + Scenario(run=basic_table) diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 8b1c7d5d8..b081abf1c 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -7,8 +7,8 @@ from s3.tests.common import temporary_bucket_path -def default_columns(): - partition_columns = [ +def default_columns(simple=False): + columns = [ {"name": "p", "type": "Int8"}, {"name": "i", "type": "UInt64"}, {"name": "Path", "type": "String"}, @@ -17,7 +17,10 @@ def default_columns(): {"name": "Timestamp", "type": "Int64"}, ] - return partition_columns + if simple: + columns = columns[:2] + + return columns @TestStep(Given) @@ -71,15 +74,14 @@ def create_s3_table( @TestStep(Given) def get_cluster_nodes(self, cluster, node=None): """Get all nodes in a cluster.""" - + if node is None: node = self.context.node - + result = node.query( - f"SELECT host_name FROM system.clusters WHERE cluster = '{cluster}'", - exitcode=0 + f"SELECT host_name FROM system.clusters WHERE cluster = '{cluster}'", exitcode=0 ) - + nodes = [line.strip() for line in result.output.splitlines() if line.strip()] return nodes diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index c089f5be8..81912e40a 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -6,6 +6,7 @@ # TODO checks on export_events should go here, not in sanity.py + @TestScenario def duplicate_exports(self): """Check duplicate export attempts are properly tracked in system.events.""" @@ -14,7 +15,7 @@ def duplicate_exports(self): source, destination = create_source_and_destination_tables() with When("I insert random test data into the source table"): - source.insert_test_data() # default row_count=10, cardinality=1 + source.insert_test_data() # default row_count=10, cardinality=1 with And("I get a list of parts for source table"): source_parts = source.get_parts() @@ -33,10 +34,10 @@ def duplicate_exports(self): events_final = export_events() final_exports = events_final.get("PartsExports", 0) final_duplicates = events_final.get("PartsExportDuplicated", 0) - + with By("Checking we have 1 successful export"): assert final_exports - initial_exports == 1, error() - + with And("Checking we have 1 duplicate export"): assert final_duplicates - initial_duplicates == 1, error() @@ -46,4 +47,4 @@ def duplicate_exports(self): def feature(self): """Check system monitoring 
of export events.""" - Scenario(run=duplicate_exports) \ No newline at end of file + Scenario(run=duplicate_exports) From a7daf8b4ef6befbd07cbc38140fc53ef1d9d2a67 Mon Sep 17 00:00:00 2001 From: julian Date: Mon, 27 Oct 2025 14:09:46 -0400 Subject: [PATCH 27/99] fixes --- s3/tests/export_part/feature.py | 8 ++++---- s3/tests/export_part/sanity.py | 8 ++++++-- s3/tests/export_part/steps.py | 9 +++++++-- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 3659af444..5060b55fd 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -9,8 +9,8 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - # Feature(run=load("s3.tests.export_part.sanity", "feature")) - Feature(run=load("s3.tests.export_part.error_handling", "feature")) + Feature(run=load("s3.tests.export_part.sanity", "feature")) + # Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) - # Feature(run=load("s3.tests.export_part.engines", "feature")) + Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) + Feature(run=load("s3.tests.export_part.engines", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 85e8155a8..28c24005f 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -26,7 +26,9 @@ def basic_table(self): stop_merges=True, populate=True, ) - s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + s3_table_name = create_s3_table( + table_name="s3", create_new_bucket=True, simple_columns=True + ) with When("I export parts to the S3 table"): export_parts( @@ -57,7 +59,9 @@ def empty_table(self): stop_merges=True, populate=False, ) - s3_table_name = create_s3_table(table_name="empty_s3", create_new_bucket=True) + s3_table_name = create_s3_table( + table_name="empty_s3", create_new_bucket=True, simple_columns=True + ) with When("I export parts to the S3 table"): export_parts( diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index b081abf1c..5b0667005 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -36,7 +36,12 @@ def create_temp_bucket(self): @TestStep(Given) def create_s3_table( - self, table_name, cluster=None, create_new_bucket=False, columns=None + self, + table_name, + cluster=None, + create_new_bucket=False, + columns=None, + simple_columns=False, ): """Create a destination S3 table.""" @@ -44,7 +49,7 @@ def create_s3_table( create_temp_bucket() if columns is None: - columns = default_columns() + columns = default_columns(simple=simple_columns) table_name = f"{table_name}_{getuid()}" engine = f""" From fcfb5115a82dd66afb9208a7cbfd355925c484d7 Mon Sep 17 00:00:00 2001 From: julian Date: Mon, 27 Oct 2025 15:38:15 -0400 Subject: [PATCH 28/99] sanity, add mismatched_columns test --- s3/tests/export_part/feature.py | 7 +++++-- s3/tests/export_part/sanity.py | 30 +++++++++++++++++++++++++----- s3/tests/export_part/steps.py | 8 ++++++-- 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 5060b55fd..93b231fa5 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -1,6 +1,9 @@ from testflows.core import * +# TODO large data 
export? which file should it go in? + + @TestFeature @Name("export parts") def minio(self, uri, bucket_prefix): @@ -12,5 +15,5 @@ def minio(self, uri, bucket_prefix): Feature(run=load("s3.tests.export_part.sanity", "feature")) # Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) - Feature(run=load("s3.tests.export_part.engines", "feature")) + # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) + # Feature(run=load("s3.tests.export_part.engines", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 28c24005f..3cd4a40f7 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -4,14 +4,33 @@ from helpers.create import * -# TODO large data export? or maybe that should be in a different file - - @TestScenario def mismatched_columns(self): """Test exporting parts when source and destination tables have mismatched columns.""" - # with Given() + with Given("I create a source table and S3 table with different columns"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(simple=True), + stop_merges=True, + populate=True, + ) + s3_table_name = create_s3_table( + table_name="s3", create_new_bucket=True, simple_columns=False + ) + + with When("I export parts to the S3 table"): + results = export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + exitcode=1, + ) + + with Then("I should see an error related to mismatched columns"): + assert results[0].exitcode == 122, error() + assert "Tables have different structure" in results[0].output, error() @TestScenario @@ -56,7 +75,7 @@ def empty_table(self): table_name="empty_source", partition_by="p", columns=default_columns(simple=True), - stop_merges=True, + stop_merges=False, populate=False, ) s3_table_name = create_s3_table( @@ -90,3 +109,4 @@ def feature(self): Scenario(run=empty_table) Scenario(run=basic_table) + Scenario(run=mismatched_columns) \ No newline at end of file diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 5b0667005..26cf909a5 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -116,13 +116,17 @@ def export_parts(self, source_table, destination_table, node, parts=None, exitco parts = get_parts(table_name=source_table, node=node) no_checks = exitcode != 0 + output = [] + for part in parts: - node.query( # we should be able to set the settings here instead of using the SET query, but this is a quick workaround for the bug + output.append(node.query( # we should be able to set the settings here instead of using the SET query, but this is a quick workaround for the bug f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", # settings=[("allow_experimental_export_merge_tree_part", 1)], exitcode=exitcode, no_checks=no_checks, - ) + )) + + return output # TODO find the simplest way to parse the output From c1b61d94c4472c3f9af909e190badd681704e8d6 Mon Sep 17 00:00:00 2001 From: julian Date: Mon, 27 Oct 2025 15:46:40 -0400 Subject: [PATCH 29/99] invalid_part_name rewrite, plus black --- s3/tests/export_part/error_handling.py | 30 +++++++++++++++----------- s3/tests/export_part/feature.py | 4 ++-- s3/tests/export_part/sanity.py | 2 +- s3/tests/export_part/steps.py | 16 
++++++++------ 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index 70f7ebcf1..1bccc5a81 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -1,6 +1,5 @@ from testflows.core import * from testflows.asserts import error - from s3.tests.export_part.steps import * from helpers.tables import * @@ -9,24 +8,31 @@ def invalid_part_name(self): """Check that exporting a non-existent part returns the correct error.""" - with Given("I create source and destination tables"): - source, destination = create_source_and_destination_tables() - - with When("I insert random test data into the source table"): - source.insert_test_data() # default row_count=10, cardinality=1 + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(simple=True), + stop_merges=True, + populate=True, + ) + s3_table_name = create_s3_table( + table_name="s3", create_new_bucket=True, simple_columns=True + ) with And("I create an invalid part name"): invalid_part_name = "in_va_lid_part" - with Then("I try to export the invalid part and expect an error"): - results = export_part( + with When("I try to export the invalid part"): + results = export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, parts=[invalid_part_name], - source=source, - destination=destination, exitcode=1, ) - assert len(results) == 1, error() - # note(f"Result: {results[0].output}") + + with Then("I should see an error related to the invalid part name"): assert results[0].exitcode == 233, error() assert ( f"Unexpected part name: {invalid_part_name}" in results[0].output diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 93b231fa5..5ef9a9c96 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -12,8 +12,8 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - Feature(run=load("s3.tests.export_part.sanity", "feature")) - # Feature(run=load("s3.tests.export_part.error_handling", "feature")) + # Feature(run=load("s3.tests.export_part.sanity", "feature")) + Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) # Feature(run=load("s3.tests.export_part.engines", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 3cd4a40f7..677245803 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -109,4 +109,4 @@ def feature(self): Scenario(run=empty_table) Scenario(run=basic_table) - Scenario(run=mismatched_columns) \ No newline at end of file + Scenario(run=mismatched_columns) diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 26cf909a5..4fb47aec4 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -119,13 +119,15 @@ def export_parts(self, source_table, destination_table, node, parts=None, exitco output = [] for part in parts: - output.append(node.query( # we should be able to set the settings here instead of using the SET query, but this is a quick workaround for the bug - f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source_table} EXPORT PART 
'{part}' TO TABLE {destination_table}", - # settings=[("allow_experimental_export_merge_tree_part", 1)], - exitcode=exitcode, - no_checks=no_checks, - )) - + output.append( + node.query( # we should be able to set the settings here instead of using the SET query, but this is a quick workaround for the bug + f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", + # settings=[("allow_experimental_export_merge_tree_part", 1)], + exitcode=exitcode, + no_checks=no_checks, + ) + ) + return output From 3950d7d1a60c67f5e814f2e16e54367db8bd80b2 Mon Sep 17 00:00:00 2001 From: julian Date: Mon, 27 Oct 2025 17:29:53 -0400 Subject: [PATCH 30/99] working tests --- s3/tests/export_part/feature.py | 6 +++--- s3/tests/export_part/system_monitoring.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 5ef9a9c96..59a1210fc 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -12,8 +12,8 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - # Feature(run=load("s3.tests.export_part.sanity", "feature")) + Feature(run=load("s3.tests.export_part.sanity", "feature")) Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) - # Feature(run=load("s3.tests.export_part.engines", "feature")) + Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) + Feature(run=load("s3.tests.export_part.engines", "feature")) diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index 81912e40a..0ff2d3712 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -5,7 +5,8 @@ # TODO checks on export_events should go here, not in sanity.py - +# partsexports incrementing correctly +# duplicates incrementing correctly @TestScenario def duplicate_exports(self): From 14b535cdb4c6959a62a9f7e867f4152f3ce3729a Mon Sep 17 00:00:00 2001 From: Selfeer Date: Tue, 28 Oct 2025 17:21:49 +0400 Subject: [PATCH 31/99] update requirements --- s3/requirements/export_part.md | 8 +- s3/requirements/export_part.py | 1207 ++++++++++++++++++++++++++++++++ 2 files changed, 1211 insertions(+), 4 deletions(-) create mode 100644 s3/requirements/export_part.py diff --git a/s3/requirements/export_part.md b/s3/requirements/export_part.md index babdbb19c..6891a2a47 100644 --- a/s3/requirements/export_part.md +++ b/s3/requirements/export_part.md @@ -11,7 +11,7 @@ * 4 [Supported source table engines](#supported-source-table-engines) * 4.1 [RQ.ClickHouse.ExportPart.SourceEngines](#rqclickhouseexportpartsourceengines) * 5 [Supported source part storage types](#supported-source-part-storage-types) - * 5.1 [RQ.ClickHouse.ExportPart.SourcePartStorage](#rqclickhouseexportpartSourcepartstorage) + * 5.1 [RQ.ClickHouse.ExportPart.SourcePartStorage](#rqclickhouseexportpartsourcepartstorage) * 6 [Supported destination table engines](#supported-destination-table-engines) * 6.1 [RQ.ClickHouse.ExportPart.DestinationEngines](#rqclickhouseexportpartdestinationengines) * 7 [Destination setup and file management](#destination-setup-and-file-management) @@ -24,8 +24,8 @@ * 10.1 [RQ.ClickHouse.ExportPart.PartitionKeyTypes](#rqclickhouseexportpartpartitionkeytypes) * 11 
[Part types and content support](#part-types-and-content-support) * 11.1 [RQ.ClickHouse.ExportPart.PartTypes](#rqclickhouseexportpartparttypes) -* 12 [Export operation failure recovery](#export-operation-failure-recovery) - * 12.1 [RQ.ClickHouse.ExportPart.FailureRecovery](#rqclickhouseexportpartfailurerecovery) +* 12 [Export operation failure handling](#export-operation-failure-handling) + * 12.1 [RQ.ClickHouse.ExportPart.FailureHandling](#rqclickhouseexportpartfailurehandling) * 13 [Export operation restrictions](#export-operation-restrictions) * 13.1 [Preventing same table exports](#preventing-same-table-exports) * 13.1.1 [RQ.ClickHouse.ExportPart.Restrictions.SameTable](#rqclickhouseexportpartrestrictionssametable) @@ -437,4 +437,4 @@ version: 1.0 * **Credential Management**: Export operations must use secure credential storage and avoid exposing credentials in logs -[ClickHouse]: https://clickhouse.com \ No newline at end of file +[ClickHouse]: https://clickhouse.com diff --git a/s3/requirements/export_part.py b/s3/requirements/export_part.py new file mode 100644 index 000000000..2b49a87e0 --- /dev/null +++ b/s3/requirements/export_part.py @@ -0,0 +1,1207 @@ +# These requirements were auto generated +# from software requirements specification (SRS) +# document by TestFlows v2.0.250110.1002922. +# Do not edit by hand but re-generate instead +# using 'tfs requirements generate' command. +from testflows.core import Specification +from testflows.core import Requirement + +Heading = Specification.Heading + +RQ_ClickHouse_ExportPart_S3 = Requirement( + name="RQ.ClickHouse.ExportPart.S3", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting data parts from MergeTree engine tables to S3 object storage.\n" + "\n" + ), + link=None, + level=2, + num="2.1", +) + +RQ_ClickHouse_ExportPart_SQLCommand = Requirement( + name="RQ.ClickHouse.ExportPart.SQLCommand", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support the following SQL command syntax for exporting MergeTree data parts to object storage tables:\n" + "\n" + "```sql\n" + "ALTER TABLE [database.]source_table_name \n" + "EXPORT PART 'part_name' \n" + "TO TABLE [database.]destination_table_name\n" + "```\n" + "\n" + "**Parameters:**\n" + "- `source_table_name`: Name of the source MergeTree table\n" + "- `part_name`: Name of the specific part to export (string literal)\n" + "- `destination_table_name`: Name of the destination object storage table\n" + "\n" + ), + link=None, + level=2, + num="3.1", +) + +RQ_ClickHouse_ExportPart_SourceEngines = Requirement( + name="RQ.ClickHouse.ExportPart.SourceEngines", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting from the following source table engines:\n" + "* `MergeTree` - Base MergeTree engine\n" + "* `ReplicatedMergeTree` - Replicated MergeTree engine with ZooKeeper coordination\n" + "* `SummingMergeTree` - MergeTree with automatic summation of numeric columns\n" + "* `AggregatingMergeTree` - MergeTree with pre-aggregated data\n" + "* `CollapsingMergeTree` - MergeTree with row versioning for updates\n" + "* `VersionedCollapsingMergeTree` - CollapsingMergeTree with version tracking\n" + "* `GraphiteMergeTree` - MergeTree optimized for Graphite data\n" + "* All other MergeTree family engines that inherit from `MergeTreeData`\n" + "\n" + ), + link=None, + level=2, + num="4.1", +) + 
+RQ_ClickHouse_ExportPart_SourcePartStorage = Requirement( + name="RQ.ClickHouse.ExportPart.SourcePartStorage", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting data parts regardless of the underlying storage type where the source parts are stored, including:\n" + "* **Local Disks**: Parts stored on local filesystem\n" + "* **S3/Object Storage**: Parts stored on S3 or S3-compatible object storage\n" + "* **Encrypted Disks**: Parts stored on encrypted disks (disk-level encryption)\n" + "* **Cached Disks**: Parts stored with filesystem cache enabled\n" + "* **Remote Disks**: Parts stored on HDFS, Azure Blob Storage, or Google Cloud Storage\n" + "* **Tiered Storage**: Parts stored across multiple storage tiers (hot/cold)\n" + "* **Zero-Copy Replication Disks**: Parts stored with zero-copy replication enabled\n" + "\n" + ), + link=None, + level=2, + num="5.1", +) + +RQ_ClickHouse_ExportPart_DestinationEngines = Requirement( + name="RQ.ClickHouse.ExportPart.DestinationEngines", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting to destination tables that:\n" + "* Support object storage engines including:\n" + " * `S3` - Amazon S3 and S3-compatible storage\n" + " * `StorageObjectStorage` - Generic object storage interface\n" + " * `HDFS` - Hadoop Distributed File System (with Hive partitioning)\n" + " * `Azure` - Microsoft Azure Blob Storage (with Hive partitioning)\n" + " * `GCS` - Google Cloud Storage (with Hive partitioning)\n" + "* Implement the `supportsImport()` method and return `true`\n" + "\n" + ), + link=None, + level=2, + num="6.1", +) + +RQ_ClickHouse_ExportPart_DestinationSetup = Requirement( + name="RQ.ClickHouse.ExportPart.DestinationSetup", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL handle destination setup and file management by:\n" + "* Creating appropriate import sinks for destination storage systems\n" + "* Generating unique file names in the format `{part_name}_{checksum_hex}` to avoid conflicts\n" + "* Allowing destination storage to determine the final file path based on Hive partitioning\n" + "* Creating files in the destination storage that users can observe and access\n" + "* Providing the final destination file path in the `system.exports` table for monitoring\n" + "\n" + ), + link=None, + level=2, + num="7.1", +) + +RQ_ClickHouse_ExportPart_DataPreparation = Requirement( + name="RQ.ClickHouse.ExportPart.DataPreparation", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL prepare data for export by:\n" + "* Automatically selecting all physical columns from source table metadata\n" + "* Extracting partition key values for proper Hive partitioning in destination\n" + "\n" + ), + link=None, + level=2, + num="8.1", +) + +RQ_ClickHouse_ExportPart_SchemaCompatibility = Requirement( + name="RQ.ClickHouse.ExportPart.SchemaCompatibility", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL require source and destination tables to have compatible schemas for successful export operations:\n" + "* Identical physical column schemas between source and destination\n" + "* The same partition key expression in both tables\n" + "* Compatible data types for all columns\n" + "* Matching column order and names\n" + "\n" + ), + link=None, + level=2, + 
num="9.1", +) + +RQ_ClickHouse_ExportPart_PartitionKeyTypes = Requirement( + name="RQ.ClickHouse.ExportPart.PartitionKeyTypes", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support export operations for tables with partition key types that are compatible with Hive partitioning, as shown in the following table:\n" + "\n" + "| Partition Key Type | Supported | Examples | Notes |\n" + "|-------------------|------------|----------|-------|\n" + "| **Integer Types** | ✅ Yes | `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64` | All integer types supported |\n" + "| **Date/DateTime Types** | ✅ Yes | `Date`, `DateTime`, `DateTime64` | All date/time types supported |\n" + "| **String Types** | ✅ Yes | `String`, `FixedString` | All string types supported |\n" + "| **Date Functions** | ✅ Yes | `toYYYYMM(date_col)`, `toMonday(date_col)`, `toYear(date_col)` | Result in supported types |\n" + "| **Mathematical Expressions** | ✅ Yes | `column1 + column2`, `column * 1000` | If result is supported type |\n" + "| **String Functions** | ✅ Yes | `substring(column, 1, 4)` | Result in String type |\n" + "| **Tuple Expressions** | ✅ Yes | `(toMonday(StartDate), EventType)` | If all elements are supported types |\n" + "| **No Partition Key** | ✅ Yes | Tables without `PARTITION BY` clause | Unpartitioned tables supported |\n" + "| **UUID Types** | ❌ No | `UUID` | Not supported by Hive partitioning |\n" + "| **Enum Types** | ❌ No | `Enum8`, `Enum16` | Not supported by Hive partitioning |\n" + "| **Floating-point Types** | ❌ No | `Float32`, `Float64` | Not supported by Hive partitioning |\n" + "| **Hash Functions** | ❌ No | `intHash32(column)`, `cityHash64(column)` | Result in unsupported types |\n" + "\n" + "[ClickHouse] SHALL automatically extract partition values from source parts and use them to create proper Hive partitioning structure in destination storage, but only for partition key types that are compatible with Hive partitioning requirements.\n" + "\n" + "[ClickHouse] SHALL require destination tables to support Hive partitioning, which limits the supported partition key types to Integer, Date/DateTime, and String types. 
Complex expressions that result in unsupported types are not supported for export operations.\n" + "\n" + ), + link=None, + level=2, + num="10.1", +) + +RQ_ClickHouse_ExportPart_PartTypes = Requirement( + name="RQ.ClickHouse.ExportPart.PartTypes", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support export operations for all valid MergeTree part types and their contents, including:\n" + "\n" + "| Part Type | Supported | Description | Special Features |\n" + "|-----------|------------|-------------|------------------|\n" + "| **Wide Parts** | ✅ Yes | Data of each column stored in separate files with marks | Standard format for most parts |\n" + "| **Compact Parts** | ✅ Yes | All column data stored in single file with single marks file | Optimized for small parts |\n" + "| **Regular Parts** | ✅ Yes | Standard data parts created by inserts, merges, mutations | Full data content |\n" + "| **Patch Parts** | ✅ Yes | Lightweight update parts containing only changed columns | Applied during export |\n" + "| **Active Parts** | ✅ Yes | Currently active data parts | Primary export target |\n" + "| **Outdated Parts** | ✅ Yes | Parts that have been replaced by newer versions | Can be exported for backup |\n" + "\n" + "[ClickHouse] SHALL handle all special columns and metadata present in parts during export:\n" + "\n" + "| Column Type | Supported | Description | Export Behavior |\n" + "|-------------|------------|-------------|-----------------|\n" + "| **Physical Columns** | ✅ Yes | User-defined table columns | All physical columns exported |\n" + "| **RowExistsColumn (_row_exists)** | ✅ Yes | Lightweight delete mask showing row existence | Exported to maintain delete state |\n" + "| **BlockNumberColumn (_block_number)** | ✅ Yes | Original block number from insert | Exported for row identification |\n" + "| **BlockOffsetColumn (_block_offset)** | ✅ Yes | Original row offset within block | Exported for row identification |\n" + "| **PartDataVersionColumn (_part_data_version)** | ✅ Yes | Data version for mutations | Exported for version tracking |\n" + "| **Virtual Columns** | ✅ Yes | Runtime columns like _part, _partition_id | Generated during export |\n" + "| **System Metadata** | ✅ Yes | Checksums, compression info, serialization | Preserved in export |\n" + "\n" + "[ClickHouse] SHALL handle all mutation and schema change information present in parts:\n" + "\n" + "| Mutation/Schema Type | Supported | Description | Export Behavior |\n" + "|---------------------|------------|-------------|-----------------|\n" + "| **Mutation Commands** | ✅ Yes | DELETE, UPDATE, MATERIALIZE_INDEX, DROP_COLUMN, RENAME_COLUMN | Applied during export |\n" + "| **Alter Conversions** | ✅ Yes | Column renames, type changes, schema modifications | Applied during export |\n" + "| **Patch Parts** | ✅ Yes | Lightweight updates with only changed columns | Applied during export |\n" + "| **Mutation Versions** | ✅ Yes | Version tracking for applied mutations | Preserved in export |\n" + "| **Schema Changes** | ✅ Yes | ALTER MODIFY, ALTER DROP, ALTER RENAME | Applied during export |\n" + "| **TTL Information** | ✅ Yes | Time-to-live settings and expiration data | Preserved in export |\n" + "| **Index Information** | ✅ Yes | Primary key, secondary indices, projections | Preserved in export |\n" + "| **Statistics** | ✅ Yes | Column statistics and sampling information | Preserved in export |\n" + "\n" + "[ClickHouse] SHALL automatically apply lightweight delete masks 
during export to ensure only non-deleted rows are exported, and SHALL handle all part metadata including checksums, compression information, serialization details, mutation history, schema changes, and structural modifications to maintain data integrity in the destination storage.\n" + "\n" + ), + link=None, + level=2, + num="11.1", +) + +RQ_ClickHouse_ExportPart_FailureHandling = Requirement( + name="RQ.ClickHouse.ExportPart.FailureHandling", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL handle export operation failures in the following ways:\n" + "* **Stateless Operation**: Export operations are stateless and ephemeral\n" + "* **No Recovery**: If an export fails, it fails completely with no retry mechanism\n" + "* **No State Persistence**: No export manifests or state are preserved across server restarts\n" + "* **Simple Failure**: Export operations either succeed completely or fail with an error message\n" + "* **No Partial Exports**: Failed exports leave no partial or corrupted data in destination storage\n" + "\n" + ), + link=None, + level=2, + num="12.1", +) + +RQ_ClickHouse_ExportPart_Restrictions_SameTable = Requirement( + name="RQ.ClickHouse.ExportPart.Restrictions.SameTable", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL prevent exporting parts to the same table as the source by:\n" + "* Validating that source and destination table identifiers are different\n" + '* Throwing a `BAD_ARGUMENTS` exception with message "Exporting to the same table is not allowed" when source and destination are identical\n' + "* Performing this validation before any export processing begins\n" + "\n" + ), + link=None, + level=3, + num="13.1.1", +) + +RQ_ClickHouse_ExportPart_Restrictions_DestinationSupport = Requirement( + name="RQ.ClickHouse.ExportPart.Restrictions.DestinationSupport", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL validate destination table compatibility by:\n" + "\n" + "* Checking that the destination storage supports importing MergeTree parts\n" + "* Verifying that the destination uses Hive partitioning strategy (`partition_strategy = 'hive'`)\n" + '* Throwing a `NOT_IMPLEMENTED` exception with message "Destination storage {} does not support MergeTree parts or uses unsupported partitioning" when requirements are not met\n' + "* Performing this validation during the initial export setup phase\n" + "\n" + ), + link=None, + level=3, + num="13.2.1", +) + +RQ_ClickHouse_ExportPart_Restrictions_SourcePart = Requirement( + name="RQ.ClickHouse.ExportPart.Restrictions.SourcePart", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL validate source part availability by:\n" + "\n" + "* Checking that the specified part exists in the source table\n" + "* Verifying the part is in an active state (not detached or missing)\n" + "* Throwing a `NO_SUCH_DATA_PART` exception with message \"No such data part '{}' to export in table '{}'\" when the part is not found\n" + "* Performing this validation before creating the export manifest\n" + "\n" + ), + link=None, + level=3, + num="13.3.1", +) + +RQ_ClickHouse_ExportPart_Concurrency = Requirement( + name="RQ.ClickHouse.ExportPart.Concurrency", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support concurrent export operations by:\n" + "\n" + 
"* Allowing multiple exports to run simultaneously without interference\n" + "* Processing export operations asynchronously in the background\n" + "* Preventing race conditions and data corruption during concurrent operations\n" + "* Supporting concurrent exports of different parts to different destinations\n" + "* Preventing concurrent exports of the same part to the same destination\n" + "* Maintaining separate progress tracking and state for each concurrent operation\n" + "* Ensuring thread safety across all concurrent export operations\n" + "\n" + ), + link=None, + level=2, + num="14.1", +) + +RQ_ClickHouse_ExportPart_Idempotency = Requirement( + name="RQ.ClickHouse.ExportPart.Idempotency", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL ensure export operations are idempotent by:\n" + "\n" + "* Allowing the same part to be exported multiple times safely without data corruption\n" + "* Supporting file overwrite control through the `export_merge_tree_part_overwrite_file_if_exists` setting\n" + "* Generating unique file names using part name and checksum to avoid conflicts\n" + "* Maintaining export state consistency across retries\n" + "\n" + ), + link=None, + level=2, + num="15.1", +) + +RQ_ClickHouse_ExportPart_ErrorRecovery_GracefulFailure = Requirement( + name="RQ.ClickHouse.ExportPart.ErrorRecovery.GracefulFailure", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL handle export failures gracefully by:\n" + "* Allowing users to retry failed export operations\n" + "* Maintaining system stability even when exports fail\n" + "* Not corrupting source data when export operations fail\n" + "* Continuing to process other export operations when one fails\n" + "\n" + ), + link=None, + level=3, + num="16.1.1", +) + +RQ_ClickHouse_ExportPart_ErrorRecovery_AutomaticCleanup = Requirement( + name="RQ.ClickHouse.ExportPart.ErrorRecovery.AutomaticCleanup", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL automatically clean up failed export operations by:\n" + "* Removing export manifests from the system when operations fail\n" + "* Cleaning up any partial data written to destination storage\n" + "* Releasing system resources (memory, file handles) used by failed exports\n" + "* Updating export status to reflect the failure state\n" + "* Allowing the system to recover and process other export operations\n" + "\n" + ), + link=None, + level=3, + num="16.2.1", +) + +RQ_ClickHouse_ExportPart_Logging = Requirement( + name="RQ.ClickHouse.ExportPart.Logging", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL provide detailed logging for export operations by:\n" + "* Logging all export operations (both successful and failed) with timestamps and details\n" + "* Recording the specific part name and destination for all operations\n" + "* Including execution time and progress information for all operations\n" + "* Writing operation information to the `system.part_log` table with the following columns:\n" + " * `event_type` - Set to `EXPORT_PART` for export operations\n" + " * `event_time` - Timestamp when the export operation occurred\n" + " * `table` - Source table name\n" + " * `part_name` - Name of the part being exported\n" + " * `path_on_disk` - Path to the part in source storage\n" + " * `duration_ms` - Execution time in milliseconds\n" + " * `error` - Error 
message if the export failed (empty for successful exports)\n" + " * `thread_id` - Thread ID performing the export\n" + "* Providing sufficient detail for monitoring and troubleshooting export operations\n" + "\n" + ), + link=None, + level=2, + num="17.1", +) + +RQ_ClickHouse_ExportPart_SystemTables_Exports = Requirement( + name="RQ.ClickHouse.ExportPart.SystemTables.Exports", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL provide a `system.exports` table that allows users to monitor active and completed export operations, track progress metrics, performance statistics, and troubleshoot export issues with the following columns:\n" + "\n" + "* `source_database`, `source_table` - source table identifiers\n" + "* `destination_database`, `destination_table` - destination table identifiers \n" + "* `create_time` - when export was submitted\n" + "* `part_name` - name of the exported part\n" + "* `destination_file_path` - path in destination storage\n" + "* `elapsed` - execution time in seconds\n" + "* `rows_read`, `total_rows_to_read` - progress metrics\n" + "* `total_size_bytes_compressed`, `total_size_bytes_uncompressed` - size metrics\n" + "* `bytes_read_uncompressed` - bytes processed\n" + "* `memory_usage`, `peak_memory_usage` - memory consumption\n" + "\n" + ), + link=None, + level=2, + num="18.1", +) + +RQ_ClickHouse_ExportPart_Settings_AllowExperimental = Requirement( + name="RQ.ClickHouse.ExportPart.Settings.AllowExperimental", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support the `allow_experimental_export_merge_tree_part` setting that SHALL gate the experimental export part functionality, which SHALL be set to `1` to enable `ALTER TABLE ... EXPORT PART ...` commands. The default value SHALL be `0` (turned off).\n" + "\n" + ), + link=None, + level=2, + num="19.1", +) + +RQ_ClickHouse_ExportPart_Settings_OverwriteFile = Requirement( + name="RQ.ClickHouse.ExportPart.Settings.OverwriteFile", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support the `export_merge_tree_part_overwrite_file_if_exists` setting that controls whether to overwrite files if they already exist when exporting a merge tree part. 
The default value SHALL be `0` (turned off).\n" + "\n" + ), + link=None, + level=2, + num="20.1", +) + +RQ_ClickHouse_ExportPart_ParallelFormatting = Requirement( + name="RQ.ClickHouse.ExportPart.ParallelFormatting", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support parallel formatting for export operations by:\n" + "* Automatically enabling parallel formatting for large export operations to improve performance\n" + "* Using the `output_format_parallel_formatting` setting to control parallel formatting behavior\n" + "* Optimizing data processing based on export size and system resources\n" + "* Providing consistent formatting performance across different export scenarios\n" + "\n" + ), + link=None, + level=2, + num="21.1", +) + +RQ_ClickHouse_ExportPart_ServerSettings_MaxBandwidth = Requirement( + name="RQ.ClickHouse.ExportPart.ServerSettings.MaxBandwidth", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support the `max_exports_bandwidth_for_server` server setting to limit the maximum read speed of all exports on the server in bytes per second, with `0` meaning unlimited bandwidth. The default value SHALL be `0`. This is a server-level setting configured in the server configuration file.\n" + "\n" + ), + link=None, + level=2, + num="22.1", +) + +RQ_ClickHouse_ExportPart_Events = Requirement( + name="RQ.ClickHouse.ExportPart.Events", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL provide the following export-related events in the `system.events` table:\n" + "* `PartsExports` - Number of successful part exports\n" + "* `PartsExportFailures` - Number of failed part exports \n" + "* `PartsExportDuplicated` - Number of part exports that failed because target already exists\n" + "* `PartsExportTotalMilliseconds` - Total time spent on part export operations in milliseconds\n" + "* `ExportsThrottlerBytes` - Bytes passed through the exports throttler\n" + "* `ExportsThrottlerSleepMicroseconds` - Total time queries were sleeping to conform to export bandwidth throttling\n" + "\n" + ), + link=None, + level=2, + num="23.1", +) + +RQ_ClickHouse_ExportPart_Metrics_Export = Requirement( + name="RQ.ClickHouse.ExportPart.Metrics.Export", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL provide the `Export` current metric in `system.metrics` table that tracks the number of currently executing exports.\n" + "\n" + ), + link=None, + level=2, + num="23.2", +) + +RQ_ClickHouse_ExportPart_Security = Requirement( + name="RQ.ClickHouse.ExportPart.Security", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL enforce security requirements for export operations:\n" + "* **RBAC**: Users must have the following privileges:\n" + " * **Source Table**: `SELECT` privilege on the source table to read data parts\n" + " * **Destination Table**: `INSERT` privilege on the destination table to write exported data\n" + " * **Database Access**: `SHOW` privilege on both source and destination databases\n" + " * **System Tables**: `SELECT` privilege on `system.tables` to validate table existence\n" + "* **Data Encryption**: All data in transit to destination storage must be encrypted using TLS/SSL\n" + "* **Network Security**: Export operations must use secure connections to destination storage (HTTPS for S3, secure 
protocols for other storage)\n" + "* **Credential Management**: Export operations must use secure credential storage and avoid exposing credentials in logs\n" + "\n" + "\n" + "[ClickHouse]: https://clickhouse.com\n" + ), + link=None, + level=2, + num="24.1", +) + +SRS_015_ClickHouse_Export_Part_to_S3 = Specification( + name="SRS-015 ClickHouse Export Part to S3", + description=None, + author=None, + date=None, + status=None, + approved_by=None, + approved_date=None, + approved_version=None, + version=None, + group=None, + type=None, + link=None, + uid=None, + parent=None, + children=None, + headings=( + Heading(name="Introduction", level=1, num="1"), + Heading(name="Exporting Parts to S3", level=1, num="2"), + Heading(name="RQ.ClickHouse.ExportPart.S3", level=2, num="2.1"), + Heading(name="SQL command support", level=1, num="3"), + Heading(name="RQ.ClickHouse.ExportPart.SQLCommand", level=2, num="3.1"), + Heading(name="Supported source table engines", level=1, num="4"), + Heading(name="RQ.ClickHouse.ExportPart.SourceEngines", level=2, num="4.1"), + Heading(name="Supported source part storage types", level=1, num="5"), + Heading(name="RQ.ClickHouse.ExportPart.SourcePartStorage", level=2, num="5.1"), + Heading(name="Supported destination table engines", level=1, num="6"), + Heading(name="RQ.ClickHouse.ExportPart.DestinationEngines", level=2, num="6.1"), + Heading(name="Destination setup and file management", level=1, num="7"), + Heading(name="RQ.ClickHouse.ExportPart.DestinationSetup", level=2, num="7.1"), + Heading(name="Export data preparation", level=1, num="8"), + Heading(name="RQ.ClickHouse.ExportPart.DataPreparation", level=2, num="8.1"), + Heading(name="Schema compatibility", level=1, num="9"), + Heading( + name="RQ.ClickHouse.ExportPart.SchemaCompatibility", level=2, num="9.1" + ), + Heading(name="Partition key types support", level=1, num="10"), + Heading(name="RQ.ClickHouse.ExportPart.PartitionKeyTypes", level=2, num="10.1"), + Heading(name="Part types and content support", level=1, num="11"), + Heading(name="RQ.ClickHouse.ExportPart.PartTypes", level=2, num="11.1"), + Heading(name="Export operation failure handling", level=1, num="12"), + Heading(name="RQ.ClickHouse.ExportPart.FailureHandling", level=2, num="12.1"), + Heading(name="Export operation restrictions", level=1, num="13"), + Heading(name="Preventing same table exports", level=2, num="13.1"), + Heading( + name="RQ.ClickHouse.ExportPart.Restrictions.SameTable", + level=3, + num="13.1.1", + ), + Heading(name="Destination table compatibility", level=2, num="13.2"), + Heading( + name="RQ.ClickHouse.ExportPart.Restrictions.DestinationSupport", + level=3, + num="13.2.1", + ), + Heading(name="Source part availability", level=2, num="13.3"), + Heading( + name="RQ.ClickHouse.ExportPart.Restrictions.SourcePart", + level=3, + num="13.3.1", + ), + Heading(name="Export operation concurrency", level=1, num="14"), + Heading(name="RQ.ClickHouse.ExportPart.Concurrency", level=2, num="14.1"), + Heading(name="Export operation idempotency", level=1, num="15"), + Heading(name="RQ.ClickHouse.ExportPart.Idempotency", level=2, num="15.1"), + Heading(name="Export operation error recovery", level=1, num="16"), + Heading(name="Graceful failure handling", level=2, num="16.1"), + Heading( + name="RQ.ClickHouse.ExportPart.ErrorRecovery.GracefulFailure", + level=3, + num="16.1.1", + ), + Heading(name="Automatic cleanup on failure", level=2, num="16.2"), + Heading( + name="RQ.ClickHouse.ExportPart.ErrorRecovery.AutomaticCleanup", + level=3, + 
num="16.2.1", + ), + Heading(name="Export operation logging", level=1, num="17"), + Heading(name="RQ.ClickHouse.ExportPart.Logging", level=2, num="17.1"), + Heading(name="Monitoring export operations", level=1, num="18"), + Heading( + name="RQ.ClickHouse.ExportPart.SystemTables.Exports", level=2, num="18.1" + ), + Heading(name="Enabling export functionality", level=1, num="19"), + Heading( + name="RQ.ClickHouse.ExportPart.Settings.AllowExperimental", + level=2, + num="19.1", + ), + Heading(name="Handling file conflicts during export", level=1, num="20"), + Heading( + name="RQ.ClickHouse.ExportPart.Settings.OverwriteFile", level=2, num="20.1" + ), + Heading(name="Export operation configuration", level=1, num="21"), + Heading( + name="RQ.ClickHouse.ExportPart.ParallelFormatting", level=2, num="21.1" + ), + Heading(name="Controlling export performance", level=1, num="22"), + Heading( + name="RQ.ClickHouse.ExportPart.ServerSettings.MaxBandwidth", + level=2, + num="22.1", + ), + Heading(name="Monitoring export performance metrics", level=1, num="23"), + Heading(name="RQ.ClickHouse.ExportPart.Events", level=2, num="23.1"), + Heading(name="RQ.ClickHouse.ExportPart.Metrics.Export", level=2, num="23.2"), + Heading(name="Export operation security", level=1, num="24"), + Heading(name="RQ.ClickHouse.ExportPart.Security", level=2, num="24.1"), + ), + requirements=( + RQ_ClickHouse_ExportPart_S3, + RQ_ClickHouse_ExportPart_SQLCommand, + RQ_ClickHouse_ExportPart_SourceEngines, + RQ_ClickHouse_ExportPart_SourcePartStorage, + RQ_ClickHouse_ExportPart_DestinationEngines, + RQ_ClickHouse_ExportPart_DestinationSetup, + RQ_ClickHouse_ExportPart_DataPreparation, + RQ_ClickHouse_ExportPart_SchemaCompatibility, + RQ_ClickHouse_ExportPart_PartitionKeyTypes, + RQ_ClickHouse_ExportPart_PartTypes, + RQ_ClickHouse_ExportPart_FailureHandling, + RQ_ClickHouse_ExportPart_Restrictions_SameTable, + RQ_ClickHouse_ExportPart_Restrictions_DestinationSupport, + RQ_ClickHouse_ExportPart_Restrictions_SourcePart, + RQ_ClickHouse_ExportPart_Concurrency, + RQ_ClickHouse_ExportPart_Idempotency, + RQ_ClickHouse_ExportPart_ErrorRecovery_GracefulFailure, + RQ_ClickHouse_ExportPart_ErrorRecovery_AutomaticCleanup, + RQ_ClickHouse_ExportPart_Logging, + RQ_ClickHouse_ExportPart_SystemTables_Exports, + RQ_ClickHouse_ExportPart_Settings_AllowExperimental, + RQ_ClickHouse_ExportPart_Settings_OverwriteFile, + RQ_ClickHouse_ExportPart_ParallelFormatting, + RQ_ClickHouse_ExportPart_ServerSettings_MaxBandwidth, + RQ_ClickHouse_ExportPart_Events, + RQ_ClickHouse_ExportPart_Metrics_Export, + RQ_ClickHouse_ExportPart_Security, + ), + content=r""" +# SRS-015 ClickHouse Export Part to S3 +# Software Requirements Specification + +## Table of Contents + +* 1 [Introduction](#introduction) +* 2 [Exporting Parts to S3](#exporting-parts-to-s3) + * 2.1 [RQ.ClickHouse.ExportPart.S3](#rqclickhouseexportparts3) +* 3 [SQL command support](#sql-command-support) + * 3.1 [RQ.ClickHouse.ExportPart.SQLCommand](#rqclickhouseexportpartsqlcommand) +* 4 [Supported source table engines](#supported-source-table-engines) + * 4.1 [RQ.ClickHouse.ExportPart.SourceEngines](#rqclickhouseexportpartsourceengines) +* 5 [Supported source part storage types](#supported-source-part-storage-types) + * 5.1 [RQ.ClickHouse.ExportPart.SourcePartStorage](#rqclickhouseexportpartsourcepartstorage) +* 6 [Supported destination table engines](#supported-destination-table-engines) + * 6.1 [RQ.ClickHouse.ExportPart.DestinationEngines](#rqclickhouseexportpartdestinationengines) +* 7 [Destination 
setup and file management](#destination-setup-and-file-management) + * 7.1 [RQ.ClickHouse.ExportPart.DestinationSetup](#rqclickhouseexportpartdestinationsetup) +* 8 [Export data preparation](#export-data-preparation) + * 8.1 [RQ.ClickHouse.ExportPart.DataPreparation](#rqclickhouseexportpartdatapreparation) +* 9 [Schema compatibility](#schema-compatibility) + * 9.1 [RQ.ClickHouse.ExportPart.SchemaCompatibility](#rqclickhouseexportpartschemacompatibility) +* 10 [Partition key types support](#partition-key-types-support) + * 10.1 [RQ.ClickHouse.ExportPart.PartitionKeyTypes](#rqclickhouseexportpartpartitionkeytypes) +* 11 [Part types and content support](#part-types-and-content-support) + * 11.1 [RQ.ClickHouse.ExportPart.PartTypes](#rqclickhouseexportpartparttypes) +* 12 [Export operation failure handling](#export-operation-failure-handling) + * 12.1 [RQ.ClickHouse.ExportPart.FailureHandling](#rqclickhouseexportpartfailurehandling) +* 13 [Export operation restrictions](#export-operation-restrictions) + * 13.1 [Preventing same table exports](#preventing-same-table-exports) + * 13.1.1 [RQ.ClickHouse.ExportPart.Restrictions.SameTable](#rqclickhouseexportpartrestrictionssametable) + * 13.2 [Destination table compatibility](#destination-table-compatibility) + * 13.2.1 [RQ.ClickHouse.ExportPart.Restrictions.DestinationSupport](#rqclickhouseexportpartrestrictionsdestinationsupport) + * 13.3 [Source part availability](#source-part-availability) + * 13.3.1 [RQ.ClickHouse.ExportPart.Restrictions.SourcePart](#rqclickhouseexportpartrestrictionssourcepart) +* 14 [Export operation concurrency](#export-operation-concurrency) + * 14.1 [RQ.ClickHouse.ExportPart.Concurrency](#rqclickhouseexportpartconcurrency) +* 15 [Export operation idempotency](#export-operation-idempotency) + * 15.1 [RQ.ClickHouse.ExportPart.Idempotency](#rqclickhouseexportpartidempotency) +* 16 [Export operation error recovery](#export-operation-error-recovery) + * 16.1 [Graceful failure handling](#graceful-failure-handling) + * 16.1.1 [RQ.ClickHouse.ExportPart.ErrorRecovery.GracefulFailure](#rqclickhouseexportparterrorrecoverygracefulfailure) + * 16.2 [Automatic cleanup on failure](#automatic-cleanup-on-failure) + * 16.2.1 [RQ.ClickHouse.ExportPart.ErrorRecovery.AutomaticCleanup](#rqclickhouseexportparterrorrecoveryautomaticcleanup) +* 17 [Export operation logging](#export-operation-logging) + * 17.1 [RQ.ClickHouse.ExportPart.Logging](#rqclickhouseexportpartlogging) +* 18 [Monitoring export operations](#monitoring-export-operations) + * 18.1 [RQ.ClickHouse.ExportPart.SystemTables.Exports](#rqclickhouseexportpartsystemtablesexports) +* 19 [Enabling export functionality](#enabling-export-functionality) + * 19.1 [RQ.ClickHouse.ExportPart.Settings.AllowExperimental](#rqclickhouseexportpartsettingsallowexperimental) +* 20 [Handling file conflicts during export](#handling-file-conflicts-during-export) + * 20.1 [RQ.ClickHouse.ExportPart.Settings.OverwriteFile](#rqclickhouseexportpartsettingsoverwritefile) +* 21 [Export operation configuration](#export-operation-configuration) + * 21.1 [RQ.ClickHouse.ExportPart.ParallelFormatting](#rqclickhouseexportpartparallelformatting) +* 22 [Controlling export performance](#controlling-export-performance) + * 22.1 [RQ.ClickHouse.ExportPart.ServerSettings.MaxBandwidth](#rqclickhouseexportpartserversettingsmaxbandwidth) +* 23 [Monitoring export performance metrics](#monitoring-export-performance-metrics) + * 23.1 [RQ.ClickHouse.ExportPart.Events](#rqclickhouseexportpartevents) + * 23.2 
[RQ.ClickHouse.ExportPart.Metrics.Export](#rqclickhouseexportpartmetricsexport) +* 24 [Export operation security](#export-operation-security) + * 24.1 [RQ.ClickHouse.ExportPart.Security](#rqclickhouseexportpartsecurity) + +## Introduction + +This specification defines requirements for exporting individual MergeTree data parts to S3-compatible object storage. + +## Exporting Parts to S3 + +### RQ.ClickHouse.ExportPart.S3 +version: 1.0 + +[ClickHouse] SHALL support exporting data parts from MergeTree engine tables to S3 object storage. + +## SQL command support + +### RQ.ClickHouse.ExportPart.SQLCommand +version: 1.0 + +[ClickHouse] SHALL support the following SQL command syntax for exporting MergeTree data parts to object storage tables: + +```sql +ALTER TABLE [database.]source_table_name +EXPORT PART 'part_name' +TO TABLE [database.]destination_table_name +``` + +**Parameters:** +- `source_table_name`: Name of the source MergeTree table +- `part_name`: Name of the specific part to export (string literal) +- `destination_table_name`: Name of the destination object storage table + +## Supported source table engines + +### RQ.ClickHouse.ExportPart.SourceEngines +version: 1.0 + +[ClickHouse] SHALL support exporting from the following source table engines: +* `MergeTree` - Base MergeTree engine +* `ReplicatedMergeTree` - Replicated MergeTree engine with ZooKeeper coordination +* `SummingMergeTree` - MergeTree with automatic summation of numeric columns +* `AggregatingMergeTree` - MergeTree with pre-aggregated data +* `CollapsingMergeTree` - MergeTree with row versioning for updates +* `VersionedCollapsingMergeTree` - CollapsingMergeTree with version tracking +* `GraphiteMergeTree` - MergeTree optimized for Graphite data +* All other MergeTree family engines that inherit from `MergeTreeData` + +## Supported source part storage types + +### RQ.ClickHouse.ExportPart.SourcePartStorage +version: 1.0 + +[ClickHouse] SHALL support exporting data parts regardless of the underlying storage type where the source parts are stored, including: +* **Local Disks**: Parts stored on local filesystem +* **S3/Object Storage**: Parts stored on S3 or S3-compatible object storage +* **Encrypted Disks**: Parts stored on encrypted disks (disk-level encryption) +* **Cached Disks**: Parts stored with filesystem cache enabled +* **Remote Disks**: Parts stored on HDFS, Azure Blob Storage, or Google Cloud Storage +* **Tiered Storage**: Parts stored across multiple storage tiers (hot/cold) +* **Zero-Copy Replication Disks**: Parts stored with zero-copy replication enabled + +## Supported destination table engines + +### RQ.ClickHouse.ExportPart.DestinationEngines +version: 1.0 + +[ClickHouse] SHALL support exporting to destination tables that: +* Support object storage engines including: + * `S3` - Amazon S3 and S3-compatible storage + * `StorageObjectStorage` - Generic object storage interface + * `HDFS` - Hadoop Distributed File System (with Hive partitioning) + * `Azure` - Microsoft Azure Blob Storage (with Hive partitioning) + * `GCS` - Google Cloud Storage (with Hive partitioning) +* Implement the `supportsImport()` method and return `true` + +## Destination setup and file management + +### RQ.ClickHouse.ExportPart.DestinationSetup +version: 1.0 + +[ClickHouse] SHALL handle destination setup and file management by: +* Creating appropriate import sinks for destination storage systems +* Generating unique file names in the format `{part_name}_{checksum_hex}` to avoid conflicts +* Allowing destination storage to determine 
the final file path based on Hive partitioning +* Creating files in the destination storage that users can observe and access +* Providing the final destination file path in the `system.exports` table for monitoring + +## Export data preparation + +### RQ.ClickHouse.ExportPart.DataPreparation +version: 1.0 + +[ClickHouse] SHALL prepare data for export by: +* Automatically selecting all physical columns from source table metadata +* Extracting partition key values for proper Hive partitioning in destination + +## Schema compatibility + +### RQ.ClickHouse.ExportPart.SchemaCompatibility +version: 1.0 + +[ClickHouse] SHALL require source and destination tables to have compatible schemas for successful export operations: +* Identical physical column schemas between source and destination +* The same partition key expression in both tables +* Compatible data types for all columns +* Matching column order and names + +## Partition key types support + +### RQ.ClickHouse.ExportPart.PartitionKeyTypes +version: 1.0 + +[ClickHouse] SHALL support export operations for tables with partition key types that are compatible with Hive partitioning, as shown in the following table: + +| Partition Key Type | Supported | Examples | Notes | +|-------------------|------------|----------|-------| +| **Integer Types** | ✅ Yes | `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64` | All integer types supported | +| **Date/DateTime Types** | ✅ Yes | `Date`, `DateTime`, `DateTime64` | All date/time types supported | +| **String Types** | ✅ Yes | `String`, `FixedString` | All string types supported | +| **Date Functions** | ✅ Yes | `toYYYYMM(date_col)`, `toMonday(date_col)`, `toYear(date_col)` | Result in supported types | +| **Mathematical Expressions** | ✅ Yes | `column1 + column2`, `column * 1000` | If result is supported type | +| **String Functions** | ✅ Yes | `substring(column, 1, 4)` | Result in String type | +| **Tuple Expressions** | ✅ Yes | `(toMonday(StartDate), EventType)` | If all elements are supported types | +| **No Partition Key** | ✅ Yes | Tables without `PARTITION BY` clause | Unpartitioned tables supported | +| **UUID Types** | ❌ No | `UUID` | Not supported by Hive partitioning | +| **Enum Types** | ❌ No | `Enum8`, `Enum16` | Not supported by Hive partitioning | +| **Floating-point Types** | ❌ No | `Float32`, `Float64` | Not supported by Hive partitioning | +| **Hash Functions** | ❌ No | `intHash32(column)`, `cityHash64(column)` | Result in unsupported types | + +[ClickHouse] SHALL automatically extract partition values from source parts and use them to create proper Hive partitioning structure in destination storage, but only for partition key types that are compatible with Hive partitioning requirements. + +[ClickHouse] SHALL require destination tables to support Hive partitioning, which limits the supported partition key types to Integer, Date/DateTime, and String types. Complex expressions that result in unsupported types are not supported for export operations. 
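+
+For illustration only, a sketch of a source and destination pair whose shared partition key is a plain `Date` column, one of the Hive-compatible types listed above. The table names, bucket URL, and credentials are placeholders, and the exact `S3` engine arguments depend on the deployment:
+
+```sql
+-- placeholder names and credentials, shown only to illustrate a Hive-compatible partition key
+CREATE TABLE events_source
+(
+    EventDate Date,
+    UserID UInt64
+)
+ENGINE = MergeTree
+PARTITION BY EventDate
+ORDER BY tuple();
+
+CREATE TABLE events_destination
+(
+    EventDate Date,
+    UserID UInt64
+)
+ENGINE = S3(
+    'https://storage.example.com/bucket/events/',
+    'access_key_id',
+    'secret_access_key',
+    format = 'Parquet',
+    compression = 'auto',
+    partition_strategy = 'hive'
+)
+PARTITION BY EventDate;
+```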
+ +## Part types and content support + +### RQ.ClickHouse.ExportPart.PartTypes +version: 1.0 + +[ClickHouse] SHALL support export operations for all valid MergeTree part types and their contents, including: + +| Part Type | Supported | Description | Special Features | +|-----------|------------|-------------|------------------| +| **Wide Parts** | ✅ Yes | Data of each column stored in separate files with marks | Standard format for most parts | +| **Compact Parts** | ✅ Yes | All column data stored in single file with single marks file | Optimized for small parts | +| **Regular Parts** | ✅ Yes | Standard data parts created by inserts, merges, mutations | Full data content | +| **Patch Parts** | ✅ Yes | Lightweight update parts containing only changed columns | Applied during export | +| **Active Parts** | ✅ Yes | Currently active data parts | Primary export target | +| **Outdated Parts** | ✅ Yes | Parts that have been replaced by newer versions | Can be exported for backup | + +[ClickHouse] SHALL handle all special columns and metadata present in parts during export: + +| Column Type | Supported | Description | Export Behavior | +|-------------|------------|-------------|-----------------| +| **Physical Columns** | ✅ Yes | User-defined table columns | All physical columns exported | +| **RowExistsColumn (_row_exists)** | ✅ Yes | Lightweight delete mask showing row existence | Exported to maintain delete state | +| **BlockNumberColumn (_block_number)** | ✅ Yes | Original block number from insert | Exported for row identification | +| **BlockOffsetColumn (_block_offset)** | ✅ Yes | Original row offset within block | Exported for row identification | +| **PartDataVersionColumn (_part_data_version)** | ✅ Yes | Data version for mutations | Exported for version tracking | +| **Virtual Columns** | ✅ Yes | Runtime columns like _part, _partition_id | Generated during export | +| **System Metadata** | ✅ Yes | Checksums, compression info, serialization | Preserved in export | + +[ClickHouse] SHALL handle all mutation and schema change information present in parts: + +| Mutation/Schema Type | Supported | Description | Export Behavior | +|---------------------|------------|-------------|-----------------| +| **Mutation Commands** | ✅ Yes | DELETE, UPDATE, MATERIALIZE_INDEX, DROP_COLUMN, RENAME_COLUMN | Applied during export | +| **Alter Conversions** | ✅ Yes | Column renames, type changes, schema modifications | Applied during export | +| **Patch Parts** | ✅ Yes | Lightweight updates with only changed columns | Applied during export | +| **Mutation Versions** | ✅ Yes | Version tracking for applied mutations | Preserved in export | +| **Schema Changes** | ✅ Yes | ALTER MODIFY, ALTER DROP, ALTER RENAME | Applied during export | +| **TTL Information** | ✅ Yes | Time-to-live settings and expiration data | Preserved in export | +| **Index Information** | ✅ Yes | Primary key, secondary indices, projections | Preserved in export | +| **Statistics** | ✅ Yes | Column statistics and sampling information | Preserved in export | + +[ClickHouse] SHALL automatically apply lightweight delete masks during export to ensure only non-deleted rows are exported, and SHALL handle all part metadata including checksums, compression information, serialization details, mutation history, schema changes, and structural modifications to maintain data integrity in the destination storage. 
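+
+As an illustration rather than a requirement, the candidate part names and their on-disk format can be inspected through `system.parts` before issuing an export; `events_source` is a placeholder table name:
+
+```sql
+-- 'events_source' is a placeholder; part_type reports Wide or Compact
+SELECT name, part_type, active
+FROM system.parts
+WHERE table = 'events_source' AND active;
+```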
+ +## Export operation failure handling + +### RQ.ClickHouse.ExportPart.FailureHandling +version: 1.0 + +[ClickHouse] SHALL handle export operation failures in the following ways: +* **Stateless Operation**: Export operations are stateless and ephemeral +* **No Recovery**: If an export fails, it fails completely with no retry mechanism +* **No State Persistence**: No export manifests or state are preserved across server restarts +* **Simple Failure**: Export operations either succeed completely or fail with an error message +* **No Partial Exports**: Failed exports leave no partial or corrupted data in destination storage + +## Export operation restrictions + +### Preventing same table exports + +#### RQ.ClickHouse.ExportPart.Restrictions.SameTable +version: 1.0 + +[ClickHouse] SHALL prevent exporting parts to the same table as the source by: +* Validating that source and destination table identifiers are different +* Throwing a `BAD_ARGUMENTS` exception with message "Exporting to the same table is not allowed" when source and destination are identical +* Performing this validation before any export processing begins + +### Destination table compatibility + +#### RQ.ClickHouse.ExportPart.Restrictions.DestinationSupport +version: 1.0 + +[ClickHouse] SHALL validate destination table compatibility by: + +* Checking that the destination storage supports importing MergeTree parts +* Verifying that the destination uses Hive partitioning strategy (`partition_strategy = 'hive'`) +* Throwing a `NOT_IMPLEMENTED` exception with message "Destination storage {} does not support MergeTree parts or uses unsupported partitioning" when requirements are not met +* Performing this validation during the initial export setup phase + +### Source part availability + +#### RQ.ClickHouse.ExportPart.Restrictions.SourcePart +version: 1.0 + +[ClickHouse] SHALL validate source part availability by: + +* Checking that the specified part exists in the source table +* Verifying the part is in an active state (not detached or missing) +* Throwing a `NO_SUCH_DATA_PART` exception with message "No such data part '{}' to export in table '{}'" when the part is not found +* Performing this validation before creating the export manifest + +## Export operation concurrency + +### RQ.ClickHouse.ExportPart.Concurrency +version: 1.0 + +[ClickHouse] SHALL support concurrent export operations by: + +* Allowing multiple exports to run simultaneously without interference +* Processing export operations asynchronously in the background +* Preventing race conditions and data corruption during concurrent operations +* Supporting concurrent exports of different parts to different destinations +* Preventing concurrent exports of the same part to the same destination +* Maintaining separate progress tracking and state for each concurrent operation +* Ensuring thread safety across all concurrent export operations + +## Export operation idempotency + +### RQ.ClickHouse.ExportPart.Idempotency +version: 1.0 + +[ClickHouse] SHALL ensure export operations are idempotent by: + +* Allowing the same part to be exported multiple times safely without data corruption +* Supporting file overwrite control through the `export_merge_tree_part_overwrite_file_if_exists` setting +* Generating unique file names using part name and checksum to avoid conflicts +* Maintaining export state consistency across retries + +## Export operation error recovery + +### Graceful failure handling + +#### RQ.ClickHouse.ExportPart.ErrorRecovery.GracefulFailure +version: 1.0 + 
+[ClickHouse] SHALL handle export failures gracefully by: +* Allowing users to retry failed export operations +* Maintaining system stability even when exports fail +* Not corrupting source data when export operations fail +* Continuing to process other export operations when one fails + +### Automatic cleanup on failure + +#### RQ.ClickHouse.ExportPart.ErrorRecovery.AutomaticCleanup +version: 1.0 + +[ClickHouse] SHALL automatically clean up failed export operations by: +* Removing export manifests from the system when operations fail +* Cleaning up any partial data written to destination storage +* Releasing system resources (memory, file handles) used by failed exports +* Updating export status to reflect the failure state +* Allowing the system to recover and process other export operations + +## Export operation logging + +### RQ.ClickHouse.ExportPart.Logging +version: 1.0 + +[ClickHouse] SHALL provide detailed logging for export operations by: +* Logging all export operations (both successful and failed) with timestamps and details +* Recording the specific part name and destination for all operations +* Including execution time and progress information for all operations +* Writing operation information to the `system.part_log` table with the following columns: + * `event_type` - Set to `EXPORT_PART` for export operations + * `event_time` - Timestamp when the export operation occurred + * `table` - Source table name + * `part_name` - Name of the part being exported + * `path_on_disk` - Path to the part in source storage + * `duration_ms` - Execution time in milliseconds + * `error` - Error message if the export failed (empty for successful exports) + * `thread_id` - Thread ID performing the export +* Providing sufficient detail for monitoring and troubleshooting export operations + +## Monitoring export operations + +### RQ.ClickHouse.ExportPart.SystemTables.Exports +version: 1.0 + +[ClickHouse] SHALL provide a `system.exports` table that allows users to monitor active and completed export operations, track progress metrics, performance statistics, and troubleshoot export issues with the following columns: + +* `source_database`, `source_table` - source table identifiers +* `destination_database`, `destination_table` - destination table identifiers +* `create_time` - when export was submitted +* `part_name` - name of the exported part +* `destination_file_path` - path in destination storage +* `elapsed` - execution time in seconds +* `rows_read`, `total_rows_to_read` - progress metrics +* `total_size_bytes_compressed`, `total_size_bytes_uncompressed` - size metrics +* `bytes_read_uncompressed` - bytes processed +* `memory_usage`, `peak_memory_usage` - memory consumption + +## Enabling export functionality + +### RQ.ClickHouse.ExportPart.Settings.AllowExperimental +version: 1.0 + +[ClickHouse] SHALL support the `allow_experimental_export_merge_tree_part` setting that SHALL gate the experimental export part functionality, which SHALL be set to `1` to enable `ALTER TABLE ... EXPORT PART ...` commands. The default value SHALL be `0` (turned off). + +## Handling file conflicts during export + +### RQ.ClickHouse.ExportPart.Settings.OverwriteFile +version: 1.0 + +[ClickHouse] SHALL support the `export_merge_tree_part_overwrite_file_if_exists` setting that controls whether to overwrite files if they already exist when exporting a merge tree part. The default value SHALL be `0` (turned off). 
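+
+For example, a session that re-exports a part and overwrites the existing destination file would enable both settings described above (a sketch: the part and table names are placeholders, and it assumes both settings are query-level settings that can be changed with `SET`):
+
+```sql
+SET allow_experimental_export_merge_tree_part = 1;
+-- assumption: the overwrite option is a query-level setting, per its grouping above
+SET export_merge_tree_part_overwrite_file_if_exists = 1;
+
+ALTER TABLE events_source EXPORT PART '20240101_1_1_0' TO TABLE events_destination;
+```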
+ +## Export operation configuration + +### RQ.ClickHouse.ExportPart.ParallelFormatting +version: 1.0 + +[ClickHouse] SHALL support parallel formatting for export operations by: +* Automatically enabling parallel formatting for large export operations to improve performance +* Using the `output_format_parallel_formatting` setting to control parallel formatting behavior +* Optimizing data processing based on export size and system resources +* Providing consistent formatting performance across different export scenarios + +## Controlling export performance + +### RQ.ClickHouse.ExportPart.ServerSettings.MaxBandwidth +version: 1.0 + +[ClickHouse] SHALL support the `max_exports_bandwidth_for_server` server setting to limit the maximum read speed of all exports on the server in bytes per second, with `0` meaning unlimited bandwidth. The default value SHALL be `0`. This is a server-level setting configured in the server configuration file. + +## Monitoring export performance metrics + +### RQ.ClickHouse.ExportPart.Events +version: 1.0 + +[ClickHouse] SHALL provide the following export-related events in the `system.events` table: +* `PartsExports` - Number of successful part exports +* `PartsExportFailures` - Number of failed part exports +* `PartsExportDuplicated` - Number of part exports that failed because target already exists +* `PartsExportTotalMilliseconds` - Total time spent on part export operations in milliseconds +* `ExportsThrottlerBytes` - Bytes passed through the exports throttler +* `ExportsThrottlerSleepMicroseconds` - Total time queries were sleeping to conform to export bandwidth throttling + +### RQ.ClickHouse.ExportPart.Metrics.Export +version: 1.0 + +[ClickHouse] SHALL provide the `Export` current metric in `system.metrics` table that tracks the number of currently executing exports. 
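+
+A monitoring sketch that uses only the counters and the metric named above; the `IN` list can be replaced with a broader `LIKE '%Export%'` filter if preferred:
+
+```sql
+SELECT name, value
+FROM system.events
+WHERE name IN ('PartsExports', 'PartsExportFailures', 'PartsExportDuplicated', 'PartsExportTotalMilliseconds');
+
+SELECT value AS running_exports
+FROM system.metrics
+WHERE metric = 'Export';
+```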
+ +## Export operation security + +### RQ.ClickHouse.ExportPart.Security +version: 1.0 + +[ClickHouse] SHALL enforce security requirements for export operations: +* **RBAC**: Users must have the following privileges: + * **Source Table**: `SELECT` privilege on the source table to read data parts + * **Destination Table**: `INSERT` privilege on the destination table to write exported data + * **Database Access**: `SHOW` privilege on both source and destination databases + * **System Tables**: `SELECT` privilege on `system.tables` to validate table existence +* **Data Encryption**: All data in transit to destination storage must be encrypted using TLS/SSL +* **Network Security**: Export operations must use secure connections to destination storage (HTTPS for S3, secure protocols for other storage) +* **Credential Management**: Export operations must use secure credential storage and avoid exposing credentials in logs + + +[ClickHouse]: https://clickhouse.com +""", +) From 3f60ce7dc4614a8948e70757de16e4eccf9f5a8c Mon Sep 17 00:00:00 2001 From: Selfeer Date: Tue, 28 Oct 2025 17:25:10 +0400 Subject: [PATCH 32/99] push requirements --- s3/tests/export_part/engines.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/s3/tests/export_part/engines.py b/s3/tests/export_part/engines.py index 731f52929..0d9337cbf 100644 --- a/s3/tests/export_part/engines.py +++ b/s3/tests/export_part/engines.py @@ -1,9 +1,9 @@ from testflows.core import * from testflows.asserts import error from s3.tests.export_part.steps import * +from s3.requirements.export_part import * - -@TestScenario +@TestCheck def configured_table(self, table_engine, number_of_partitions, number_of_parts): """Test a specific combination of table engine, number of partitions, and number of parts.""" @@ -40,6 +40,7 @@ def configured_table(self, table_engine, number_of_partitions, number_of_parts): @TestSketch(Scenario) @Flags(TE) +@Requirements(RQ_ClickHouse_ExportPart_SourceEngines("1.0")) def table_combos(self): """Test various combinations of table engines, number of partitions, and number of parts.""" From 2a00de58381ad4c5e4873250645fa76bf8837b25 Mon Sep 17 00:00:00 2001 From: julian Date: Tue, 28 Oct 2025 15:38:41 -0400 Subject: [PATCH 33/99] Initial datatypes tests, duplicate test, cleanup --- helpers/queries.py | 7 +++ s3/tests/export_part/clusters_and_nodes.py | 5 +- s3/tests/export_part/datatypes.py | 67 ++++++++++++++++++++ s3/tests/export_part/engines.py | 9 ++- s3/tests/export_part/error_handling.py | 41 ++++++++++++- s3/tests/export_part/feature.py | 9 +-- s3/tests/export_part/sanity.py | 18 ++++-- s3/tests/export_part/steps.py | 21 ++----- s3/tests/export_part/system_monitoring.py | 71 ++++++++++++---------- 9 files changed, 183 insertions(+), 65 deletions(-) create mode 100644 s3/tests/export_part/datatypes.py diff --git a/helpers/queries.py b/helpers/queries.py index de142476b..a8d065166 100644 --- a/helpers/queries.py +++ b/helpers/queries.py @@ -16,6 +16,13 @@ # The extra [0] could be avoided with TSV format, but that does not guarantee valid JSON. 
+@TestStep(When) +def select_all_ordered(self, table_name, node): + """Select all data from a table ordered by partition and index columns.""" + + return node.query(f"SELECT * FROM {table_name} ORDER BY p, i", exitcode=0).output + + @TestStep(When) def sync_replica( self, node: ClickHouseNode, table_name: str, raise_on_timeout=False, **kwargs diff --git a/s3/tests/export_part/clusters_and_nodes.py b/s3/tests/export_part/clusters_and_nodes.py index cba836011..3d027bc64 100644 --- a/s3/tests/export_part/clusters_and_nodes.py +++ b/s3/tests/export_part/clusters_and_nodes.py @@ -4,6 +4,7 @@ from testflows.core import * from testflows.asserts import error from s3.tests.export_part.steps import * +from helpers.queries import * from alter.table.replace_partition.common import create_partitions_with_random_uint64 @@ -15,13 +16,13 @@ def different_nodes_same_destination(self, cluster, node1, node2): partitioned_merge_tree_table( table_name="source", partition_by="p", - columns=default_columns(), + columns=default_columns(simple=False), stop_merges=True, populate=False, cluster=cluster, ) s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True, cluster=cluster + table_name="s3", create_new_bucket=True, cluster=cluster, columns=default_columns(simple=False) ) with And("I populate the source tables on both nodes"): diff --git a/s3/tests/export_part/datatypes.py b/s3/tests/export_part/datatypes.py new file mode 100644 index 000000000..33c3893b2 --- /dev/null +++ b/s3/tests/export_part/datatypes.py @@ -0,0 +1,67 @@ +from testflows.core import * +from s3.tests.export_part.steps import * +from helpers.create import * +from helpers.queries import * + + +@TestCheck +def configured_table(self, partition_key_type): + with Given(f"I create a populated source table with partition key type {partition_key_type} and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(partition_key_type=partition_key_type), + stop_merges=True, + populate=True, + ) + s3_table_name = create_s3_table( + table_name="s3", create_new_bucket=True, columns=default_columns(partition_key_type=partition_key_type) + ) + + with When("I export parts to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + with And("I read data from both tables"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + + with Then("They should be the same"): + assert source_data == destination_data, error() + + +@TestSketch(Scenario) +@Flags(TE) +def basic_partition_key_types(self): + """Check that all partition key data types are supported when exporting parts.""" + + key_types = [ + "Int8", + "Int16", + "Int32", + "Int64", + "UInt8", + "UInt16", + "UInt32", + "UInt64", + "Date", + "DateTime", + "DateTime64", + "String", + # "FixedString(1)", + ] + + configured_table(partition_key_type=either(*key_types)) + + +@TestFeature +@Name("datatypes") +def feature(self): + """Check that all data types are supported when exporting parts.""" + + Scenario(run=basic_partition_key_types) diff --git a/s3/tests/export_part/engines.py b/s3/tests/export_part/engines.py index 0d9337cbf..a58e71b08 100644 --- a/s3/tests/export_part/engines.py +++ b/s3/tests/export_part/engines.py @@ -2,6 +2,11 @@ from testflows.asserts import error from s3.tests.export_part.steps import * from s3.requirements.export_part 
import * +from helpers.queries import * + + +# TODO replicated merge tree tables (all types) + @TestCheck def configured_table(self, table_engine, number_of_partitions, number_of_parts): @@ -15,10 +20,10 @@ def configured_table(self, table_engine, number_of_partitions, number_of_parts): populate=True, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts, - columns=default_columns(), + columns=default_columns(simple=False), ) s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True, columns=default_columns() + table_name="s3", create_new_bucket=True, columns=default_columns(simple=False) ) with When("I export parts to the S3 table"): diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index 1bccc5a81..427aa1841 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -1,7 +1,7 @@ from testflows.core import * from testflows.asserts import error from s3.tests.export_part.steps import * -from helpers.tables import * +from helpers.queries import * @TestScenario @@ -12,12 +12,12 @@ def invalid_part_name(self): partitioned_merge_tree_table( table_name="source", partition_by="p", - columns=default_columns(simple=True), + columns=default_columns(), stop_merges=True, populate=True, ) s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True, simple_columns=True + table_name="s3", create_new_bucket=True ) with And("I create an invalid part name"): @@ -39,9 +39,44 @@ def invalid_part_name(self): ), error() +@TestScenario +def duplicate_exports(self): + """Check duplicate exports are ignored and not exported again.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + populate=True, + ) + s3_table_name = create_s3_table( + table_name="s3", create_new_bucket=True + ) + + with When("I try to export the parts twice"): + results1 = export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + results2 = export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + with Then("The source and destination tables should still be the same"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered(table_name=s3_table_name, node=self.context.node) + assert source_data == destination_data, error() + + @TestFeature @Name("error handling") def feature(self): """Check correct error handling when exporting parts.""" Scenario(run=invalid_part_name) + Scenario(run=duplicate_exports) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 59a1210fc..7b151663c 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -12,8 +12,9 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - Feature(run=load("s3.tests.export_part.sanity", "feature")) - Feature(run=load("s3.tests.export_part.error_handling", "feature")) + # Feature(run=load("s3.tests.export_part.sanity", "feature")) + # Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) - Feature(run=load("s3.tests.export_part.engines", "feature")) + # 
Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) + # Feature(run=load("s3.tests.export_part.engines", "feature")) + # Feature(run=load("s3.tests.export_part.datatypes", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 677245803..0ba850e6c 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -2,6 +2,8 @@ from testflows.asserts import error from s3.tests.export_part.steps import * from helpers.create import * +from helpers.queries import * +from s3.requirements.export_part import * @TestScenario @@ -12,12 +14,12 @@ def mismatched_columns(self): partitioned_merge_tree_table( table_name="source", partition_by="p", - columns=default_columns(simple=True), + columns=default_columns(), stop_merges=True, populate=True, ) s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True, simple_columns=False + table_name="s3", create_new_bucket=True ) with When("I export parts to the S3 table"): @@ -34,6 +36,10 @@ def mismatched_columns(self): @TestScenario +@Requirements( + RQ_ClickHouse_ExportPart_S3("1.0"), + RQ_ClickHouse_ExportPart_SQLCommand("1.0"), +) def basic_table(self): """Test exporting parts of a basic table.""" @@ -41,12 +47,12 @@ def basic_table(self): partitioned_merge_tree_table( table_name="source", partition_by="p", - columns=default_columns(simple=True), + columns=default_columns(), stop_merges=True, populate=True, ) s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True, simple_columns=True + table_name="s3", create_new_bucket=True ) with When("I export parts to the S3 table"): @@ -74,12 +80,12 @@ def empty_table(self): partitioned_merge_tree_table( table_name="empty_source", partition_by="p", - columns=default_columns(simple=True), + columns=default_columns(), stop_merges=False, populate=False, ) s3_table_name = create_s3_table( - table_name="empty_s3", create_new_bucket=True, simple_columns=True + table_name="empty_s3", create_new_bucket=True ) with When("I export parts to the S3 table"): diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 4fb47aec4..f3ba8c1ac 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -7,9 +7,9 @@ from s3.tests.common import temporary_bucket_path -def default_columns(simple=False): +def default_columns(simple=True, partition_key_type="Int8"): columns = [ - {"name": "p", "type": "Int8"}, + {"name": "p", "type": partition_key_type}, {"name": "i", "type": "UInt64"}, {"name": "Path", "type": "String"}, {"name": "Time", "type": "DateTime"}, @@ -18,9 +18,9 @@ def default_columns(simple=False): ] if simple: - columns = columns[:2] - - return columns + return columns[:2] + else: + return columns @TestStep(Given) @@ -41,7 +41,6 @@ def create_s3_table( cluster=None, create_new_bucket=False, columns=None, - simple_columns=False, ): """Create a destination S3 table.""" @@ -49,7 +48,7 @@ def create_s3_table( create_temp_bucket() if columns is None: - columns = default_columns(simple=simple_columns) + columns = default_columns(simple=True) table_name = f"{table_name}_{getuid()}" engine = f""" @@ -64,7 +63,6 @@ def create_s3_table( ) """ - # TODO columns and partition_by are hardcoded for now, but i should make them configurable create_table( table_name=table_name, columns=columns, @@ -101,13 +99,6 @@ def get_parts(self, table_name, node): return [row.strip() for row in output.splitlines()] -@TestStep(When) -def select_all_ordered(self, table_name, node): - """Select all data from 
a table ordered by partition and index columns.""" - - return node.query(f"SELECT * FROM {table_name} ORDER BY p, i", exitcode=0).output - - @TestStep(When) def export_parts(self, source_table, destination_table, node, parts=None, exitcode=0): """Export parts from a source table to a destination table on the same node. If parts are not provided, all parts will be exported.""" diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index 0ff2d3712..c92954c37 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -8,39 +8,44 @@ # partsexports incrementing correctly # duplicates incrementing correctly -@TestScenario -def duplicate_exports(self): - """Check duplicate export attempts are properly tracked in system.events.""" - - with Given("I create source and destination tables"): - source, destination = create_source_and_destination_tables() - - with When("I insert random test data into the source table"): - source.insert_test_data() # default row_count=10, cardinality=1 - - with And("I get a list of parts for source table"): - source_parts = source.get_parts() - test_part = source_parts[1] - - with And("I read initial export events"): - events_initial = export_events() - initial_exports = events_initial.get("PartsExports", 0) - initial_duplicates = events_initial.get("PartsExportDuplicated", 0) - with When("I export the same part twice"): - export_part(parts=[test_part], source=source, destination=destination) - export_part(parts=[test_part], source=source, destination=destination) - - with Then("I check system.events for duplicate tracking"): - events_final = export_events() - final_exports = events_final.get("PartsExports", 0) - final_duplicates = events_final.get("PartsExportDuplicated", 0) - - with By("Checking we have 1 successful export"): - assert final_exports - initial_exports == 1, error() - - with And("Checking we have 1 duplicate export"): - assert final_duplicates - initial_duplicates == 1, error() +@TestScenario +def part_exports(self): + """Check part exports are properly tracked in system.events.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + populate=True, + ) + s3_table_name = create_s3_table( + table_name="s3", create_new_bucket=True + ) + + with And("I read the initial logged number of part exports"): + initial_exports = get_export_events(node=self.context.node)#.get("PartsExports", 0) + note(f"Initial exports: {initial_exports}") + + # with When("I export parts to the S3 table"): + # export_parts( + # source_table="source", + # destination_table=s3_table_name, + # node=self.context.node, + # ) + + # with And("I read the final logged number of part exports"): + # final_exports = get_export_events(node=self.context.node).get("PartsExports", 0) + + # with Then("I check that the number of part exports is correct"): + + # with By("Reading the number of parts for the source table"): + # num_parts = len(get_parts(table_name="source", node=self.context.node)) + + # with And("Checking that the before and after difference is correct"): + # assert final_exports - initial_exports == num_parts, error() @TestFeature @@ -48,4 +53,4 @@ def duplicate_exports(self): def feature(self): """Check system monitoring of export events.""" - Scenario(run=duplicate_exports) + Scenario(run=part_exports) From 60d2767aa16a30728756569420cd4b94e4f9b2d2 Mon 
Sep 17 00:00:00 2001 From: julian Date: Tue, 28 Oct 2025 17:21:15 -0400 Subject: [PATCH 34/99] Basic concurrency test, move getting nodes to queries helper --- helpers/queries.py | 17 +++++++- s3/tests/export_part/clusters_and_nodes.py | 4 +- s3/tests/export_part/concurrency.py | 50 ++++++++++++++++++++++ s3/tests/export_part/datatypes.py | 1 - s3/tests/export_part/engines.py | 1 - s3/tests/export_part/error_handling.py | 5 +-- s3/tests/export_part/feature.py | 11 ++--- s3/tests/export_part/sanity.py | 8 ++-- s3/tests/export_part/steps.py | 16 +------ s3/tests/export_part/system_monitoring.py | 1 - 10 files changed, 80 insertions(+), 34 deletions(-) create mode 100644 s3/tests/export_part/concurrency.py diff --git a/helpers/queries.py b/helpers/queries.py index a8d065166..e0dbcf28d 100644 --- a/helpers/queries.py +++ b/helpers/queries.py @@ -16,11 +16,26 @@ # The extra [0] could be avoided with TSV format, but that does not guarantee valid JSON. +@TestStep(Given) +def get_cluster_nodes(self, cluster, node=None): + """Get all nodes in a cluster.""" + + if node is None: + node = self.context.node + + result = node.query( + f"SELECT host_name FROM system.clusters WHERE cluster = '{cluster}'", exitcode=0 + ) + + nodes = [line.strip() for line in result.output.splitlines() if line.strip()] + return nodes + + @TestStep(When) def select_all_ordered(self, table_name, node): """Select all data from a table ordered by partition and index columns.""" - return node.query(f"SELECT * FROM {table_name} ORDER BY p, i", exitcode=0).output + return node.query(f"SELECT * FROM {table_name} ORDER BY p, i", exitcode=0).output.splitlines() @TestStep(When) diff --git a/s3/tests/export_part/clusters_and_nodes.py b/s3/tests/export_part/clusters_and_nodes.py index 3d027bc64..5905f0392 100644 --- a/s3/tests/export_part/clusters_and_nodes.py +++ b/s3/tests/export_part/clusters_and_nodes.py @@ -16,13 +16,13 @@ def different_nodes_same_destination(self, cluster, node1, node2): partitioned_merge_tree_table( table_name="source", partition_by="p", - columns=default_columns(simple=False), + columns=default_columns(), stop_merges=True, populate=False, cluster=cluster, ) s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True, cluster=cluster, columns=default_columns(simple=False) + table_name="s3", create_new_bucket=True, cluster=cluster ) with And("I populate the source tables on both nodes"): diff --git a/s3/tests/export_part/concurrency.py b/s3/tests/export_part/concurrency.py new file mode 100644 index 000000000..0626c22d7 --- /dev/null +++ b/s3/tests/export_part/concurrency.py @@ -0,0 +1,50 @@ +from testflows.core import * +from s3.tests.export_part.steps import * +from helpers.create import * +from helpers.queries import * + + +@TestScenario +def basic_concurrent_export(self, threads): + """Check concurrent exports from different sources to the same S3 table.""" + + with Given(f"I create {threads} populated source tables and an empty S3 table"): + for i in range(threads): + partitioned_merge_tree_table( + table_name=f"source{i}", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table( + table_name="s3", create_new_bucket=True + ) + + with When("I export parts from all sources concurrently to the S3 table"): + for i in range(threads): + Step(test=export_parts, parallel=True)( + source_table=f"source{i}", + destination_table=s3_table_name, + node=self.context.node, + ) + join() + + with And("I read data from all tables"): + source_data = [] + for i in 
range(threads): + data = select_all_ordered(table_name=f"source{i}", node=self.context.node) + source_data.extend(data) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + + with Then("All data should be present in the S3 table"): + assert set(source_data) == set(destination_data), error() + + +@TestFeature +@Name("concurrency") +def feature(self): + """Check that concurrent exports work correctly.""" + + Scenario(test=basic_concurrent_export)(threads=5) diff --git a/s3/tests/export_part/datatypes.py b/s3/tests/export_part/datatypes.py index 33c3893b2..d7442ab1d 100644 --- a/s3/tests/export_part/datatypes.py +++ b/s3/tests/export_part/datatypes.py @@ -12,7 +12,6 @@ def configured_table(self, partition_key_type): partition_by="p", columns=default_columns(partition_key_type=partition_key_type), stop_merges=True, - populate=True, ) s3_table_name = create_s3_table( table_name="s3", create_new_bucket=True, columns=default_columns(partition_key_type=partition_key_type) diff --git a/s3/tests/export_part/engines.py b/s3/tests/export_part/engines.py index a58e71b08..12a958324 100644 --- a/s3/tests/export_part/engines.py +++ b/s3/tests/export_part/engines.py @@ -17,7 +17,6 @@ def configured_table(self, table_engine, number_of_partitions, number_of_parts): table_name="source", partition_by="p", stop_merges=True, - populate=True, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts, columns=default_columns(simple=False), diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index 427aa1841..dbf32bee6 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -49,19 +49,18 @@ def duplicate_exports(self): partition_by="p", columns=default_columns(), stop_merges=True, - populate=True, ) s3_table_name = create_s3_table( table_name="s3", create_new_bucket=True ) with When("I try to export the parts twice"): - results1 = export_parts( + export_parts( source_table="source", destination_table=s3_table_name, node=self.context.node, ) - results2 = export_parts( + export_parts( source_table="source", destination_table=s3_table_name, node=self.context.node, diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 7b151663c..082d4eb5b 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -12,9 +12,10 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - # Feature(run=load("s3.tests.export_part.sanity", "feature")) - # Feature(run=load("s3.tests.export_part.error_handling", "feature")) + Feature(run=load("s3.tests.export_part.sanity", "feature")) + Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) - # Feature(run=load("s3.tests.export_part.engines", "feature")) - # Feature(run=load("s3.tests.export_part.datatypes", "feature")) + Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) + Feature(run=load("s3.tests.export_part.engines", "feature")) + Feature(run=load("s3.tests.export_part.datatypes", "feature")) + Feature(run=load("s3.tests.export_part.concurrency", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 0ba850e6c..0d1123a74 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -16,10 +16,9 
@@ def mismatched_columns(self): partition_by="p", columns=default_columns(), stop_merges=True, - populate=True, ) s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True + table_name="s3", create_new_bucket=True, columns=default_columns(simple=False) ) with When("I export parts to the S3 table"): @@ -49,7 +48,6 @@ def basic_table(self): partition_by="p", columns=default_columns(), stop_merges=True, - populate=True, ) s3_table_name = create_s3_table( table_name="s3", create_new_bucket=True @@ -104,8 +102,8 @@ def empty_table(self): ) with Then("They should be empty"): - assert source_data == "", error() - assert destination_data == "", error() + assert source_data == [], error() + assert destination_data == [], error() @TestFeature diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index f3ba8c1ac..f9e776fbc 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -4,6 +4,7 @@ from testflows.asserts import error from helpers.common import getuid from helpers.create import * +from helpers.queries import * from s3.tests.common import temporary_bucket_path @@ -74,21 +75,6 @@ def create_s3_table( return table_name -@TestStep(Given) -def get_cluster_nodes(self, cluster, node=None): - """Get all nodes in a cluster.""" - - if node is None: - node = self.context.node - - result = node.query( - f"SELECT host_name FROM system.clusters WHERE cluster = '{cluster}'", exitcode=0 - ) - - nodes = [line.strip() for line in result.output.splitlines() if line.strip()] - return nodes - - @TestStep(When) def get_parts(self, table_name, node): """Get all parts for a table on a given node.""" diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index c92954c37..930a15b21 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -19,7 +19,6 @@ def part_exports(self): partition_by="p", columns=default_columns(), stop_merges=True, - populate=True, ) s3_table_name = create_s3_table( table_name="s3", create_new_bucket=True From 10c5447445e9ccd212dc0f54d540a9637af1a237 Mon Sep 17 00:00:00 2001 From: julian Date: Tue, 28 Oct 2025 20:00:49 -0400 Subject: [PATCH 35/99] Broken setting test --- s3/tests/export_part/datatypes.py | 23 ++++++++++++++-- s3/tests/export_part/feature.py | 10 +++---- s3/tests/export_part/sanity.py | 45 +++++++++++++++++++++++++++++++ s3/tests/export_part/steps.py | 18 ++++++++----- 4 files changed, 83 insertions(+), 13 deletions(-) diff --git a/s3/tests/export_part/datatypes.py b/s3/tests/export_part/datatypes.py index d7442ab1d..b4899aa9c 100644 --- a/s3/tests/export_part/datatypes.py +++ b/s3/tests/export_part/datatypes.py @@ -4,8 +4,27 @@ from helpers.queries import * +# TODO + +# in common.py +# def create_int8() +# def create_int16() +# def create_UIint8() + +# in here +# def create_all_integer_types() + +# a reference: +# @TestStep(Given) +# def create_partitioned_table_with_fixed_string(self, ...): +# with By("creating a MergeTree table with FixedString datatype") +# create_table(table_name=table_name) +# with And(f"inserting data into a {table_name} table to create {number_of_parts} parts") +# create_partitions_for_fixedstring(table_name=table_name) + + @TestCheck -def configured_table(self, partition_key_type): +def simple_configured_table(self, partition_key_type): with Given(f"I create a populated source table with partition key type {partition_key_type} and empty S3 table"): partitioned_merge_tree_table( table_name="source", 
@@ -55,7 +74,7 @@ def basic_partition_key_types(self): # "FixedString(1)", ] - configured_table(partition_key_type=either(*key_types)) + simple_configured_table(partition_key_type=either(*key_types)) @TestFeature diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 082d4eb5b..e5e2401ac 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -13,9 +13,9 @@ def minio(self, uri, bucket_prefix): self.context.bucket_prefix = bucket_prefix Feature(run=load("s3.tests.export_part.sanity", "feature")) - Feature(run=load("s3.tests.export_part.error_handling", "feature")) + # Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) - Feature(run=load("s3.tests.export_part.engines", "feature")) - Feature(run=load("s3.tests.export_part.datatypes", "feature")) - Feature(run=load("s3.tests.export_part.concurrency", "feature")) + # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) + # Feature(run=load("s3.tests.export_part.engines", "feature")) + # Feature(run=load("s3.tests.export_part.datatypes", "feature")) + # Feature(run=load("s3.tests.export_part.concurrency", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 0d1123a74..03a34c59e 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -6,6 +6,50 @@ from s3.requirements.export_part import * +@TestScenario +def export_setting(self): + """Check that the export setting is settable in 2 ways when exporting parts.""" + + with Given("I create a populated source table and 2 empty S3 tables"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name1 = create_s3_table( + table_name="s3_1", create_new_bucket=True + ) + s3_table_name2 = create_s3_table( + table_name="s3_2" + ) + + with When("I export parts to the first S3 table using the SET query"): + export_parts( + source_table="source", + destination_table=s3_table_name1, + node=self.context.node, + explicit_set=True, + ) + + with And("I export parts to the second S3 table using the settings argument"): + export_parts( + source_table="source", + destination_table=s3_table_name2, + node=self.context.node, + explicit_set=False, + ) + + with And("I read data from all tables"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data1 = select_all_ordered(table_name=s3_table_name1, node=self.context.node) + destination_data2 = select_all_ordered(table_name=s3_table_name2, node=self.context.node) + + with Then("All tables should have the same data"): + assert source_data == destination_data1, error() + assert source_data == destination_data2, error() + + @TestScenario def mismatched_columns(self): """Test exporting parts when source and destination tables have mismatched columns.""" @@ -114,3 +158,4 @@ def feature(self): Scenario(run=empty_table) Scenario(run=basic_table) Scenario(run=mismatched_columns) + # Scenario(run=export_setting) # This test fails because of an actual bug in the export setting diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index f9e776fbc..0211d3274 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -86,24 +86,30 @@ def get_parts(self, table_name, node): @TestStep(When) -def export_parts(self, 
source_table, destination_table, node, parts=None, exitcode=0): +def export_parts(self, source_table, destination_table, node, parts=None, exitcode=0, explicit_set=True): """Export parts from a source table to a destination table on the same node. If parts are not provided, all parts will be exported.""" if parts is None: parts = get_parts(table_name=source_table, node=node) + no_checks = exitcode != 0 output = [] for part in parts: - output.append( - node.query( # we should be able to set the settings here instead of using the SET query, but this is a quick workaround for the bug + if explicit_set: + output.append(node.query( f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", - # settings=[("allow_experimental_export_merge_tree_part", 1)], exitcode=exitcode, no_checks=no_checks, - ) - ) + )) + else: + output.append(node.query( + f"ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", + settings=[("allow_experimental_export_merge_tree_part", 1)], + exitcode=exitcode, + no_checks=no_checks, + )) return output From c57b40dce7ee89ab63069098ce78f43c5e8fe217 Mon Sep 17 00:00:00 2001 From: julian Date: Wed, 29 Oct 2025 15:45:16 -0400 Subject: [PATCH 36/99] Partition key types tests --- helpers/queries.py | 4 +- s3/tests/export_part/concurrency.py | 4 +- s3/tests/export_part/datatypes.py | 91 ++++++++++++----------- s3/tests/export_part/engines.py | 4 +- s3/tests/export_part/error_handling.py | 14 ++-- s3/tests/export_part/feature.py | 4 +- s3/tests/export_part/sanity.py | 28 ++++--- s3/tests/export_part/steps.py | 60 +++++++++++---- s3/tests/export_part/system_monitoring.py | 10 +-- 9 files changed, 124 insertions(+), 95 deletions(-) diff --git a/helpers/queries.py b/helpers/queries.py index e0dbcf28d..12ac2b855 100644 --- a/helpers/queries.py +++ b/helpers/queries.py @@ -32,10 +32,10 @@ def get_cluster_nodes(self, cluster, node=None): @TestStep(When) -def select_all_ordered(self, table_name, node): +def select_all_ordered(self, table_name, node, order_by="p, i"): """Select all data from a table ordered by partition and index columns.""" - return node.query(f"SELECT * FROM {table_name} ORDER BY p, i", exitcode=0).output.splitlines() + return node.query(f"SELECT * FROM {table_name} ORDER BY {order_by}", exitcode=0).output.splitlines() @TestStep(When) diff --git a/s3/tests/export_part/concurrency.py b/s3/tests/export_part/concurrency.py index 0626c22d7..d22b3d01c 100644 --- a/s3/tests/export_part/concurrency.py +++ b/s3/tests/export_part/concurrency.py @@ -16,9 +16,7 @@ def basic_concurrent_export(self, threads): columns=default_columns(), stop_merges=True, ) - s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True - ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) with When("I export parts from all sources concurrently to the S3 table"): for i in range(threads): diff --git a/s3/tests/export_part/datatypes.py b/s3/tests/export_part/datatypes.py index b4899aa9c..d3f44d0e1 100644 --- a/s3/tests/export_part/datatypes.py +++ b/s3/tests/export_part/datatypes.py @@ -2,51 +2,69 @@ from s3.tests.export_part.steps import * from helpers.create import * from helpers.queries import * +from helpers.common import getuid -# TODO +@TestStep(Given) +def create_merge_tree_all_valid_partition_key_types( + self, column_name, cluster=None, node=None +): + """Create a MergeTree table with all valid partition key types.""" -# in common.py -# def create_int8() -# def 
create_int16() -# def create_UIint8() + if node is None: + node = self.context.node -# in here -# def create_all_integer_types() + with By("creating a MergeTree table with all data types"): + table_name = f"table_{getuid()}" + create_merge_tree_table( + table_name=table_name, + columns=valid_partition_key_types_columns(), + partition_by=column_name, + cluster=cluster, + stop_merges=True, + ) + + with And("I insert data into the table"): + for i in range(1): + node.query( + f"INSERT INTO {table_name} (int8, int16, int32, int64, uint8, uint16, uint32, uint64, date, date32, datetime, datetime64, string, fixedstring) VALUES (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, '13', '14')" + ) -# a reference: -# @TestStep(Given) -# def create_partitioned_table_with_fixed_string(self, ...): -# with By("creating a MergeTree table with FixedString datatype") -# create_table(table_name=table_name) -# with And(f"inserting data into a {table_name} table to create {number_of_parts} parts") -# create_partitions_for_fixedstring(table_name=table_name) + return table_name @TestCheck -def simple_configured_table(self, partition_key_type): - with Given(f"I create a populated source table with partition key type {partition_key_type} and empty S3 table"): - partitioned_merge_tree_table( - table_name="source", - partition_by="p", - columns=default_columns(partition_key_type=partition_key_type), - stop_merges=True, +def valid_partition_key_table(self, partition_key_type): + """Check exporting to a source table with specified valid partition key type.""" + + with Given( + f"I create a source table with valid partition key type {partition_key_type} and empty S3 table" + ): + table_name = create_merge_tree_all_valid_partition_key_types( + column_name=partition_key_type, ) s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True, columns=default_columns(partition_key_type=partition_key_type) + table_name="s3", + create_new_bucket=True, + columns=valid_partition_key_types_columns(), + partition_by=partition_key_type, ) with When("I export parts to the S3 table"): export_parts( - source_table="source", + source_table=table_name, destination_table=s3_table_name, node=self.context.node, ) with And("I read data from both tables"): - source_data = select_all_ordered(table_name="source", node=self.context.node) + source_data = select_all_ordered( + table_name=table_name, node=self.context.node, order_by=partition_key_type + ) destination_data = select_all_ordered( - table_name=s3_table_name, node=self.context.node + table_name=s3_table_name, + node=self.context.node, + order_by=partition_key_type, ) with Then("They should be the same"): @@ -55,26 +73,11 @@ def simple_configured_table(self, partition_key_type): @TestSketch(Scenario) @Flags(TE) -def basic_partition_key_types(self): +def valid_partition_key_types(self): """Check that all partition key data types are supported when exporting parts.""" - key_types = [ - "Int8", - "Int16", - "Int32", - "Int64", - "UInt8", - "UInt16", - "UInt32", - "UInt64", - "Date", - "DateTime", - "DateTime64", - "String", - # "FixedString(1)", - ] - - simple_configured_table(partition_key_type=either(*key_types)) + key_types = [datatype["name"] for datatype in valid_partition_key_types_columns()] + valid_partition_key_table(partition_key_type=either(*key_types)) @TestFeature @@ -82,4 +85,4 @@ def basic_partition_key_types(self): def feature(self): """Check that all data types are supported when exporting parts.""" - Scenario(run=basic_partition_key_types) + 
Scenario(run=valid_partition_key_types) diff --git a/s3/tests/export_part/engines.py b/s3/tests/export_part/engines.py index 12a958324..ce175f7e9 100644 --- a/s3/tests/export_part/engines.py +++ b/s3/tests/export_part/engines.py @@ -22,7 +22,9 @@ def configured_table(self, table_engine, number_of_partitions, number_of_parts): columns=default_columns(simple=False), ) s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True, columns=default_columns(simple=False) + table_name="s3", + create_new_bucket=True, + columns=default_columns(simple=False), ) with When("I export parts to the S3 table"): diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index dbf32bee6..d041faac6 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -16,9 +16,7 @@ def invalid_part_name(self): stop_merges=True, populate=True, ) - s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True - ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) with And("I create an invalid part name"): invalid_part_name = "in_va_lid_part" @@ -50,9 +48,7 @@ def duplicate_exports(self): columns=default_columns(), stop_merges=True, ) - s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True - ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) with When("I try to export the parts twice"): export_parts( @@ -65,10 +61,12 @@ def duplicate_exports(self): destination_table=s3_table_name, node=self.context.node, ) - + with Then("The source and destination tables should still be the same"): source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered(table_name=s3_table_name, node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) assert source_data == destination_data, error() diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index e5e2401ac..56b6ed4a1 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -12,10 +12,10 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - Feature(run=load("s3.tests.export_part.sanity", "feature")) + # Feature(run=load("s3.tests.export_part.sanity", "feature")) # Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) # Feature(run=load("s3.tests.export_part.engines", "feature")) - # Feature(run=load("s3.tests.export_part.datatypes", "feature")) + Feature(run=load("s3.tests.export_part.datatypes", "feature")) # Feature(run=load("s3.tests.export_part.concurrency", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 03a34c59e..83ff6c52c 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -17,12 +17,8 @@ def export_setting(self): columns=default_columns(), stop_merges=True, ) - s3_table_name1 = create_s3_table( - table_name="s3_1", create_new_bucket=True - ) - s3_table_name2 = create_s3_table( - table_name="s3_2" - ) + s3_table_name1 = create_s3_table(table_name="s3_1", create_new_bucket=True) + s3_table_name2 = create_s3_table(table_name="s3_2") with When("I export parts to the first S3 table using the SET query"): export_parts( @@ -42,8 +38,12 @@ def 
export_setting(self): with And("I read data from all tables"): source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data1 = select_all_ordered(table_name=s3_table_name1, node=self.context.node) - destination_data2 = select_all_ordered(table_name=s3_table_name2, node=self.context.node) + destination_data1 = select_all_ordered( + table_name=s3_table_name1, node=self.context.node + ) + destination_data2 = select_all_ordered( + table_name=s3_table_name2, node=self.context.node + ) with Then("All tables should have the same data"): assert source_data == destination_data1, error() @@ -62,7 +62,9 @@ def mismatched_columns(self): stop_merges=True, ) s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True, columns=default_columns(simple=False) + table_name="s3", + create_new_bucket=True, + columns=default_columns(simple=False), ) with When("I export parts to the S3 table"): @@ -93,9 +95,7 @@ def basic_table(self): columns=default_columns(), stop_merges=True, ) - s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True - ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) with When("I export parts to the S3 table"): export_parts( @@ -126,9 +126,7 @@ def empty_table(self): stop_merges=False, populate=False, ) - s3_table_name = create_s3_table( - table_name="empty_s3", create_new_bucket=True - ) + s3_table_name = create_s3_table(table_name="empty_s3", create_new_bucket=True) with When("I export parts to the S3 table"): export_parts( diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 0211d3274..85c3f578f 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -1,5 +1,3 @@ -import json - from testflows.core import * from testflows.asserts import error from helpers.common import getuid @@ -24,6 +22,25 @@ def default_columns(simple=True, partition_key_type="Int8"): return columns +def valid_partition_key_types_columns(): + return [ + {"name": "int8", "type": "Int8"}, + {"name": "int16", "type": "Int16"}, + {"name": "int32", "type": "Int32"}, + {"name": "int64", "type": "Int64"}, + {"name": "uint8", "type": "UInt8"}, + {"name": "uint16", "type": "UInt16"}, + {"name": "uint32", "type": "UInt32"}, + {"name": "uint64", "type": "UInt64"}, + {"name": "date", "type": "Date"}, + {"name": "date32", "type": "Date32"}, + {"name": "datetime", "type": "DateTime"}, + {"name": "datetime64", "type": "DateTime64"}, + {"name": "string", "type": "String"}, + {"name": "fixedstring", "type": "FixedString(10)"}, + ] + + @TestStep(Given) def create_temp_bucket(self): """Create temporary S3 bucket.""" @@ -42,6 +59,7 @@ def create_s3_table( cluster=None, create_new_bucket=False, columns=None, + partition_by="p", ): """Create a destination S3 table.""" @@ -67,7 +85,7 @@ def create_s3_table( create_table( table_name=table_name, columns=columns, - partition_by="p", + partition_by=partition_by, engine=engine, cluster=cluster, ) @@ -86,7 +104,15 @@ def get_parts(self, table_name, node): @TestStep(When) -def export_parts(self, source_table, destination_table, node, parts=None, exitcode=0, explicit_set=True): +def export_parts( + self, + source_table, + destination_table, + node, + parts=None, + exitcode=0, + explicit_set=True, +): """Export parts from a source table to a destination table on the same node. 
If parts are not provided, all parts will be exported.""" if parts is None: @@ -98,18 +124,22 @@ def export_parts(self, source_table, destination_table, node, parts=None, exitco for part in parts: if explicit_set: - output.append(node.query( - f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", - exitcode=exitcode, - no_checks=no_checks, - )) + output.append( + node.query( + f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", + exitcode=exitcode, + no_checks=no_checks, + ) + ) else: - output.append(node.query( - f"ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", - settings=[("allow_experimental_export_merge_tree_part", 1)], - exitcode=exitcode, - no_checks=no_checks, - )) + output.append( + node.query( + f"ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", + settings=[("allow_experimental_export_merge_tree_part", 1)], + exitcode=exitcode, + no_checks=no_checks, + ) + ) return output diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index 930a15b21..8a59f2393 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -20,14 +20,14 @@ def part_exports(self): columns=default_columns(), stop_merges=True, ) - s3_table_name = create_s3_table( - table_name="s3", create_new_bucket=True - ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) with And("I read the initial logged number of part exports"): - initial_exports = get_export_events(node=self.context.node)#.get("PartsExports", 0) + initial_exports = get_export_events( + node=self.context.node + ) # .get("PartsExports", 0) note(f"Initial exports: {initial_exports}") - + # with When("I export parts to the S3 table"): # export_parts( # source_table="source", From 44b4a26aafc9f23a956f89a48a8b2aa87eeb0861 Mon Sep 17 00:00:00 2001 From: julian Date: Wed, 29 Oct 2025 15:59:06 -0400 Subject: [PATCH 37/99] Add requirements --- s3/tests/export_part/concurrency.py | 2 ++ s3/tests/export_part/datatypes.py | 2 ++ s3/tests/export_part/error_handling.py | 2 ++ s3/tests/export_part/feature.py | 10 +++++----- s3/tests/export_part/sanity.py | 1 + s3/tests/export_part/system_monitoring.py | 6 ++++-- 6 files changed, 16 insertions(+), 7 deletions(-) diff --git a/s3/tests/export_part/concurrency.py b/s3/tests/export_part/concurrency.py index d22b3d01c..dd1a2c1d2 100644 --- a/s3/tests/export_part/concurrency.py +++ b/s3/tests/export_part/concurrency.py @@ -2,6 +2,7 @@ from s3.tests.export_part.steps import * from helpers.create import * from helpers.queries import * +from s3.requirements.export_part import * @TestScenario @@ -41,6 +42,7 @@ def basic_concurrent_export(self, threads): @TestFeature +@Requirements(RQ_ClickHouse_ExportPart_Concurrency("1.0")) @Name("concurrency") def feature(self): """Check that concurrent exports work correctly.""" diff --git a/s3/tests/export_part/datatypes.py b/s3/tests/export_part/datatypes.py index d3f44d0e1..21b7999b3 100644 --- a/s3/tests/export_part/datatypes.py +++ b/s3/tests/export_part/datatypes.py @@ -3,6 +3,7 @@ from helpers.create import * from helpers.queries import * from helpers.common import getuid +from s3.requirements.export_part import * @TestStep(Given) @@ -73,6 +74,7 @@ def valid_partition_key_table(self, partition_key_type): @TestSketch(Scenario) @Flags(TE) 
+@Requirements(RQ_ClickHouse_ExportPart_PartitionKeyTypes("1.0")) def valid_partition_key_types(self): """Check that all partition key data types are supported when exporting parts.""" diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index d041faac6..176dbf3ca 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -2,6 +2,7 @@ from testflows.asserts import error from s3.tests.export_part.steps import * from helpers.queries import * +from s3.requirements.export_part import * @TestScenario @@ -72,6 +73,7 @@ def duplicate_exports(self): @TestFeature @Name("error handling") +@Requirements(RQ_ClickHouse_ExportPart_FailureHandling("1.0")) def feature(self): """Check correct error handling when exporting parts.""" diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 56b6ed4a1..082d4eb5b 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -12,10 +12,10 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - # Feature(run=load("s3.tests.export_part.sanity", "feature")) - # Feature(run=load("s3.tests.export_part.error_handling", "feature")) + Feature(run=load("s3.tests.export_part.sanity", "feature")) + Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) - # Feature(run=load("s3.tests.export_part.engines", "feature")) + Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) + Feature(run=load("s3.tests.export_part.engines", "feature")) Feature(run=load("s3.tests.export_part.datatypes", "feature")) - # Feature(run=load("s3.tests.export_part.concurrency", "feature")) + Feature(run=load("s3.tests.export_part.concurrency", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 83ff6c52c..eb7fd150a 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -51,6 +51,7 @@ def export_setting(self): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_SchemaCompatibility("1.0")) def mismatched_columns(self): """Test exporting parts when source and destination tables have mismatched columns.""" diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index 8a59f2393..a178b1655 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -1,12 +1,13 @@ from testflows.core import * from testflows.asserts import error - from s3.tests.export_part.steps import * +from s3.requirements.export_part import * -# TODO checks on export_events should go here, not in sanity.py +# TODO checks on export_events should go here # partsexports incrementing correctly # duplicates incrementing correctly +# NOTE WIP @TestScenario @@ -49,6 +50,7 @@ def part_exports(self): @TestFeature @Name("system monitoring") +@Requirements(RQ_ClickHouse_ExportPart_Logging("1.0")) def feature(self): """Check system monitoring of export events.""" From cf7cb68d565dff5335741b387dd03e49dafbe18e Mon Sep 17 00:00:00 2001 From: julian Date: Wed, 29 Oct 2025 16:18:49 -0400 Subject: [PATCH 38/99] Export to same table error test --- s3/tests/export_part/error_handling.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/s3/tests/export_part/error_handling.py 
b/s3/tests/export_part/error_handling.py index 176dbf3ca..dccc23162 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -71,6 +71,31 @@ def duplicate_exports(self): assert source_data == destination_data, error() +@TestScenario +def same_table(self): + """Check exporting parts where source and destination tables are the same.""" + + with Given("I create a populated source table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + + with When("I try to export parts to itself"): + results = export_parts( + source_table="source", + destination_table="source", + node=self.context.node, + exitcode=1, + ) + + with Then("I should see an error related to same table exports"): + assert results[0].exitcode == 36, error() + assert "Exporting to the same table is not allowed" in results[0].output, error() + + @TestFeature @Name("error handling") @Requirements(RQ_ClickHouse_ExportPart_FailureHandling("1.0")) @@ -79,3 +104,4 @@ def feature(self): Scenario(run=invalid_part_name) Scenario(run=duplicate_exports) + Scenario(run=same_table) From 34256fa6c2df21b04721c1cb79f3defebad20c69 Mon Sep 17 00:00:00 2001 From: julian Date: Wed, 29 Oct 2025 16:28:40 -0400 Subject: [PATCH 39/99] Add req --- s3/tests/export_part/error_handling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index dccc23162..3a8afc29d 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -72,6 +72,7 @@ def duplicate_exports(self): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_Restrictions_SameTable("1.0")) def same_table(self): """Check exporting parts where source and destination tables are the same.""" From bba8e90f017c9aa99a3371bd91b987a8b53e2cff Mon Sep 17 00:00:00 2001 From: julian Date: Wed, 29 Oct 2025 16:44:01 -0400 Subject: [PATCH 40/99] Local table export error test --- s3/tests/export_part/error_handling.py | 35 ++++++++++++++++++++++++++ s3/tests/export_part/feature.py | 8 +++--- s3/tests/export_part/sanity.py | 2 +- 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index 3a8afc29d..2b0c7092c 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -97,6 +97,40 @@ def same_table(self): assert "Exporting to the same table is not allowed" in results[0].output, error() +@TestScenario +def local_table(self): + """Test exporting parts to a local table.""" + + with Given("I create a populated source table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + + with And("I create an empty local table"): + partitioned_merge_tree_table( + table_name="destination", + partition_by="p", + columns=default_columns(), + stop_merges=True, + populate=False, + ) + + with When("I export parts to the local table"): + results = export_parts( + source_table="source", + destination_table="destination", + node=self.context.node, + exitcode=1, + ) + + with Then("I should see an error related to local table exports"): + assert results[0].exitcode == 48, error() + assert "Destination storage MergeTree does not support MergeTree parts or uses unsupported partitioning" in results[0].output, error() + + @TestFeature @Name("error handling") 
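For reference, a minimal SQL reproduction of the restriction that the same_table scenario above asserts on; the table and part names are illustrative, while the setting, the error code 36, and the message text come from the scenario itself:

    SET allow_experimental_export_merge_tree_part = 1;
    ALTER TABLE source EXPORT PART '1_1_1_0' TO TABLE source;
    -- expected to fail with code 36 (BAD_ARGUMENTS): Exporting to the same table is not allowed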
@Requirements(RQ_ClickHouse_ExportPart_FailureHandling("1.0")) @@ -106,3 +140,4 @@ def feature(self): Scenario(run=invalid_part_name) Scenario(run=duplicate_exports) Scenario(run=same_table) + Scenario(run=local_table) \ No newline at end of file diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 082d4eb5b..725e500ba 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -15,7 +15,7 @@ def minio(self, uri, bucket_prefix): Feature(run=load("s3.tests.export_part.sanity", "feature")) Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) - Feature(run=load("s3.tests.export_part.engines", "feature")) - Feature(run=load("s3.tests.export_part.datatypes", "feature")) - Feature(run=load("s3.tests.export_part.concurrency", "feature")) + # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) + # Feature(run=load("s3.tests.export_part.engines", "feature")) + # Feature(run=load("s3.tests.export_part.datatypes", "feature")) + # Feature(run=load("s3.tests.export_part.concurrency", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index eb7fd150a..2fc11a92e 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -157,4 +157,4 @@ def feature(self): Scenario(run=empty_table) Scenario(run=basic_table) Scenario(run=mismatched_columns) - # Scenario(run=export_setting) # This test fails because of an actual bug in the export setting + # Scenario(run=export_setting) # This test fails because of an actual bug in the export setting \ No newline at end of file From 555dd3907169fa631f86e0dc5d4be343de6836e2 Mon Sep 17 00:00:00 2001 From: julian Date: Wed, 29 Oct 2025 17:01:49 -0400 Subject: [PATCH 41/99] Update reqs, no partition type test --- s3/requirements/export_part.md | 11 +-------- s3/tests/export_part/error_handling.py | 3 +++ s3/tests/export_part/feature.py | 2 +- s3/tests/export_part/sanity.py | 32 ++++++++++++++++++++++++++ 4 files changed, 37 insertions(+), 11 deletions(-) diff --git a/s3/requirements/export_part.md b/s3/requirements/export_part.md index 6891a2a47..7a6874867 100644 --- a/s3/requirements/export_part.md +++ b/s3/requirements/export_part.md @@ -130,7 +130,6 @@ version: 1.0 * `HDFS` - Hadoop Distributed File System (with Hive partitioning) * `Azure` - Microsoft Azure Blob Storage (with Hive partitioning) * `GCS` - Google Cloud Storage (with Hive partitioning) -* Implement the `supportsImport()` method and return `true` ## Destination setup and file management @@ -174,17 +173,9 @@ version: 1.0 | Partition Key Type | Supported | Examples | Notes | |-------------------|------------|----------|-------| | **Integer Types** | ✅ Yes | `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64` | All integer types supported | -| **Date/DateTime Types** | ✅ Yes | `Date`, `DateTime`, `DateTime64` | All date/time types supported | +| **Date/DateTime Types** | ✅ Yes | `Date`, `Date32`, `DateTime`, `DateTime64` | All date/time types supported | | **String Types** | ✅ Yes | `String`, `FixedString` | All string types supported | -| **Date Functions** | ✅ Yes | `toYYYYMM(date_col)`, `toMonday(date_col)`, `toYear(date_col)` | Result in supported types | -| **Mathematical Expressions** | ✅ Yes | `column1 + column2`, `column * 1000` | If result is supported type | -| **String Functions** | ✅ 
Yes | `substring(column, 1, 4)` | Result in String type | -| **Tuple Expressions** | ✅ Yes | `(toMonday(StartDate), EventType)` | If all elements are supported types | | **No Partition Key** | ✅ Yes | Tables without `PARTITION BY` clause | Unpartitioned tables supported | -| **UUID Types** | ❌ No | `UUID` | Not supported by Hive partitioning | -| **Enum Types** | ❌ No | `Enum8`, `Enum16` | Not supported by Hive partitioning | -| **Floating-point Types** | ❌ No | `Float32`, `Float64` | Not supported by Hive partitioning | -| **Hash Functions** | ❌ No | `intHash32(column)`, `cityHash64(column)` | Result in unsupported types | [ClickHouse] SHALL automatically extract partition values from source parts and use them to create proper Hive partitioning structure in destination storage, but only for partition key types that are compatible with Hive partitioning requirements. diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index 2b0c7092c..3c5f4441d 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -131,6 +131,9 @@ def local_table(self): assert "Destination storage MergeTree does not support MergeTree parts or uses unsupported partitioning" in results[0].output, error() +# TODO different partition key + + @TestFeature @Name("error handling") @Requirements(RQ_ClickHouse_ExportPart_FailureHandling("1.0")) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 725e500ba..e5e2401ac 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -13,7 +13,7 @@ def minio(self, uri, bucket_prefix): self.context.bucket_prefix = bucket_prefix Feature(run=load("s3.tests.export_part.sanity", "feature")) - Feature(run=load("s3.tests.export_part.error_handling", "feature")) + # Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) # Feature(run=load("s3.tests.export_part.engines", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 2fc11a92e..c7b44d95e 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -149,6 +149,37 @@ def empty_table(self): assert destination_data == [], error() +@TestScenario +@Requirements(RQ_ClickHouse_ExportPart_PartitionKeyTypes("1.0")) +def no_partition_by(self): + """Test exporting parts when the source table has no PARTITION BY type.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="tuple()", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True, partition_by="tuple()") + + with When("I export parts to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + with And("I read data from both tables"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + + with Then("They should be the same"): + assert source_data == destination_data, error() + + @TestFeature @Name("sanity") def feature(self): @@ -156,5 +187,6 @@ def feature(self): Scenario(run=empty_table) Scenario(run=basic_table) + Scenario(run=no_partition_by) Scenario(run=mismatched_columns) # 
Scenario(run=export_setting) # This test fails because of an actual bug in the export setting \ No newline at end of file From a984005cc0c532afa9024b3419e638f528174d18 Mon Sep 17 00:00:00 2001 From: julian Date: Thu, 30 Oct 2025 09:17:43 -0400 Subject: [PATCH 42/99] Refactor reqs --- s3/requirements/export_part.py | 22 ++-------------------- s3/tests/export_part/datatypes.py | 2 +- s3/tests/export_part/sanity.py | 3 +++ 3 files changed, 6 insertions(+), 21 deletions(-) diff --git a/s3/requirements/export_part.py b/s3/requirements/export_part.py index 2b49a87e0..40789daed 100644 --- a/s3/requirements/export_part.py +++ b/s3/requirements/export_part.py @@ -113,7 +113,6 @@ " * `HDFS` - Hadoop Distributed File System (with Hive partitioning)\n" " * `Azure` - Microsoft Azure Blob Storage (with Hive partitioning)\n" " * `GCS` - Google Cloud Storage (with Hive partitioning)\n" - "* Implement the `supportsImport()` method and return `true`\n" "\n" ), link=None, @@ -193,17 +192,9 @@ "| Partition Key Type | Supported | Examples | Notes |\n" "|-------------------|------------|----------|-------|\n" "| **Integer Types** | ✅ Yes | `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64` | All integer types supported |\n" - "| **Date/DateTime Types** | ✅ Yes | `Date`, `DateTime`, `DateTime64` | All date/time types supported |\n" + "| **Date/DateTime Types** | ✅ Yes | `Date`, `Date32`, `DateTime`, `DateTime64` | All date/time types supported |\n" "| **String Types** | ✅ Yes | `String`, `FixedString` | All string types supported |\n" - "| **Date Functions** | ✅ Yes | `toYYYYMM(date_col)`, `toMonday(date_col)`, `toYear(date_col)` | Result in supported types |\n" - "| **Mathematical Expressions** | ✅ Yes | `column1 + column2`, `column * 1000` | If result is supported type |\n" - "| **String Functions** | ✅ Yes | `substring(column, 1, 4)` | Result in String type |\n" - "| **Tuple Expressions** | ✅ Yes | `(toMonday(StartDate), EventType)` | If all elements are supported types |\n" "| **No Partition Key** | ✅ Yes | Tables without `PARTITION BY` clause | Unpartitioned tables supported |\n" - "| **UUID Types** | ❌ No | `UUID` | Not supported by Hive partitioning |\n" - "| **Enum Types** | ❌ No | `Enum8`, `Enum16` | Not supported by Hive partitioning |\n" - "| **Floating-point Types** | ❌ No | `Float32`, `Float64` | Not supported by Hive partitioning |\n" - "| **Hash Functions** | ❌ No | `intHash32(column)`, `cityHash64(column)` | Result in unsupported types |\n" "\n" "[ClickHouse] SHALL automatically extract partition values from source parts and use them to create proper Hive partitioning structure in destination storage, but only for partition key types that are compatible with Hive partitioning requirements.\n" "\n" @@ -895,7 +886,6 @@ * `HDFS` - Hadoop Distributed File System (with Hive partitioning) * `Azure` - Microsoft Azure Blob Storage (with Hive partitioning) * `GCS` - Google Cloud Storage (with Hive partitioning) -* Implement the `supportsImport()` method and return `true` ## Destination setup and file management @@ -939,17 +929,9 @@ | Partition Key Type | Supported | Examples | Notes | |-------------------|------------|----------|-------| | **Integer Types** | ✅ Yes | `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64` | All integer types supported | -| **Date/DateTime Types** | ✅ Yes | `Date`, `DateTime`, `DateTime64` | All date/time types supported | +| **Date/DateTime Types** | ✅ Yes | `Date`, `Date32`, `DateTime`, `DateTime64` | All date/time types supported | 
| **String Types** | ✅ Yes | `String`, `FixedString` | All string types supported | -| **Date Functions** | ✅ Yes | `toYYYYMM(date_col)`, `toMonday(date_col)`, `toYear(date_col)` | Result in supported types | -| **Mathematical Expressions** | ✅ Yes | `column1 + column2`, `column * 1000` | If result is supported type | -| **String Functions** | ✅ Yes | `substring(column, 1, 4)` | Result in String type | -| **Tuple Expressions** | ✅ Yes | `(toMonday(StartDate), EventType)` | If all elements are supported types | | **No Partition Key** | ✅ Yes | Tables without `PARTITION BY` clause | Unpartitioned tables supported | -| **UUID Types** | ❌ No | `UUID` | Not supported by Hive partitioning | -| **Enum Types** | ❌ No | `Enum8`, `Enum16` | Not supported by Hive partitioning | -| **Floating-point Types** | ❌ No | `Float32`, `Float64` | Not supported by Hive partitioning | -| **Hash Functions** | ❌ No | `intHash32(column)`, `cityHash64(column)` | Result in unsupported types | [ClickHouse] SHALL automatically extract partition values from source parts and use them to create proper Hive partitioning structure in destination storage, but only for partition key types that are compatible with Hive partitioning requirements. diff --git a/s3/tests/export_part/datatypes.py b/s3/tests/export_part/datatypes.py index 21b7999b3..094e89d49 100644 --- a/s3/tests/export_part/datatypes.py +++ b/s3/tests/export_part/datatypes.py @@ -26,7 +26,7 @@ def create_merge_tree_all_valid_partition_key_types( ) with And("I insert data into the table"): - for i in range(1): + for i in range(10): node.query( f"INSERT INTO {table_name} (int8, int16, int32, int64, uint8, uint16, uint32, uint64, date, date32, datetime, datetime64, string, fixedstring) VALUES (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, '13', '14')" ) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index c7b44d95e..242366069 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -180,6 +180,9 @@ def no_partition_by(self): assert source_data == destination_data, error() +# TODO wildcard partition strategy? 
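For reference on the Hive partitioning requirement above: the layout it describes reduces to `column=value/` key prefixes in the destination bucket. A minimal sketch, not taken from any patch in this series (helper name and values are illustrative only):

def expected_hive_prefixes(partition_column, partition_values):
    """Return the 'column=value/' key prefixes Hive-style partitioning produces."""
    return [f"{partition_column}={value}/" for value in sorted(set(partition_values))]

# A table partitioned by 'p' with parts for p in {1, 3} would be expected to
# land under these prefixes after export:
assert expected_hive_prefixes("p", [3, 1, 3]) == ["p=1/", "p=3/"]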
+ + @TestFeature @Name("sanity") def feature(self): From bb44c1e9cc6845e6c510d6535dece811205e701f Mon Sep 17 00:00:00 2001 From: Selfeer Date: Thu, 30 Oct 2025 19:03:19 +0400 Subject: [PATCH 43/99] push usage of netem to s3 --- s3/regression.py | 5 +++++ s3/s3_env/clickhouse-service.yml | 2 ++ s3/s3_env_arm64/clickhouse-service.yml | 2 ++ 3 files changed, 9 insertions(+) mode change 100644 => 100755 s3/s3_env/clickhouse-service.yml mode change 100644 => 100755 s3/s3_env_arm64/clickhouse-service.yml diff --git a/s3/regression.py b/s3/regression.py index cc4fe9500..ff65e6041 100755 --- a/s3/regression.py +++ b/s3/regression.py @@ -541,6 +541,7 @@ def minio_regression( self.context.node = cluster.node("clickhouse1") self.context.node2 = cluster.node("clickhouse2") self.context.node3 = cluster.node("clickhouse3") + self.context.nodes = [self.context.node, self.context.node2, self.context.node3] with And("I have a minio client"): start_minio(access_key=root_user, secret_key=root_password) @@ -551,6 +552,10 @@ def minio_regression( for node in nodes["clickhouse"]: experimental_analyzer(node=cluster.node(node), with_analyzer=with_analyzer) + with And("I install tc-netem on all clickhouse nodes"): + for node in self.context.nodes: + node.command("apt install --yes iproute2 procps") + # with And("allow higher cpu_wait_ratio "): # if check_clickhouse_version(">=25.4")(self): # allow_higher_cpu_wait_ratio( diff --git a/s3/s3_env/clickhouse-service.yml b/s3/s3_env/clickhouse-service.yml old mode 100644 new mode 100755 index f3cc6cc7d..c766d2085 --- a/s3/s3_env/clickhouse-service.yml +++ b/s3/s3_env/clickhouse-service.yml @@ -22,3 +22,5 @@ services: - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/config.d/system_unfreeze.xml:/etc/clickhouse-server/config.d/system_unfreeze.xml" - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/ssl:/etc/clickhouse-server/ssl" - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/users.d/output_format_json_quote_64bit_integers.xml:/etc/clickhouse-server/users.d/output_format_json_quote_64bit_integers.xml" + cap_add: + - NET_ADMIN \ No newline at end of file diff --git a/s3/s3_env_arm64/clickhouse-service.yml b/s3/s3_env_arm64/clickhouse-service.yml old mode 100644 new mode 100755 index f1feb9ad8..f5f2dfeef --- a/s3/s3_env_arm64/clickhouse-service.yml +++ b/s3/s3_env_arm64/clickhouse-service.yml @@ -22,3 +22,5 @@ services: - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/config.d/system_unfreeze.xml:/etc/clickhouse-server/config.d/system_unfreeze.xml" - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/ssl:/etc/clickhouse-server/ssl" - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/users.d/output_format_json_quote_64bit_integers.xml:/etc/clickhouse-server/users.d/output_format_json_quote_64bit_integers.xml" + cap_add: + - NET_ADMIN \ No newline at end of file From e1444da52e0ba669549eee80229887128e4551a9 Mon Sep 17 00:00:00 2001 From: julian Date: Thu, 30 Oct 2025 11:29:34 -0400 Subject: [PATCH 44/99] Wide compact parts, fixes --- helpers/create.py | 6 ++++ helpers/tables.py | 22 ++++++++++---- s3/requirements/export_part.md | 4 --- s3/requirements/export_part.py | 8 ----- s3/tests/export_part/datatypes.py | 41 ++++++++++++++++++-------- s3/tests/export_part/error_handling.py | 15 ++++++---- s3/tests/export_part/feature.py | 2 +- s3/tests/export_part/sanity.py | 35 ++++++++++++++++++++-- s3/tests/export_part/steps.py | 2 +- 9 files changed, 95 insertions(+), 40 deletions(-) diff --git a/helpers/create.py b/helpers/create.py index b4cfe605c..4962a8859 100644 --- a/helpers/create.py +++ 
b/helpers/create.py @@ -25,6 +25,7 @@ def create_table( settings=None, partition_by=None, stop_merges=False, + query_settings=None, ): """ Generates a query to create a table in ClickHouse. @@ -109,6 +110,9 @@ def create_table( if comment: query += f" COMMENT '{comment}'" + if query_settings: + query += f" SETTINGS {query_settings}" + query += ";" if stop_merges: @@ -140,6 +144,7 @@ def create_merge_tree_table( partition_by: str = None, cluster: str = None, stop_merges: bool = False, + query_settings: str = None, ): """Create a table with the MergeTree engine.""" create_table( @@ -154,6 +159,7 @@ def create_merge_tree_table( partition_by=partition_by, cluster=cluster, stop_merges=stop_merges, + query_settings=query_settings, ) diff --git a/helpers/tables.py b/helpers/tables.py index 94c6a9582..70189f7d1 100644 --- a/helpers/tables.py +++ b/helpers/tables.py @@ -423,6 +423,7 @@ def create_table( node=None, cluster=None, order_by_all_columns=False, + stop_merges=False, ): """Create a table with specified name and engine.""" if settings is None: @@ -480,6 +481,11 @@ def create_table( if query_settings is not None: query += f"\nSETTINGS {query_settings}" + query += ";" + + if stop_merges: + query += f" SYSTEM STOP MERGES {name};" + node.query( query, settings=settings, @@ -564,19 +570,25 @@ def create_partitioned_table_with_compact_and_wide_parts( min_rows_for_wide_part=10, min_bytes_for_wide_part=100, engine="MergeTree", + columns=[ + Column(name="p", datatype=UInt8()), + Column(name="i", datatype=UInt64()), + ], + partition_by="p", + cluster=None, + stop_merges=False, ): """Create a partitioned table that has specific settings in order to get both wide and compact parts.""" create_table( name=table_name, engine=engine, - partition_by="p", + partition_by=partition_by, order_by="tuple()", - columns=[ - Column(name="p", datatype=UInt8()), - Column(name="i", datatype=UInt64()), - ], + columns=columns, + cluster=cluster, query_settings=f"min_rows_for_wide_part={min_rows_for_wide_part}, min_bytes_for_wide_part={min_bytes_for_wide_part}", + stop_merges=stop_merges, ) diff --git a/s3/requirements/export_part.md b/s3/requirements/export_part.md index 7a6874867..218003e40 100644 --- a/s3/requirements/export_part.md +++ b/s3/requirements/export_part.md @@ -192,10 +192,6 @@ version: 1.0 |-----------|------------|-------------|------------------| | **Wide Parts** | ✅ Yes | Data of each column stored in separate files with marks | Standard format for most parts | | **Compact Parts** | ✅ Yes | All column data stored in single file with single marks file | Optimized for small parts | -| **Regular Parts** | ✅ Yes | Standard data parts created by inserts, merges, mutations | Full data content | -| **Patch Parts** | ✅ Yes | Lightweight update parts containing only changed columns | Applied during export | -| **Active Parts** | ✅ Yes | Currently active data parts | Primary export target | -| **Outdated Parts** | ✅ Yes | Parts that have been replaced by newer versions | Can be exported for backup | [ClickHouse] SHALL handle all special columns and metadata present in parts during export: diff --git a/s3/requirements/export_part.py b/s3/requirements/export_part.py index 40789daed..722432bfc 100644 --- a/s3/requirements/export_part.py +++ b/s3/requirements/export_part.py @@ -220,10 +220,6 @@ "|-----------|------------|-------------|------------------|\n" "| **Wide Parts** | ✅ Yes | Data of each column stored in separate files with marks | Standard format for most parts |\n" "| **Compact Parts** | ✅ Yes | All 
column data stored in single file with single marks file | Optimized for small parts |\n" - "| **Regular Parts** | ✅ Yes | Standard data parts created by inserts, merges, mutations | Full data content |\n" - "| **Patch Parts** | ✅ Yes | Lightweight update parts containing only changed columns | Applied during export |\n" - "| **Active Parts** | ✅ Yes | Currently active data parts | Primary export target |\n" - "| **Outdated Parts** | ✅ Yes | Parts that have been replaced by newer versions | Can be exported for backup |\n" "\n" "[ClickHouse] SHALL handle all special columns and metadata present in parts during export:\n" "\n" @@ -948,10 +944,6 @@ |-----------|------------|-------------|------------------| | **Wide Parts** | ✅ Yes | Data of each column stored in separate files with marks | Standard format for most parts | | **Compact Parts** | ✅ Yes | All column data stored in single file with single marks file | Optimized for small parts | -| **Regular Parts** | ✅ Yes | Standard data parts created by inserts, merges, mutations | Full data content | -| **Patch Parts** | ✅ Yes | Lightweight update parts containing only changed columns | Applied during export | -| **Active Parts** | ✅ Yes | Currently active data parts | Primary export target | -| **Outdated Parts** | ✅ Yes | Parts that have been replaced by newer versions | Can be exported for backup | [ClickHouse] SHALL handle all special columns and metadata present in parts during export: diff --git a/s3/tests/export_part/datatypes.py b/s3/tests/export_part/datatypes.py index 094e89d49..626f1f895 100644 --- a/s3/tests/export_part/datatypes.py +++ b/s3/tests/export_part/datatypes.py @@ -8,9 +8,9 @@ @TestStep(Given) def create_merge_tree_all_valid_partition_key_types( - self, column_name, cluster=None, node=None + self, column_name, cluster=None, node=None, rows=1 ): - """Create a MergeTree table with all valid partition key types.""" + """Create a MergeTree table with all valid partition key types and both wide and compact parts.""" if node is None: node = self.context.node @@ -23,26 +23,27 @@ def create_merge_tree_all_valid_partition_key_types( partition_by=column_name, cluster=cluster, stop_merges=True, + query_settings=f"min_rows_for_wide_part=10", ) - with And("I insert data into the table"): - for i in range(10): - node.query( - f"INSERT INTO {table_name} (int8, int16, int32, int64, uint8, uint16, uint32, uint64, date, date32, datetime, datetime64, string, fixedstring) VALUES (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, '13', '14')" - ) + with And("I insert compact and wide parts into the table"): + node.query( + f"INSERT INTO {table_name} (int8, int16, int32, int64, uint8, uint16, uint32, uint64, date, date32, datetime, datetime64, string, fixedstring) SELECT 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, '13', '14' FROM numbers({rows})" + ) return table_name @TestCheck -def valid_partition_key_table(self, partition_key_type): - """Check exporting to a source table with specified valid partition key type.""" +def valid_partition_key_table(self, partition_key_type, rows=1): + """Check exporting to a source table with specified valid partition key type and rows.""" with Given( f"I create a source table with valid partition key type {partition_key_type} and empty S3 table" ): table_name = create_merge_tree_all_valid_partition_key_types( column_name=partition_key_type, + rows=rows, ) s3_table_name = create_s3_table( table_name="s3", @@ -75,16 +76,30 @@ def valid_partition_key_table(self, partition_key_type): @TestSketch(Scenario) @Flags(TE) 
@Requirements(RQ_ClickHouse_ExportPart_PartitionKeyTypes("1.0")) -def valid_partition_key_types(self): - """Check that all partition key data types are supported when exporting parts.""" +def valid_partition_key_types_compact(self): + """Check that all partition key data types are supported when exporting compact parts.""" + + key_types = [datatype["name"] for datatype in valid_partition_key_types_columns()] + valid_partition_key_table(partition_key_type=either(*key_types), rows=1) + + +@TestSketch(Scenario) +@Flags(TE) +def valid_partition_key_types_wide(self): + """Check that all partition key data types are supported when exporting wide parts.""" key_types = [datatype["name"] for datatype in valid_partition_key_types_columns()] - valid_partition_key_table(partition_key_type=either(*key_types)) + valid_partition_key_table(partition_key_type=either(*key_types), rows=100) @TestFeature @Name("datatypes") +@Requirements( + RQ_ClickHouse_ExportPart_PartitionKeyTypes("1.0"), + RQ_ClickHouse_ExportPart_PartTypes("1.0"), +) def feature(self): """Check that all data types are supported when exporting parts.""" - Scenario(run=valid_partition_key_types) + Scenario(run=valid_partition_key_types_compact) + Scenario(run=valid_partition_key_types_wide) diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index 3c5f4441d..0d6bcb3bd 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -94,13 +94,15 @@ def same_table(self): with Then("I should see an error related to same table exports"): assert results[0].exitcode == 36, error() - assert "Exporting to the same table is not allowed" in results[0].output, error() + assert ( + "Exporting to the same table is not allowed" in results[0].output + ), error() @TestScenario def local_table(self): """Test exporting parts to a local table.""" - + with Given("I create a populated source table"): partitioned_merge_tree_table( table_name="source", @@ -108,7 +110,7 @@ def local_table(self): columns=default_columns(), stop_merges=True, ) - + with And("I create an empty local table"): partitioned_merge_tree_table( table_name="destination", @@ -128,7 +130,10 @@ def local_table(self): with Then("I should see an error related to local table exports"): assert results[0].exitcode == 48, error() - assert "Destination storage MergeTree does not support MergeTree parts or uses unsupported partitioning" in results[0].output, error() + assert ( + "Destination storage MergeTree does not support MergeTree parts or uses unsupported partitioning" + in results[0].output + ), error() # TODO different partition key @@ -143,4 +148,4 @@ def feature(self): Scenario(run=invalid_part_name) Scenario(run=duplicate_exports) Scenario(run=same_table) - Scenario(run=local_table) \ No newline at end of file + Scenario(run=local_table) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index e5e2401ac..b8acac116 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -17,5 +17,5 @@ def minio(self, uri, bucket_prefix): # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) # Feature(run=load("s3.tests.export_part.engines", "feature")) - # Feature(run=load("s3.tests.export_part.datatypes", "feature")) + Feature(run=load("s3.tests.export_part.datatypes", "feature")) # Feature(run=load("s3.tests.export_part.concurrency", "feature")) diff --git 
a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 242366069..7ad2fe08a 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -4,6 +4,7 @@ from helpers.create import * from helpers.queries import * from s3.requirements.export_part import * +from alter.table.replace_partition.partition_types import * @TestScenario @@ -153,7 +154,7 @@ def empty_table(self): @Requirements(RQ_ClickHouse_ExportPart_PartitionKeyTypes("1.0")) def no_partition_by(self): """Test exporting parts when the source table has no PARTITION BY type.""" - + with Given("I create a populated source table and empty S3 table"): partitioned_merge_tree_table( table_name="source", @@ -161,7 +162,34 @@ def no_partition_by(self): columns=default_columns(), stop_merges=True, ) - s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True, partition_by="tuple()") + s3_table_name = create_s3_table( + table_name="s3", create_new_bucket=True, partition_by="tuple()" + ) + + with When("I export parts to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + with And("I read data from both tables"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + + with Then("They should be the same"): + assert source_data == destination_data, error() + + +@TestScenario +def wide_and_compact_parts(self): + """Check that exporting with both wide and compact parts is supported.""" + + with Given("I create a source table with wide and compact parts"): + table_with_compact_and_wide_parts(table_name="source") + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) with When("I export parts to the S3 table"): export_parts( @@ -192,4 +220,5 @@ def feature(self): Scenario(run=basic_table) Scenario(run=no_partition_by) Scenario(run=mismatched_columns) - # Scenario(run=export_setting) # This test fails because of an actual bug in the export setting \ No newline at end of file + Scenario(run=wide_and_compact_parts) + # Scenario(run=export_setting) # This test fails because of an actual bug in the export setting diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 85c3f578f..d88af2418 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -6,7 +6,7 @@ from s3.tests.common import temporary_bucket_path -def default_columns(simple=True, partition_key_type="Int8"): +def default_columns(simple=True, partition_key_type="UInt8"): columns = [ {"name": "p", "type": partition_key_type}, {"name": "i", "type": "UInt64"}, From ed16c9f14d48de1b9b6970ebebd62462f6c0574e Mon Sep 17 00:00:00 2001 From: julian Date: Thu, 30 Oct 2025 11:40:39 -0400 Subject: [PATCH 45/99] Add reqs --- s3/tests/export_part/sanity.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 7ad2fe08a..e5ade6f4c 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -8,6 +8,7 @@ @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_Settings_AllowExperimental("1.0")) def export_setting(self): """Check that the export setting is settable in 2 ways when exporting parts.""" @@ -184,6 +185,7 @@ def no_partition_by(self): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_PartTypes("1.0")) def wide_and_compact_parts(self): """Check that exporting with both wide and 
compact parts is supported.""" @@ -208,9 +210,6 @@ def wide_and_compact_parts(self): assert source_data == destination_data, error() -# TODO wildcard partition strategy? - - @TestFeature @Name("sanity") def feature(self): From 61b6f3ee7177bcf9f76e0fd0f190622ddb8ce3c5 Mon Sep 17 00:00:00 2001 From: julian Date: Thu, 30 Oct 2025 12:13:29 -0400 Subject: [PATCH 46/99] Export setting test, black --- helpers/queries.py | 4 +++- helpers/tables.py | 6 ++--- s3/tests/export_part/error_handling.py | 28 +++++++++++++++++++++++ s3/tests/export_part/feature.py | 6 ++--- s3/tests/export_part/sanity.py | 4 ++-- s3/tests/export_part/steps.py | 14 +++++++++--- s3/tests/export_part/system_monitoring.py | 3 ++- 7 files changed, 52 insertions(+), 13 deletions(-) diff --git a/helpers/queries.py b/helpers/queries.py index 12ac2b855..42f0835f9 100644 --- a/helpers/queries.py +++ b/helpers/queries.py @@ -35,7 +35,9 @@ def get_cluster_nodes(self, cluster, node=None): def select_all_ordered(self, table_name, node, order_by="p, i"): """Select all data from a table ordered by partition and index columns.""" - return node.query(f"SELECT * FROM {table_name} ORDER BY {order_by}", exitcode=0).output.splitlines() + return node.query( + f"SELECT * FROM {table_name} ORDER BY {order_by}", exitcode=0 + ).output.splitlines() @TestStep(When) diff --git a/helpers/tables.py b/helpers/tables.py index 70189f7d1..0ad5fc477 100644 --- a/helpers/tables.py +++ b/helpers/tables.py @@ -571,9 +571,9 @@ def create_partitioned_table_with_compact_and_wide_parts( min_bytes_for_wide_part=100, engine="MergeTree", columns=[ - Column(name="p", datatype=UInt8()), - Column(name="i", datatype=UInt64()), - ], + Column(name="p", datatype=UInt8()), + Column(name="i", datatype=UInt64()), + ], partition_by="p", cluster=None, stop_merges=False, diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index 0d6bcb3bd..620369b0a 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -136,6 +136,33 @@ def local_table(self): ), error() +@TestScenario +def disable_export_setting(self): + """Check that exporting parts without the export setting set returns the correct error.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I try to export the parts with the export setting disabled"): + results = export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + exitcode=1, + explicit_set=-1, + ) + + with Then("I should see an error related to the export setting"): + assert results[0].exitcode == 88, error() + assert "Exporting merge tree part is experimental" in results[0].output, error() + + # TODO different partition key @@ -149,3 +176,4 @@ def feature(self): Scenario(run=duplicate_exports) Scenario(run=same_table) Scenario(run=local_table) + Scenario(run=disable_export_setting) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index b8acac116..daa08bfb2 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -12,10 +12,10 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - Feature(run=load("s3.tests.export_part.sanity", "feature")) - # 
Feature(run=load("s3.tests.export_part.error_handling", "feature")) + # Feature(run=load("s3.tests.export_part.sanity", "feature")) + Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) # Feature(run=load("s3.tests.export_part.engines", "feature")) - Feature(run=load("s3.tests.export_part.datatypes", "feature")) + # Feature(run=load("s3.tests.export_part.datatypes", "feature")) # Feature(run=load("s3.tests.export_part.concurrency", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index e5ade6f4c..fef421272 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -27,7 +27,7 @@ def export_setting(self): source_table="source", destination_table=s3_table_name1, node=self.context.node, - explicit_set=True, + explicit_set=1, ) with And("I export parts to the second S3 table using the settings argument"): @@ -35,7 +35,7 @@ def export_setting(self): source_table="source", destination_table=s3_table_name2, node=self.context.node, - explicit_set=False, + explicit_set=0, ) with And("I read data from all tables"): diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index d88af2418..f4878353a 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -111,7 +111,7 @@ def export_parts( node, parts=None, exitcode=0, - explicit_set=True, + explicit_set=1, ): """Export parts from a source table to a destination table on the same node. If parts are not provided, all parts will be exported.""" @@ -123,7 +123,7 @@ def export_parts( output = [] for part in parts: - if explicit_set: + if explicit_set == 1: output.append( node.query( f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", @@ -131,7 +131,7 @@ def export_parts( no_checks=no_checks, ) ) - else: + elif explicit_set == 0: output.append( node.query( f"ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", @@ -140,6 +140,14 @@ def export_parts( no_checks=no_checks, ) ) + elif explicit_set == -1: + output.append( + node.query( + f"SET allow_experimental_export_merge_tree_part = 0; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", + exitcode=exitcode, + no_checks=no_checks, + ) + ) return output diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index a178b1655..c95efe148 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -4,9 +4,10 @@ from s3.requirements.export_part import * -# TODO checks on export_events should go here +# TODO checks on export events should go here # partsexports incrementing correctly # duplicates incrementing correctly +# part_log is where to look # NOTE WIP From 65baf31bd803f81a65be19f584215987c29ba760 Mon Sep 17 00:00:00 2001 From: julian Date: Thu, 30 Oct 2025 12:29:35 -0400 Subject: [PATCH 47/99] Rename files, prep for network tests --- .../{clusters_and_nodes.py => clusters_nodes.py} | 0 .../{concurrency.py => concurrency_networks.py} | 11 +++++++++-- s3/tests/export_part/feature.py | 6 +++--- 3 files changed, 12 insertions(+), 5 deletions(-) rename s3/tests/export_part/{clusters_and_nodes.py => clusters_nodes.py} (100%) rename s3/tests/export_part/{concurrency.py => concurrency_networks.py} (72%) diff --git 
a/s3/tests/export_part/clusters_and_nodes.py b/s3/tests/export_part/clusters_nodes.py similarity index 100% rename from s3/tests/export_part/clusters_and_nodes.py rename to s3/tests/export_part/clusters_nodes.py diff --git a/s3/tests/export_part/concurrency.py b/s3/tests/export_part/concurrency_networks.py similarity index 72% rename from s3/tests/export_part/concurrency.py rename to s3/tests/export_part/concurrency_networks.py index dd1a2c1d2..0bd90ac4f 100644 --- a/s3/tests/export_part/concurrency.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -43,8 +43,15 @@ def basic_concurrent_export(self, threads): @TestFeature @Requirements(RQ_ClickHouse_ExportPart_Concurrency("1.0")) -@Name("concurrency") +@Name("concurrency and networks") def feature(self): - """Check that concurrent exports work correctly.""" + """Check that exports work correctly with concurrency and various network conditions.""" Scenario(test=basic_concurrent_export)(threads=5) + # Scenario(test=network_packet_delay)(delay_ms=100) + # Scenario(test=network_packet_loss)(percent_loss=50) + # Scenario(test=network_packet_loss_gemodel)(interruption_probability=10, recovery_probability=90) + # Scenario(test=network_packet_corruption)(percent_corrupt=20) + # Scenario(test=network_packet_duplication)(percent_duplicated=10) + # Scenario(test=network_packet_reordering)(delay_ms=100, percent_reordered=90) + # Scenario(test=network_packet_rate_limit)(rate_mbit=10) \ No newline at end of file diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index daa08bfb2..6754e09c4 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -13,9 +13,9 @@ def minio(self, uri, bucket_prefix): self.context.bucket_prefix = bucket_prefix # Feature(run=load("s3.tests.export_part.sanity", "feature")) - Feature(run=load("s3.tests.export_part.error_handling", "feature")) + # Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - # Feature(run=load("s3.tests.export_part.clusters_and_nodes", "feature")) + # Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) # Feature(run=load("s3.tests.export_part.engines", "feature")) # Feature(run=load("s3.tests.export_part.datatypes", "feature")) - # Feature(run=load("s3.tests.export_part.concurrency", "feature")) + Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) From 343e37372d63b2b06ad21ccbe81081fee0b3558b Mon Sep 17 00:00:00 2001 From: julian Date: Thu, 30 Oct 2025 16:47:01 -0400 Subject: [PATCH 48/99] Network jigglin --- s3/tests/export_part/concurrency_networks.py | 260 ++++++++++++++++++- s3/tests/export_part/system_monitoring.py | 4 +- 2 files changed, 255 insertions(+), 9 deletions(-) diff --git a/s3/tests/export_part/concurrency_networks.py b/s3/tests/export_part/concurrency_networks.py index 0bd90ac4f..dc641bb07 100644 --- a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -3,6 +3,7 @@ from helpers.create import * from helpers.queries import * from s3.requirements.export_part import * +from alter.stress.tests.tc_netem import * @TestScenario @@ -41,6 +42,251 @@ def basic_concurrent_export(self, threads): assert set(source_data) == set(destination_data), error() +@TestScenario +def packet_delay(self, delay_ms): + """Check that exports work correctly with packet delay.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + 
table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I apply a packet delay"): + network_packet_delay(node=self.context.node, delay_ms=delay_ms) + + with And("I export parts from the source table to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + for retry in retries(timeout=30, delay=1): + with retry: + with Then("I read data from both tables"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + + with And("They should be the same"): + assert source_data == destination_data, error() + + +@TestScenario +def packet_loss(self, percent_loss): + """Check that exports work correctly with packet loss.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I apply packet loss"): + network_packet_loss(node=self.context.node, percent_loss=percent_loss) + + with And("I export parts from the source table to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + for retry in retries(timeout=30, delay=1): + with retry: + with Then("I read data from both tables"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + + with And("They should be the same"): + assert source_data == destination_data, error() + + +@TestScenario +def packet_loss_gemodel(self, interruption_probability, recovery_probability): + """Check that exports work correctly with packet loss using the GE model.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I apply packet loss using the GE model"): + network_packet_loss_gemodel(node=self.context.node, interruption_probability=interruption_probability, recovery_probability=recovery_probability) + + with And("I export parts from the source table to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + for retry in retries(timeout=30, delay=1): + with retry: + with Then("I read data from both tables"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + + with And("They should be the same"): + assert source_data == destination_data, error() + + +@TestScenario +def packet_corruption(self, percent_corrupt): + """Check that exports work correctly with packet corruption.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I apply packet corruption"): + 
network_packet_corruption( + node=self.context.node, percent_corrupt=percent_corrupt) + + with And("I export parts from the source table to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + for retry in retries(timeout=30, delay=1): + with retry: + with Then("I read data from both tables"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + + with And("They should be the same"): + assert source_data == destination_data, error() + + +@TestScenario +def packet_duplication(self, percent_duplicated): + """Check that exports work correctly with packet duplication.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I apply packet duplication"): + network_packet_duplication(node=self.context.node, percent_duplicated=percent_duplicated) + + with And("I export parts from the source table to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + for retry in retries(timeout=30, delay=1): + with retry: + with Then("I read data from both tables"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + + with And("They should be the same"): + assert source_data == destination_data, error() + + +@TestScenario +def packet_reordering(self, delay_ms, percent_reordered): + """Check that exports work correctly with packet reordering.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I apply packet reordering"): + network_packet_reordering(node=self.context.node, delay_ms=delay_ms, percent_reordered=percent_reordered) + + with And("I export parts from the source table to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + for retry in retries(timeout=30, delay=1): + with retry: + with Then("I read data from both tables"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + + with And("They should be the same"): + assert source_data == destination_data, error() + + +@TestScenario +def packet_rate_limit(self, rate_mbit): + """Check that exports work correctly with a packet rate limit.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I apply packet rate limit"): + network_packet_rate_limit(node=self.context.node, rate_mbit=rate_mbit) + + with And("I export parts from the source table to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + for
retry in retries(timeout=30, delay=1): + with retry: + with Then("I read data from both tables"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + + with And("They should be the same"): + assert source_data == destination_data, error() + + @TestFeature @Requirements(RQ_ClickHouse_ExportPart_Concurrency("1.0")) @Name("concurrency and networks") @@ -48,10 +294,10 @@ def feature(self): """Check that exports work correctly with concurrency and various network conditions.""" Scenario(test=basic_concurrent_export)(threads=5) - # Scenario(test=network_packet_delay)(delay_ms=100) - # Scenario(test=network_packet_loss)(percent_loss=50) - # Scenario(test=network_packet_loss_gemodel)(interruption_probability=10, recovery_probability=90) - # Scenario(test=network_packet_corruption)(percent_corrupt=20) - # Scenario(test=network_packet_duplication)(percent_duplicated=10) - # Scenario(test=network_packet_reordering)(delay_ms=100, percent_reordered=90) - # Scenario(test=network_packet_rate_limit)(rate_mbit=10) \ No newline at end of file + Scenario(test=packet_delay)(delay_ms=100) + Scenario(test=packet_loss)(percent_loss=50) + Scenario(test=packet_loss_gemodel)(interruption_probability=40, recovery_probability=70) + Scenario(test=packet_corruption)(percent_corrupt=50) + Scenario(test=packet_duplication)(percent_duplicated=50) # How do I make this fail? + Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) # And this? + Scenario(test=packet_rate_limit)(rate_mbit=0.05) # Am I using this right lol \ No newline at end of file diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index c95efe148..32dfaa5d2 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -13,7 +13,7 @@ @TestScenario def part_exports(self): - """Check part exports are properly tracked in system.events.""" + """Check part exports are properly tracked in system.part_log.""" with Given("I create a populated source table and empty S3 table"): partitioned_merge_tree_table( @@ -55,4 +55,4 @@ def part_exports(self): def feature(self): """Check system monitoring of export events.""" - Scenario(run=part_exports) + Scenario(run=part_exports) \ No newline at end of file From f52afb49183ce0658a719061c7e8f7053122e9f9 Mon Sep 17 00:00:00 2001 From: julian Date: Fri, 31 Oct 2025 15:17:08 -0400 Subject: [PATCH 49/99] Different partition key error test --- s3/tests/export_part/error_handling.py | 27 ++++++++++++++++++++++- s3/tests/export_part/feature.py | 4 ++-- s3/tests/export_part/system_monitoring.py | 12 ++++++---- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index 620369b0a..c117fdb0d 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -137,6 +137,7 @@ def local_table(self): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_Settings_AllowExperimental("1.0")) def disable_export_setting(self): """Check that exporting parts without the export setting set returns the correct error.""" @@ -163,7 +164,30 @@ def disable_export_setting(self): assert "Exporting merge tree part is experimental" in results[0].output, error() -# TODO different partition key +@TestScenario +def different_partition_key(self): + """Check exporting parts with a different partition key returns the 
correct error.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="i", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I try to export the parts"): + results = export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + exitcode=1, + ) + + with Then("I should see an error related to the different partition key"): + assert results[0].exitcode == 36, error() + assert "Tables have different partition key" in results[0].output, error() @TestFeature @@ -177,3 +201,4 @@ def feature(self): Scenario(run=same_table) Scenario(run=local_table) Scenario(run=disable_export_setting) + Scenario(run=different_partition_key) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 6754e09c4..c02215fdb 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -13,9 +13,9 @@ def minio(self, uri, bucket_prefix): self.context.bucket_prefix = bucket_prefix # Feature(run=load("s3.tests.export_part.sanity", "feature")) - # Feature(run=load("s3.tests.export_part.error_handling", "feature")) + Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) # Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) # Feature(run=load("s3.tests.export_part.engines", "feature")) # Feature(run=load("s3.tests.export_part.datatypes", "feature")) - Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) + # Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index 32dfaa5d2..6f67bef1a 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -4,11 +4,15 @@ from s3.requirements.export_part import * -# TODO checks on export events should go here -# partsexports incrementing correctly -# duplicates incrementing correctly +# TODO # part_log is where to look -# NOTE WIP +# overwrite file +# max bandwidth + # some of system.events stuff wont appear unless i set this maybe? just a guess +# system.events + # Export row in system.metrics?? 
+ # partsexports incrementing correctly + # duplicates incrementing correctly @TestScenario From f5acef1a58488450d19b9c7242841d83614edc9a Mon Sep 17 00:00:00 2001 From: julian Date: Fri, 31 Oct 2025 15:34:32 -0400 Subject: [PATCH 50/99] Reuse source matches destination step --- s3/tests/export_part/concurrency_networks.py | 109 +++++++++---------- s3/tests/export_part/engines.py | 4 +- s3/tests/export_part/error_handling.py | 9 +- s3/tests/export_part/feature.py | 2 +- s3/tests/export_part/sanity.py | 33 ++---- s3/tests/export_part/steps.py | 13 ++- s3/tests/export_part/system_monitoring.py | 10 +- 7 files changed, 85 insertions(+), 95 deletions(-) diff --git a/s3/tests/export_part/concurrency_networks.py b/s3/tests/export_part/concurrency_networks.py index dc641bb07..b4e8a97e2 100644 --- a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -67,20 +67,17 @@ def packet_delay(self, delay_ms): for retry in retries(timeout=30, delay=1): with retry: - with Then("I read data from both tables"): - source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered( - table_name=s3_table_name, node=self.context.node + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, ) - with And("They should be the same"): - assert source_data == destination_data, error() - @TestScenario def packet_loss(self, percent_loss): """Check that exports work correctly with packet loss.""" - + with Given("I create a populated source table and empty S3 table"): partitioned_merge_tree_table( table_name="source", @@ -102,15 +99,12 @@ def packet_loss(self, percent_loss): for retry in retries(timeout=30, delay=1): with retry: - with Then("I read data from both tables"): - source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered( - table_name=s3_table_name, node=self.context.node + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, ) - with And("They should be the same"): - assert source_data == destination_data, error() - @TestScenario def packet_loss_gemodel(self, interruption_probability, recovery_probability): @@ -126,7 +120,11 @@ def packet_loss_gemodel(self, interruption_probability, recovery_probability): s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) with When("I apply packet loss using the GE model"): - network_packet_loss_gemodel(node=self.context.node, interruption_probability=interruption_probability, recovery_probability=recovery_probability) + network_packet_loss_gemodel( + node=self.context.node, + interruption_probability=interruption_probability, + recovery_probability=recovery_probability, + ) with And("I export parts from the source table to the S3 table"): export_parts( @@ -137,15 +135,12 @@ def packet_loss_gemodel(self, interruption_probability, recovery_probability): for retry in retries(timeout=30, delay=1): with retry: - with Then("I read data from both tables"): - source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered( - table_name=s3_table_name, node=self.context.node + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, ) - with And("They should be the same"): - assert source_data == destination_data, 
error() - @TestScenario def packet_corruption(self, percent_corrupt): @@ -161,7 +156,9 @@ def packet_corruption(self, percent_corrupt): s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) with When("I apply packet corruption"): - network_packet_corruption(node=self.context.node, percent_corrupt=percent_corrupt) + network_packet_corruption( + node=self.context.node, percent_corrupt=percent_corrupt + ) with And("I export parts from the source table to the S3 table"): export_parts( @@ -172,15 +169,12 @@ def packet_corruption(self, percent_corrupt): for retry in retries(timeout=30, delay=1): with retry: - with Then("I read data from both tables"): - source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered( - table_name=s3_table_name, node=self.context.node + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, ) - with And("They should be the same"): - assert source_data == destination_data, error() - @TestScenario def packet_duplication(self, percent_duplicated): @@ -196,7 +190,9 @@ def packet_duplication(self, percent_duplicated): s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) with When("I apply packet duplication"): - network_packet_duplication(node=self.context.node, percent_duplicated=percent_duplicated) + network_packet_duplication( + node=self.context.node, percent_duplicated=percent_duplicated + ) with And("I export parts from the source table to the S3 table"): export_parts( @@ -207,15 +203,12 @@ def packet_duplication(self, percent_duplicated): for retry in retries(timeout=30, delay=1): with retry: - with Then("I read data from both tables"): - source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered( - table_name=s3_table_name, node=self.context.node + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, ) - with And("They should be the same"): - assert source_data == destination_data, error() - @TestScenario def packet_reordering(self, delay_ms, percent_reordered): @@ -231,7 +224,11 @@ def packet_reordering(self, delay_ms, percent_reordered): s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) with When("I apply packet reordering"): - network_packet_reordering(node=self.context.node, delay_ms=delay_ms, percent_reordered=percent_reordered) + network_packet_reordering( + node=self.context.node, + delay_ms=delay_ms, + percent_reordered=percent_reordered, + ) with And("I export parts from the source table to the S3 table"): export_parts( @@ -242,15 +239,12 @@ def packet_reordering(self, delay_ms, percent_reordered): for retry in retries(timeout=30, delay=1): with retry: - with Then("I read data from both tables"): - source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered( - table_name=s3_table_name, node=self.context.node + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, ) - with And("They should be the same"): - assert source_data == destination_data, error() - @TestScenario def packet_rate_limit(self, rate_mbit): @@ -277,15 +271,12 @@ def packet_rate_limit(self, rate_mbit): for retry in retries(timeout=30, delay=1): with retry: - with Then("I read data from both tables"): - 
source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered( - table_name=s3_table_name, node=self.context.node + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, ) - with And("They should be the same"): - assert source_data == destination_data, error() - @TestFeature @Requirements(RQ_ClickHouse_ExportPart_Concurrency("1.0")) @@ -296,8 +287,10 @@ def feature(self): Scenario(test=basic_concurrent_export)(threads=5) Scenario(test=packet_delay)(delay_ms=100) Scenario(test=packet_loss)(percent_loss=50) - Scenario(test=packet_loss_gemodel)(interruption_probability=40, recovery_probability=70) + Scenario(test=packet_loss_gemodel)( + interruption_probability=40, recovery_probability=70 + ) Scenario(test=packet_corruption)(percent_corrupt=50) - Scenario(test=packet_duplication)(percent_duplicated=50) # How do I make this fail? - Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) # And this? - Scenario(test=packet_rate_limit)(rate_mbit=0.05) # Am I using this right lol \ No newline at end of file + Scenario(test=packet_duplication)(percent_duplicated=50) # How do I make this fail? + Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) # And this? + Scenario(test=packet_rate_limit)(rate_mbit=0.05) # Am I using this right lol diff --git a/s3/tests/export_part/engines.py b/s3/tests/export_part/engines.py index ce175f7e9..43dea55e4 100644 --- a/s3/tests/export_part/engines.py +++ b/s3/tests/export_part/engines.py @@ -19,12 +19,12 @@ def configured_table(self, table_engine, number_of_partitions, number_of_parts): stop_merges=True, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts, - columns=default_columns(simple=False), + columns=default_columns(simple=False, partition_key_type="Int8"), ) s3_table_name = create_s3_table( table_name="s3", create_new_bucket=True, - columns=default_columns(simple=False), + columns=default_columns(simple=False, partition_key_type="Int8"), ) with When("I export parts to the S3 table"): diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index c117fdb0d..09cc4f421 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -63,12 +63,11 @@ def duplicate_exports(self): node=self.context.node, ) - with Then("The source and destination tables should still be the same"): - source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered( - table_name=s3_table_name, node=self.context.node + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, ) - assert source_data == destination_data, error() @TestScenario diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index c02215fdb..c6e4d906c 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -13,7 +13,7 @@ def minio(self, uri, bucket_prefix): self.context.bucket_prefix = bucket_prefix # Feature(run=load("s3.tests.export_part.sanity", "feature")) - Feature(run=load("s3.tests.export_part.error_handling", "feature")) + # Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) # Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) # 
Feature(run=load("s3.tests.export_part.engines", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index fef421272..11e900572 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -107,15 +107,12 @@ def basic_table(self): node=self.context.node, ) - with And("I read data from both tables"): - source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered( - table_name=s3_table_name, node=self.context.node + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, ) - with Then("They should be the same"): - assert source_data == destination_data, error() - @TestScenario def empty_table(self): @@ -174,15 +171,12 @@ def no_partition_by(self): node=self.context.node, ) - with And("I read data from both tables"): - source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered( - table_name=s3_table_name, node=self.context.node + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, ) - with Then("They should be the same"): - assert source_data == destination_data, error() - @TestScenario @Requirements(RQ_ClickHouse_ExportPart_PartTypes("1.0")) @@ -200,15 +194,12 @@ def wide_and_compact_parts(self): node=self.context.node, ) - with And("I read data from both tables"): - source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered( - table_name=s3_table_name, node=self.context.node + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, ) - with Then("They should be the same"): - assert source_data == destination_data, error() - @TestFeature @Name("sanity") diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index f4878353a..8df8b65d3 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -168,10 +168,17 @@ def get_export_events(self, node): @TestStep(Then) def source_matches_destination( - self, source_table, destination_table, source_node, destination_node + self, source_table, destination_table, source_node=None, destination_node=None ): """Check that source and destination table data matches.""" - source_data = select_all_ordered(source_table, source_node) - destination_data = select_all_ordered(destination_table, destination_node) + if source_node is None: + source_node = self.context.node + if destination_node is None: + destination_node = self.context.node + + source_data = select_all_ordered(table_name=source_table, node=source_node) + destination_data = select_all_ordered( + table_name=destination_table, node=destination_node + ) assert source_data == destination_data, error() diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index 6f67bef1a..cf148ac10 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -8,11 +8,11 @@ # part_log is where to look # overwrite file # max bandwidth - # some of system.events stuff wont appear unless i set this maybe? just a guess +# some of system.events stuff wont appear unless i set this maybe? just a guess # system.events - # Export row in system.metrics?? 
- # partsexports incrementing correctly - # duplicates incrementing correctly +# Export row in system.metrics?? +# partsexports incrementing correctly +# duplicates incrementing correctly @TestScenario @@ -59,4 +59,4 @@ def part_exports(self): def feature(self): """Check system monitoring of export events.""" - Scenario(run=part_exports) \ No newline at end of file + Scenario(run=part_exports) From f4b428d2db7ceb1de7615c6c993730e8e9166db8 Mon Sep 17 00:00:00 2001 From: julian Date: Fri, 31 Oct 2025 17:12:54 -0400 Subject: [PATCH 51/99] System monitoring time woohoo --- s3/tests/export_part/clusters_nodes.py | 2 -- s3/tests/export_part/error_handling.py | 33 ------------------ s3/tests/export_part/feature.py | 2 +- s3/tests/export_part/sanity.py | 4 ++- s3/tests/export_part/steps.py | 12 ++++--- s3/tests/export_part/system_monitoring.py | 41 ++++++++++++++++++++++- 6 files changed, 52 insertions(+), 42 deletions(-) diff --git a/s3/tests/export_part/clusters_nodes.py b/s3/tests/export_part/clusters_nodes.py index 5905f0392..b7e280e93 100644 --- a/s3/tests/export_part/clusters_nodes.py +++ b/s3/tests/export_part/clusters_nodes.py @@ -1,5 +1,3 @@ -import random - from itertools import combinations from testflows.core import * from testflows.asserts import error diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index 09cc4f421..e40221bac 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -38,38 +38,6 @@ def invalid_part_name(self): ), error() -@TestScenario -def duplicate_exports(self): - """Check duplicate exports are ignored and not exported again.""" - - with Given("I create a populated source table and empty S3 table"): - partitioned_merge_tree_table( - table_name="source", - partition_by="p", - columns=default_columns(), - stop_merges=True, - ) - s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) - - with When("I try to export the parts twice"): - export_parts( - source_table="source", - destination_table=s3_table_name, - node=self.context.node, - ) - export_parts( - source_table="source", - destination_table=s3_table_name, - node=self.context.node, - ) - - with Then("Check source matches destination"): - source_matches_destination( - source_table="source", - destination_table=s3_table_name, - ) - - @TestScenario @Requirements(RQ_ClickHouse_ExportPart_Restrictions_SameTable("1.0")) def same_table(self): @@ -196,7 +164,6 @@ def feature(self): """Check correct error handling when exporting parts.""" Scenario(run=invalid_part_name) - Scenario(run=duplicate_exports) Scenario(run=same_table) Scenario(run=local_table) Scenario(run=disable_export_setting) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index c6e4d906c..a156547e0 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -14,7 +14,7 @@ def minio(self, uri, bucket_prefix): # Feature(run=load("s3.tests.export_part.sanity", "feature")) # Feature(run=load("s3.tests.export_part.error_handling", "feature")) - # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) + Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) # Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) # Feature(run=load("s3.tests.export_part.engines", "feature")) # Feature(run=load("s3.tests.export_part.datatypes", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 11e900572..47d2c1de0 100644 --- 
a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -4,7 +4,9 @@ from helpers.create import * from helpers.queries import * from s3.requirements.export_part import * -from alter.table.replace_partition.partition_types import * +from alter.table.replace_partition.partition_types import ( + table_with_compact_and_wide_parts, +) @TestScenario diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 8df8b65d3..fd0875c7f 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -4,6 +4,7 @@ from helpers.create import * from helpers.queries import * from s3.tests.common import temporary_bucket_path +import json def default_columns(simple=True, partition_key_type="UInt8"): @@ -152,7 +153,6 @@ def export_parts( return output -# TODO find the simplest way to parse the output @TestStep(When) def get_export_events(self, node): """Get the export data from the system.events table of a given node.""" @@ -161,9 +161,13 @@ def get_export_events(self, node): "SELECT name, value FROM system.events WHERE name LIKE '%%Export%%' FORMAT JSONEachRow", exitcode=0, ).output - # return {row.name: int(row.value) for row in json.loads(output)} - # return [json.loads(row) for row in output.splitlines()] - return output + + events = {} + for line in output.strip().splitlines(): + event = json.loads(line) + events[event["name"]] = int(event["value"]) + + return events @TestStep(Then) diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index cf148ac10..51fdce972 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -53,10 +53,49 @@ def part_exports(self): # assert final_exports - initial_exports == num_parts, error() +@TestScenario +def duplicate_exports(self): + """Check duplicate exports are ignored and not exported again.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I try to export the parts twice"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + # with And("I read the initial export events"): + + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + with And("Check logs for duplicate exports"): + export_events = get_export_events(node=self.context.node) + note(export_events["PartsExports"]) + + @TestFeature @Name("system monitoring") @Requirements(RQ_ClickHouse_ExportPart_Logging("1.0")) def feature(self): """Check system monitoring of export events.""" - Scenario(run=part_exports) + # Scenario(run=part_exports) + Scenario(run=duplicate_exports) From 0c14e38cea1d7eeb26540977af040866959b800b Mon Sep 17 00:00:00 2001 From: julian Date: Mon, 3 Nov 2025 09:18:41 -0500 Subject: [PATCH 52/99] Datatypes change --- s3/tests/export_part/datatypes.py | 21 +++++++++++++++++---- s3/tests/export_part/feature.py | 4 ++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/s3/tests/export_part/datatypes.py b/s3/tests/export_part/datatypes.py index 626f1f895..6ff80dc07 100644 --- a/s3/tests/export_part/datatypes.py +++ b/s3/tests/export_part/datatypes.py @@ 
-6,6 +6,19 @@ from s3.requirements.export_part import * +@TestStep(When) +def insert_all_datatypes(self, table_name, rows=1, num_parts=1, node=None): + """Insert all datatypes into a MergeTree table.""" + + if node is None: + node = self.context.node + + for part in range(num_parts): + node.query( + f"INSERT INTO {table_name} (int8, int16, int32, int64, uint8, uint16, uint32, uint64, date, date32, datetime, datetime64, string, fixedstring) SELECT 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, '13', '14' FROM numbers({rows})" + ) + + @TestStep(Given) def create_merge_tree_all_valid_partition_key_types( self, column_name, cluster=None, node=None, rows=1 @@ -27,9 +40,7 @@ def create_merge_tree_all_valid_partition_key_types( ) with And("I insert compact and wide parts into the table"): - node.query( - f"INSERT INTO {table_name} (int8, int16, int32, int64, uint8, uint16, uint32, uint64, date, date32, datetime, datetime64, string, fixedstring) SELECT 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, '13', '14' FROM numbers({rows})" - ) + insert_all_datatypes(table_name=table_name, rows=rows, num_parts=self.context.num_parts, node=node) return table_name @@ -98,8 +109,10 @@ def valid_partition_key_types_wide(self): RQ_ClickHouse_ExportPart_PartitionKeyTypes("1.0"), RQ_ClickHouse_ExportPart_PartTypes("1.0"), ) -def feature(self): +def feature(self, num_parts=10): """Check that all data types are supported when exporting parts.""" + self.context.num_parts = num_parts + Scenario(run=valid_partition_key_types_compact) Scenario(run=valid_partition_key_types_wide) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index a156547e0..515fcfc52 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -14,8 +14,8 @@ def minio(self, uri, bucket_prefix): # Feature(run=load("s3.tests.export_part.sanity", "feature")) # Feature(run=load("s3.tests.export_part.error_handling", "feature")) - Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) + # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) # Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) # Feature(run=load("s3.tests.export_part.engines", "feature")) - # Feature(run=load("s3.tests.export_part.datatypes", "feature")) + Feature(run=load("s3.tests.export_part.datatypes", "feature")) # Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) From 9b8b6ecdad452e8c0cbb051032e1772ce9680017 Mon Sep 17 00:00:00 2001 From: julian Date: Mon, 3 Nov 2025 09:38:26 -0500 Subject: [PATCH 53/99] Uncomment tests --- s3/tests/export_part/feature.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 515fcfc52..dd086c48a 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -12,10 +12,10 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - # Feature(run=load("s3.tests.export_part.sanity", "feature")) - # Feature(run=load("s3.tests.export_part.error_handling", "feature")) + Feature(run=load("s3.tests.export_part.sanity", "feature")) + Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - # Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) - # Feature(run=load("s3.tests.export_part.engines", "feature")) + Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) + 
Feature(run=load("s3.tests.export_part.engines", "feature")) Feature(run=load("s3.tests.export_part.datatypes", "feature")) - # Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) + Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) From a956d91def068907a634c00612bf4a87e4d13cf7 Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 3 Nov 2025 09:57:29 -0500 Subject: [PATCH 54/99] Starting to link export part tests with requirements. --- s3/tests/export_part/datatypes.py | 11 ++++++++--- s3/tests/export_part/feature.py | 10 +++++++--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/s3/tests/export_part/datatypes.py b/s3/tests/export_part/datatypes.py index 6ff80dc07..ddc68f467 100644 --- a/s3/tests/export_part/datatypes.py +++ b/s3/tests/export_part/datatypes.py @@ -15,8 +15,8 @@ def insert_all_datatypes(self, table_name, rows=1, num_parts=1, node=None): for part in range(num_parts): node.query( - f"INSERT INTO {table_name} (int8, int16, int32, int64, uint8, uint16, uint32, uint64, date, date32, datetime, datetime64, string, fixedstring) SELECT 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, '13', '14' FROM numbers({rows})" - ) + f"INSERT INTO {table_name} (int8, int16, int32, int64, uint8, uint16, uint32, uint64, date, date32, datetime, datetime64, string, fixedstring) SELECT 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, '13', '14' FROM numbers({rows})" + ) @TestStep(Given) @@ -40,7 +40,12 @@ def create_merge_tree_all_valid_partition_key_types( ) with And("I insert compact and wide parts into the table"): - insert_all_datatypes(table_name=table_name, rows=rows, num_parts=self.context.num_parts, node=node) + insert_all_datatypes( + table_name=table_name, + rows=rows, + num_parts=self.context.num_parts, + node=node, + ) return table_name diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index dd086c48a..db07f02df 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -1,10 +1,14 @@ from testflows.core import * - - -# TODO large data export? which file should it go in? +from s3.requirements.export_part import * @TestFeature +@Specifications( + SRS_015_ClickHouse_Export_Part_to_S3, +) +@Requirements( + RQ_ClickHouse_ExportPart_S3("1.0"), +) @Name("export parts") def minio(self, uri, bucket_prefix): """Run features from the export parts suite.""" From 6ec3d9cbaf95fcde2cdb88c92381617322efeaff Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 3 Nov 2025 10:26:32 -0500 Subject: [PATCH 55/99] Expanding engines tables combo test. 
--- s3/tests/export_part/engines.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/s3/tests/export_part/engines.py b/s3/tests/export_part/engines.py index 43dea55e4..e10c313f5 100644 --- a/s3/tests/export_part/engines.py +++ b/s3/tests/export_part/engines.py @@ -60,13 +60,20 @@ def table_combos(self): partitioned_graphite_merge_tree_table, ] # TODO expand combos - number_of_partitions = [5] - number_of_parts = [1] + number_of_partitions = [5] if not self.context.stress else [5, 10, 20] + number_of_parts = [1] if not self.context.stress else [1, 2, 5] - configured_table( - table_engine=either(*tables), - number_of_partitions=either(*number_of_partitions), - number_of_parts=either(*number_of_parts), + table_engine = either(*tables) + number_of_partitions = either(*number_of_partitions) + number_of_parts = either(*number_of_parts) + + Combination( + name=f"{table_engine.__name__} partitions={number_of_partitions} parts={number_of_parts}", + test=configured_table, + )( + table_engine=table_engine, + number_of_partitions=number_of_partitions, + number_of_parts=number_of_parts, ) From 1d04e45bfa96a06ed95a2bb915c9608c9ee26744 Mon Sep 17 00:00:00 2001 From: julian Date: Mon, 3 Nov 2025 11:29:21 -0500 Subject: [PATCH 56/99] Verbose query output --- s3/tests/export_part/concurrency_networks.py | 6 +++--- s3/tests/export_part/steps.py | 8 +++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/s3/tests/export_part/concurrency_networks.py b/s3/tests/export_part/concurrency_networks.py index b4e8a97e2..e2474709c 100644 --- a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -291,6 +291,6 @@ def feature(self): interruption_probability=40, recovery_probability=70 ) Scenario(test=packet_corruption)(percent_corrupt=50) - Scenario(test=packet_duplication)(percent_duplicated=50) # How do I make this fail? - Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) # And this? 
- Scenario(test=packet_rate_limit)(rate_mbit=0.05) # Am I using this right lol + Scenario(test=packet_duplication)(percent_duplicated=50) + Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) + Scenario(test=packet_rate_limit)(rate_mbit=0.05) diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index fd0875c7f..d1dab421a 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -99,7 +99,9 @@ def get_parts(self, table_name, node): """Get all parts for a table on a given node.""" output = node.query( - f"SELECT name FROM system.parts WHERE table = '{table_name}'", exitcode=0 + f"SELECT name FROM system.parts WHERE table = '{table_name}'", + exitcode=0, + steps=True, ).output return [row.strip() for row in output.splitlines()] @@ -130,6 +132,7 @@ def export_parts( f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", exitcode=exitcode, no_checks=no_checks, + steps=True, ) ) elif explicit_set == 0: @@ -139,6 +142,7 @@ def export_parts( settings=[("allow_experimental_export_merge_tree_part", 1)], exitcode=exitcode, no_checks=no_checks, + steps=True, ) ) elif explicit_set == -1: @@ -147,6 +151,7 @@ def export_parts( f"SET allow_experimental_export_merge_tree_part = 0; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", exitcode=exitcode, no_checks=no_checks, + steps=True, ) ) @@ -160,6 +165,7 @@ def get_export_events(self, node): output = node.query( "SELECT name, value FROM system.events WHERE name LIKE '%%Export%%' FORMAT JSONEachRow", exitcode=0, + steps=True, ).output events = {} From 8acaaf6d16c2422745290b5b19e3b567fb8e4918 Mon Sep 17 00:00:00 2001 From: julian Date: Mon, 3 Nov 2025 11:50:32 -0500 Subject: [PATCH 57/99] Large export --- s3/tests/export_part/engines.py | 4 ++-- s3/tests/export_part/sanity.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/s3/tests/export_part/engines.py b/s3/tests/export_part/engines.py index e10c313f5..6e5d712c7 100644 --- a/s3/tests/export_part/engines.py +++ b/s3/tests/export_part/engines.py @@ -60,8 +60,8 @@ def table_combos(self): partitioned_graphite_merge_tree_table, ] # TODO expand combos - number_of_partitions = [5] if not self.context.stress else [5, 10, 20] - number_of_parts = [1] if not self.context.stress else [1, 2, 5] + number_of_partitions = [5] if not self.context.stress else [1, 5, 10] + number_of_parts = [1] if not self.context.stress else [1, 5, 10] table_engine = either(*tables) number_of_partitions = either(*number_of_partitions) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 47d2c1de0..15777e6d7 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -203,6 +203,34 @@ def wide_and_compact_parts(self): ) +@TestScenario +def large_export(self): + """Test exporting a large part.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + number_of_parts=100, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I export parts to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + 
destination_table=s3_table_name, + ) + + @TestFeature @Name("sanity") def feature(self): @@ -213,4 +241,6 @@ def feature(self): Scenario(run=no_partition_by) Scenario(run=mismatched_columns) Scenario(run=wide_and_compact_parts) + if self.context.stress: + Scenario(run=large_export) # Scenario(run=export_setting) # This test fails because of an actual bug in the export setting From ef6dc9353bdc7f1af7192ff2fed1892d443bcc7d Mon Sep 17 00:00:00 2001 From: julian Date: Mon, 3 Nov 2025 17:49:49 -0500 Subject: [PATCH 58/99] Trying to restart minio --- s3/tests/export_part/concurrency_networks.py | 147 +++++++++++++++++++ s3/tests/export_part/datatypes.py | 16 +- s3/tests/export_part/engines.py | 1 - s3/tests/export_part/steps.py | 9 ++ 4 files changed, 164 insertions(+), 9 deletions(-) diff --git a/s3/tests/export_part/concurrency_networks.py b/s3/tests/export_part/concurrency_networks.py index e2474709c..793b856e3 100644 --- a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -278,6 +278,149 @@ def packet_rate_limit(self, rate_mbit): ) +@TestScenario +def concurrent_insert(self): + """Check that exports work correctly with concurrent inserts of source data.""" + + with Given("I create an empty source and S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + populate=False, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When( + "I insert data and export it in parallel", + description=""" + 5 partitions with 1 part each are inserted. + The export is queued in parallel and usually behaves by exporting + a snapshot of the source data, often getting just the first partition + which means the export happens right after the first INSERT query completes. 
+ """, + ): + Step(test=create_partitions_with_random_uint64, parallel=True)( + table_name="source", + number_of_partitions=5, + number_of_parts=1, + ) + Step(test=export_parts, parallel=True)( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + join() + + with Then("Destination data should be a subset of source data"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + assert set(source_data) >= set(destination_data), error() + + with And("Inserts should have completed successfully"): + assert len(source_data) == 15, error() + + +@TestScenario +def export_and_drop(self): + """Check that dropping a column immediately after export works correctly.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + # s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + # drop_column( + # node=self.context.node, + # table_name="source", + # column_name="i", + # ) + + # with When("I export data then drop a column"): + # export_parts( + # source_table="source", + # destination_table=s3_table_name, + # node=self.context.node, + # ) + # drop_column( + # node=self.context.node, + # table_name="source", + # column_name="i", + # ) + # This drop freezes the test ☠️☠️☠️ + + +@TestStep(When) +def kill_minio(self, cluster=None, container_name="minio1", signal="KILL"): + """Forcefully kill MinIO container to simulate network crash.""" + + if cluster is None: + cluster = self.context.cluster + + retry(cluster.command, 5)( + None, + f"docker kill --signal={signal} {container_name}", + timeout=60, + exitcode=0, + steps=False, + ) + + +@TestStep(When) +def restart_minio(self, cluster=None, container_name="minio1", timeout=300): + """Restart MinIO container after it was killed.""" + + if cluster is None: + cluster = self.context.cluster + + retry(cluster.command, 5)( + None, + f"docker start {container_name}", + timeout=timeout, + exitcode=0, + steps=False, + ) + + +@TestScenario +def kill_and_restart_minio( + self, cluster=None, container_name="s3_env-minio1-1", signal="KILL", timeout=300 +): + """Check that restarting ClickHouse after exporting data works correctly.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I export data and restart MinIO in parallel"): + Step(test=export_parts, parallel=True)( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + Step(test=kill_minio, parallel=True)( + cluster=cluster, container_name=container_name, signal=signal + ) + join() + restart_minio(cluster=cluster, container_name=container_name, timeout=timeout) + + with Then("The export should complete successfully"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + @TestFeature @Requirements(RQ_ClickHouse_ExportPart_Concurrency("1.0")) @Name("concurrency and networks") @@ -294,3 +437,7 @@ def feature(self): Scenario(test=packet_duplication)(percent_duplicated=50) Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) Scenario(test=packet_rate_limit)(rate_mbit=0.05) + 
Scenario(run=concurrent_insert) + + # Scenario(run=export_and_drop) + # Scenario(run=kill_and_restart_minio) diff --git a/s3/tests/export_part/datatypes.py b/s3/tests/export_part/datatypes.py index ddc68f467..f1ee4584b 100644 --- a/s3/tests/export_part/datatypes.py +++ b/s3/tests/export_part/datatypes.py @@ -7,7 +7,7 @@ @TestStep(When) -def insert_all_datatypes(self, table_name, rows=1, num_parts=1, node=None): +def insert_all_datatypes(self, table_name, rows_per_part=1, num_parts=1, node=None): """Insert all datatypes into a MergeTree table.""" if node is None: @@ -15,13 +15,13 @@ def insert_all_datatypes(self, table_name, rows=1, num_parts=1, node=None): for part in range(num_parts): node.query( - f"INSERT INTO {table_name} (int8, int16, int32, int64, uint8, uint16, uint32, uint64, date, date32, datetime, datetime64, string, fixedstring) SELECT 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, '13', '14' FROM numbers({rows})" + f"INSERT INTO {table_name} (int8, int16, int32, int64, uint8, uint16, uint32, uint64, date, date32, datetime, datetime64, string, fixedstring) SELECT 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, '13', '14' FROM numbers({rows_per_part})" ) @TestStep(Given) def create_merge_tree_all_valid_partition_key_types( - self, column_name, cluster=None, node=None, rows=1 + self, column_name, cluster=None, node=None, rows_per_part=1 ): """Create a MergeTree table with all valid partition key types and both wide and compact parts.""" @@ -42,7 +42,7 @@ def create_merge_tree_all_valid_partition_key_types( with And("I insert compact and wide parts into the table"): insert_all_datatypes( table_name=table_name, - rows=rows, + rows_per_part=rows_per_part, num_parts=self.context.num_parts, node=node, ) @@ -51,7 +51,7 @@ def create_merge_tree_all_valid_partition_key_types( @TestCheck -def valid_partition_key_table(self, partition_key_type, rows=1): +def valid_partition_key_table(self, partition_key_type, rows_per_part=1): """Check exporting to a source table with specified valid partition key type and rows.""" with Given( @@ -59,7 +59,7 @@ def valid_partition_key_table(self, partition_key_type, rows=1): ): table_name = create_merge_tree_all_valid_partition_key_types( column_name=partition_key_type, - rows=rows, + rows_per_part=rows_per_part, ) s3_table_name = create_s3_table( table_name="s3", @@ -96,7 +96,7 @@ def valid_partition_key_types_compact(self): """Check that all partition key data types are supported when exporting compact parts.""" key_types = [datatype["name"] for datatype in valid_partition_key_types_columns()] - valid_partition_key_table(partition_key_type=either(*key_types), rows=1) + valid_partition_key_table(partition_key_type=either(*key_types), rows_per_part=1) @TestSketch(Scenario) @@ -105,7 +105,7 @@ def valid_partition_key_types_wide(self): """Check that all partition key data types are supported when exporting wide parts.""" key_types = [datatype["name"] for datatype in valid_partition_key_types_columns()] - valid_partition_key_table(partition_key_type=either(*key_types), rows=100) + valid_partition_key_table(partition_key_type=either(*key_types), rows_per_part=100) @TestFeature diff --git a/s3/tests/export_part/engines.py b/s3/tests/export_part/engines.py index 6e5d712c7..b6a0c5825 100644 --- a/s3/tests/export_part/engines.py +++ b/s3/tests/export_part/engines.py @@ -59,7 +59,6 @@ def table_combos(self): partitioned_aggregating_merge_tree_table, partitioned_graphite_merge_tree_table, ] - # TODO expand combos number_of_partitions = [5] if not self.context.stress else [1, 5, 10] 
number_of_parts = [1] if not self.context.stress else [1, 5, 10] diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index d1dab421a..2e3abb8b2 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -176,6 +176,15 @@ def get_export_events(self, node): return events +@TestStep(When) +def drop_column(self, node, table_name, column_name): + """Drop a column from a table.""" + + node.query( + f"ALTER TABLE {table_name} DROP COLUMN {column_name}", exitcode=0, steps=True + ) + + @TestStep(Then) def source_matches_destination( self, source_table, destination_table, source_node=None, destination_node=None From 9f41cfe7a35ba406cc6edfa03ad6048a36805b1a Mon Sep 17 00:00:00 2001 From: julian Date: Tue, 4 Nov 2025 13:27:26 -0500 Subject: [PATCH 59/99] Disks, volumes, storage_policies --- helpers/create.py | 2 + s3/configs/clickhouse/config.d/storage.xml | 82 +++++++++++++++++++ s3/s3_env/clickhouse-service.yml | 1 + s3/tests/export_part/concurrency_networks.py | 32 ++++---- .../{engines.py => engines_volumes.py} | 76 +++++++++++++++-- s3/tests/export_part/feature.py | 12 +-- 6 files changed, 174 insertions(+), 31 deletions(-) create mode 100644 s3/configs/clickhouse/config.d/storage.xml rename s3/tests/export_part/{engines.py => engines_volumes.py} (55%) diff --git a/helpers/create.py b/helpers/create.py index 4962a8859..d07d36937 100644 --- a/helpers/create.py +++ b/helpers/create.py @@ -408,6 +408,7 @@ def partitioned_merge_tree_table( populate=True, number_of_partitions=5, number_of_parts=1, + query_settings=None, ): """Create a MergeTree table partitioned by a specific column.""" with By(f"creating a partitioned {table_name} table with a MergeTree engine"): @@ -417,6 +418,7 @@ def partitioned_merge_tree_table( partition_by=partition_by, cluster=cluster, stop_merges=stop_merges, + query_settings=query_settings, ) if populate: diff --git a/s3/configs/clickhouse/config.d/storage.xml b/s3/configs/clickhouse/config.d/storage.xml new file mode 100644 index 000000000..822eb90a7 --- /dev/null +++ b/s3/configs/clickhouse/config.d/storage.xml @@ -0,0 +1,82 @@ + + + + + /jbod1/ + + + /jbod2/ + + + /jbod3/ + + + /jbod4/ + + + /external/ + + + /external2/ + + + + + +
+        <policies>
+            <jbod1>
+                <volumes>
+                    <main>
+                        <disk>jbod1</disk>
+                    </main>
+                </volumes>
+            </jbod1>
+            <jbod2>
+                <volumes>
+                    <main>
+                        <disk>jbod2</disk>
+                    </main>
+                </volumes>
+            </jbod2>
+            <jbod3>
+                <volumes>
+                    <main>
+                        <disk>jbod3</disk>
+                    </main>
+                </volumes>
+            </jbod3>
+            <jbod4>
+                <volumes>
+                    <main>
+                        <disk>jbod4</disk>
+                    </main>
+                </volumes>
+            </jbod4>
+            <external>
+                <volumes>
+                    <main>
+                        <disk>external</disk>
+                    </main>
+                </volumes>
+            </external>
+            <external2>
+                <volumes>
+                    <main>
+                        <disk>external2</disk>
+                    </main>
+                </volumes>
+            </external2>
+            <tiered_storage>
+                <volumes>
+                    <main>
+                        <disk>jbod1</disk>
+                        <disk>jbod2</disk>
+                        <max_data_part_size_bytes>2048</max_data_part_size_bytes>
+                    </main>
+                    <external>
+                        <disk>external</disk>
+                        <disk>external2</disk>
+                    </external>
+                </volumes>
+                <move_factor>0.7</move_factor>
+            </tiered_storage>
+        </policies>
+    </storage_configuration>
+</clickhouse>
\ No newline at end of file diff --git a/s3/s3_env/clickhouse-service.yml b/s3/s3_env/clickhouse-service.yml index c766d2085..0745cc3e1 100755 --- a/s3/s3_env/clickhouse-service.yml +++ b/s3/s3_env/clickhouse-service.yml @@ -21,6 +21,7 @@ services: - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/config.d/ssl.xml:/etc/clickhouse-server/config.d/ssl.xml" - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/config.d/system_unfreeze.xml:/etc/clickhouse-server/config.d/system_unfreeze.xml" - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/ssl:/etc/clickhouse-server/ssl" + - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/config.d/storage.xml:/etc/clickhouse-server/config.d/storage.xml" - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/users.d/output_format_json_quote_64bit_integers.xml:/etc/clickhouse-server/users.d/output_format_json_quote_64bit_integers.xml" cap_add: - NET_ADMIN \ No newline at end of file diff --git a/s3/tests/export_part/concurrency_networks.py b/s3/tests/export_part/concurrency_networks.py index 793b856e3..cef12c43f 100644 --- a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -327,7 +327,7 @@ def concurrent_insert(self): @TestScenario def export_and_drop(self): """Check that dropping a column immediately after export works correctly.""" - + pause() with Given("I create a populated source table and empty S3 table"): partitioned_merge_tree_table( table_name="source", @@ -373,7 +373,7 @@ def kill_minio(self, cluster=None, container_name="minio1", signal="KILL"): @TestStep(When) -def restart_minio(self, cluster=None, container_name="minio1", timeout=300): +def restart_minio(self, cluster=None, container_name="s3_env-minio1-1", timeout=300): """Restart MinIO container after it was killed.""" if cluster is None: @@ -384,7 +384,7 @@ def restart_minio(self, cluster=None, container_name="minio1", timeout=300): f"docker start {container_name}", timeout=timeout, exitcode=0, - steps=False, + steps=True, ) @@ -427,17 +427,17 @@ def kill_and_restart_minio( def feature(self): """Check that exports work correctly with concurrency and various network conditions.""" - Scenario(test=basic_concurrent_export)(threads=5) - Scenario(test=packet_delay)(delay_ms=100) - Scenario(test=packet_loss)(percent_loss=50) - Scenario(test=packet_loss_gemodel)( - interruption_probability=40, recovery_probability=70 - ) - Scenario(test=packet_corruption)(percent_corrupt=50) - Scenario(test=packet_duplication)(percent_duplicated=50) - Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) - Scenario(test=packet_rate_limit)(rate_mbit=0.05) - Scenario(run=concurrent_insert) - - # Scenario(run=export_and_drop) + # Scenario(test=basic_concurrent_export)(threads=5) + # Scenario(test=packet_delay)(delay_ms=100) + # Scenario(test=packet_loss)(percent_loss=50) + # Scenario(test=packet_loss_gemodel)( + # interruption_probability=40, recovery_probability=70 + # ) + # Scenario(test=packet_corruption)(percent_corrupt=50) + # Scenario(test=packet_duplication)(percent_duplicated=50) + # Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) + # Scenario(test=packet_rate_limit)(rate_mbit=0.05) + # Scenario(run=concurrent_insert) + + Scenario(run=export_and_drop) # Scenario(run=kill_and_restart_minio) diff --git a/s3/tests/export_part/engines.py b/s3/tests/export_part/engines_volumes.py similarity index 55% rename from s3/tests/export_part/engines.py rename to s3/tests/export_part/engines_volumes.py index b6a0c5825..04e298c7b 100644 --- a/s3/tests/export_part/engines.py 
+++ b/s3/tests/export_part/engines_volumes.py @@ -34,15 +34,12 @@ def configured_table(self, table_engine, number_of_partitions, number_of_parts): node=self.context.node, ) - with And("I read data from both tables"): - source_data = select_all_ordered(table_name="source", node=self.context.node) - destination_data = select_all_ordered( - table_name=s3_table_name, node=self.context.node + with Then("Source and destination tables should match"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, ) - with Then("They should be the same"): - assert source_data == destination_data, error() - @TestSketch(Scenario) @Flags(TE) @@ -76,9 +73,70 @@ def table_combos(self): ) +@TestCheck +def configured_volume(self, volume): + """Test a specific combination of volume.""" + + with Given(f"I create an empty source table on volume {volume} and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + query_settings=f"storage_policy = '{volume}'", + populate=False, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with And("I populate the source table with parts exceeding 2KB each"): + create_partitions_with_random_uint64( + table_name="source", + node=self.context.node, + number_of_values=500, + ) + + with When("I export parts to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + with Then("Source and destination tables should match"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + +@TestSketch(Scenario) +@Flags(TE) +def volume_combos(self): + """Test exporting to various storage policies.""" + + volumes = [ + "jbod1", + "jbod2", + "jbod3", + "jbod4", + "external", + "external2", + "tiered_storage", + ] + volume = either(*volumes) + + Combination( + name=f"volume={volume}", + test=configured_volume, + )( + volume=volume, + ) + + @TestFeature -@Name("engines") +@Name("engines and volumes") def feature(self): - """Check exporting parts to S3 storage with different table engines.""" + """Check exporting parts to S3 storage with different table engines and volumes.""" Scenario(run=table_combos) + Scenario(run=volume_combos) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index db07f02df..736bef797 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -16,10 +16,10 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - Feature(run=load("s3.tests.export_part.sanity", "feature")) - Feature(run=load("s3.tests.export_part.error_handling", "feature")) + # Feature(run=load("s3.tests.export_part.sanity", "feature")) + # Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) - Feature(run=load("s3.tests.export_part.engines", "feature")) - Feature(run=load("s3.tests.export_part.datatypes", "feature")) - Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) + # Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) + Feature(run=load("s3.tests.export_part.engines_volumes", "feature")) + # Feature(run=load("s3.tests.export_part.datatypes", "feature")) + # Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) From 
ce9b2591a2526b4052fc5f3392a49d93a7b65748 Mon Sep 17 00:00:00 2001 From: julian Date: Tue, 4 Nov 2025 15:13:17 -0500 Subject: [PATCH 60/99] minio kill, export, minio start --- s3/tests/export_part/concurrency_networks.py | 71 ++++++++++++-------- s3/tests/export_part/feature.py | 4 +- 2 files changed, 44 insertions(+), 31 deletions(-) diff --git a/s3/tests/export_part/concurrency_networks.py b/s3/tests/export_part/concurrency_networks.py index cef12c43f..42cfd8bef 100644 --- a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -357,12 +357,12 @@ def export_and_drop(self): @TestStep(When) -def kill_minio(self, cluster=None, container_name="minio1", signal="KILL"): +def kill_minio(self, cluster=None, container_name="s3_env-minio1-1", signal="KILL"): """Forcefully kill MinIO container to simulate network crash.""" - + if cluster is None: cluster = self.context.cluster - + retry(cluster.command, 5)( None, f"docker kill --signal={signal} {container_name}", @@ -373,26 +373,38 @@ def kill_minio(self, cluster=None, container_name="minio1", signal="KILL"): @TestStep(When) -def restart_minio(self, cluster=None, container_name="s3_env-minio1-1", timeout=300): - """Restart MinIO container after it was killed.""" - +def start_minio(self, cluster=None, container_name="s3_env-minio1-1", timeout=300): + """Start MinIO container and wait for it to be ready.""" + if cluster is None: cluster = self.context.cluster - - retry(cluster.command, 5)( - None, - f"docker start {container_name}", - timeout=timeout, - exitcode=0, - steps=True, - ) + + with By("starting MinIO container"): + retry(cluster.command, 5)( + None, + f"docker start {container_name}", + timeout=timeout, + exitcode=0, + steps=True, + ) + + with And("waiting for MinIO to be ready"): + for attempt in retries(timeout=timeout, delay=1): + with attempt: + result = cluster.command( + None, + f"docker exec {container_name} curl -f http://localhost:9001/minio/health/live", + timeout=10, + steps=False, + no_checks=True, + ) + if result.exitcode != 0: + fail("MinIO health check failed") @TestScenario -def kill_and_restart_minio( - self, cluster=None, container_name="s3_env-minio1-1", signal="KILL", timeout=300 -): - """Check that restarting ClickHouse after exporting data works correctly.""" +def restart_minio(self): + """Check that restarting MinIO after exporting data works correctly.""" with Given("I create a populated source table and empty S3 table"): partitioned_merge_tree_table( @@ -402,19 +414,20 @@ def kill_and_restart_minio( ) s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) - with When("I export data and restart MinIO in parallel"): - Step(test=export_parts, parallel=True)( + with And("I kill MinIO"): + kill_minio() + + with When("I export data"): + export_parts( source_table="source", destination_table=s3_table_name, node=self.context.node, ) - Step(test=kill_minio, parallel=True)( - cluster=cluster, container_name=container_name, signal=signal - ) - join() - restart_minio(cluster=cluster, container_name=container_name, timeout=timeout) - with Then("The export should complete successfully"): + with And("I restart MinIO"): + start_minio() + + with Then("Check source matches destination"): source_matches_destination( source_table="source", destination_table=s3_table_name, @@ -438,6 +451,6 @@ def feature(self): # Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) # Scenario(test=packet_rate_limit)(rate_mbit=0.05) # Scenario(run=concurrent_insert) - - 
Scenario(run=export_and_drop) - # Scenario(run=kill_and_restart_minio) + # pause() + # Scenario(run=export_and_drop) + Scenario(run=restart_minio) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 736bef797..1ee9b4919 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -20,6 +20,6 @@ def minio(self, uri, bucket_prefix): # Feature(run=load("s3.tests.export_part.error_handling", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) # Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) - Feature(run=load("s3.tests.export_part.engines_volumes", "feature")) + # Feature(run=load("s3.tests.export_part.engines_volumes", "feature")) # Feature(run=load("s3.tests.export_part.datatypes", "feature")) - # Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) + Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) From e46c11148df9303107d886fcb91f97177c5bb179 Mon Sep 17 00:00:00 2001 From: julian Date: Tue, 4 Nov 2025 15:21:54 -0500 Subject: [PATCH 61/99] all working tests --- s3/tests/export_part/concurrency_networks.py | 28 +++++++++++--------- s3/tests/export_part/feature.py | 12 ++++----- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/s3/tests/export_part/concurrency_networks.py b/s3/tests/export_part/concurrency_networks.py index 42cfd8bef..2f99368f0 100644 --- a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -440,17 +440,19 @@ def restart_minio(self): def feature(self): """Check that exports work correctly with concurrency and various network conditions.""" - # Scenario(test=basic_concurrent_export)(threads=5) - # Scenario(test=packet_delay)(delay_ms=100) - # Scenario(test=packet_loss)(percent_loss=50) - # Scenario(test=packet_loss_gemodel)( - # interruption_probability=40, recovery_probability=70 - # ) - # Scenario(test=packet_corruption)(percent_corrupt=50) - # Scenario(test=packet_duplication)(percent_duplicated=50) - # Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) - # Scenario(test=packet_rate_limit)(rate_mbit=0.05) - # Scenario(run=concurrent_insert) - # pause() - # Scenario(run=export_and_drop) + # TODO corruption (bit flipping) + + Scenario(test=basic_concurrent_export)(threads=5) + Scenario(test=packet_delay)(delay_ms=100) + Scenario(test=packet_loss)(percent_loss=50) + Scenario(test=packet_loss_gemodel)( + interruption_probability=40, recovery_probability=70 + ) + Scenario(test=packet_corruption)(percent_corrupt=50) + Scenario(test=packet_duplication)(percent_duplicated=50) + Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) + Scenario(test=packet_rate_limit)(rate_mbit=0.05) + Scenario(run=concurrent_insert) Scenario(run=restart_minio) + + # Scenario(run=export_and_drop) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 1ee9b4919..783e661e5 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -16,10 +16,10 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - # Feature(run=load("s3.tests.export_part.sanity", "feature")) - # Feature(run=load("s3.tests.export_part.error_handling", "feature")) - # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) - # Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) - # Feature(run=load("s3.tests.export_part.engines_volumes", 
"feature")) - # Feature(run=load("s3.tests.export_part.datatypes", "feature")) + Feature(run=load("s3.tests.export_part.sanity", "feature")) + Feature(run=load("s3.tests.export_part.error_handling", "feature")) + Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) + Feature(run=load("s3.tests.export_part.engines_volumes", "feature")) + Feature(run=load("s3.tests.export_part.datatypes", "feature")) Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) + # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) From af9bc02c865e56cc2aa795b9ce87a8a1382d4bd2 Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Wed, 5 Nov 2025 11:07:47 -0500 Subject: [PATCH 62/99] Starting to work on export partition.xy --- s3/regression.py | 3 + s3/tests/export_partition/clusters_nodes.py | 77 +++ .../export_partition/concurrency_networks.py | 458 ++++++++++++++++++ s3/tests/export_partition/datatypes.py | 123 +++++ s3/tests/export_partition/engines_volumes.py | 142 ++++++ s3/tests/export_partition/error_handling.py | 170 +++++++ s3/tests/export_partition/feature.py | 20 + s3/tests/export_partition/sanity.py | 244 ++++++++++ s3/tests/export_partition/steps.py | 203 ++++++++ .../export_partition/system_monitoring.py | 101 ++++ 10 files changed, 1541 insertions(+) create mode 100644 s3/tests/export_partition/clusters_nodes.py create mode 100644 s3/tests/export_partition/concurrency_networks.py create mode 100644 s3/tests/export_partition/datatypes.py create mode 100644 s3/tests/export_partition/engines_volumes.py create mode 100644 s3/tests/export_partition/error_handling.py create mode 100644 s3/tests/export_partition/feature.py create mode 100644 s3/tests/export_partition/sanity.py create mode 100644 s3/tests/export_partition/steps.py create mode 100644 s3/tests/export_partition/system_monitoring.py diff --git a/s3/regression.py b/s3/regression.py index ff65e6041..661ca8a26 100755 --- a/s3/regression.py +++ b/s3/regression.py @@ -613,6 +613,9 @@ def minio_regression( Feature(test=load("s3.tests.export_part.feature", "minio"))( uri=uri_bucket_file, bucket_prefix=bucket_prefix ) + Feature(test=load("s3.tests.export_partition.feature", "minio"))( + uri=uri_bucket_file, bucket_prefix=bucket_prefix + ) @TestFeature diff --git a/s3/tests/export_partition/clusters_nodes.py b/s3/tests/export_partition/clusters_nodes.py new file mode 100644 index 000000000..b7e280e93 --- /dev/null +++ b/s3/tests/export_partition/clusters_nodes.py @@ -0,0 +1,77 @@ +from itertools import combinations +from testflows.core import * +from testflows.asserts import error +from s3.tests.export_part.steps import * +from helpers.queries import * +from alter.table.replace_partition.common import create_partitions_with_random_uint64 + + +@TestScenario +def different_nodes_same_destination(self, cluster, node1, node2): + """Test export part from different nodes to same S3 destination in a given cluster.""" + + with Given("I create an empty source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + populate=False, + cluster=cluster, + ) + s3_table_name = create_s3_table( + table_name="s3", create_new_bucket=True, cluster=cluster + ) + + with And("I populate the source tables on both nodes"): + create_partitions_with_random_uint64(table_name="source", node=node1) + create_partitions_with_random_uint64(table_name="source", node=node2) + + with When("I export parts to the S3 table from both nodes"): 
+ export_parts( + source_table="source", + destination_table=s3_table_name, + node=node1, + ) + export_parts( + source_table="source", + destination_table=s3_table_name, + node=node2, + ) + + with And("I read data from all tables on both nodes"): + source_data1 = select_all_ordered(table_name="source", node=node1) + source_data2 = select_all_ordered(table_name="source", node=node2) + destination_data1 = select_all_ordered(table_name=s3_table_name, node=node1) + destination_data2 = select_all_ordered(table_name=s3_table_name, node=node2) + + with Then( + "Destination data should be comprised of data from both sources, and identical on both nodes" + ): + assert set(destination_data1) == set(source_data1) | set(source_data2), error() + assert set(destination_data2) == set(source_data1) | set(source_data2), error() + + +@TestFeature +@Name("clusters and nodes") +def feature(self): + """Check functionality of exporting data parts to S3 storage from different clusters and nodes.""" + + clusters = [ + "sharded_cluster", + "replicated_cluster", + "one_shard_cluster", + "sharded_cluster12", + "one_shard_cluster12", + "sharded_cluster23", + "one_shard_cluster23", + ] + + for cluster in clusters: + with Given(f"I get nodes for cluster {cluster}"): + node_names = get_cluster_nodes(cluster=cluster) + + for node1_name, node2_name in combinations(node_names, 2): + node1 = self.context.cluster.node(node1_name) + node2 = self.context.cluster.node(node2_name) + different_nodes_same_destination(cluster=cluster, node1=node1, node2=node2) diff --git a/s3/tests/export_partition/concurrency_networks.py b/s3/tests/export_partition/concurrency_networks.py new file mode 100644 index 000000000..0f2d8936d --- /dev/null +++ b/s3/tests/export_partition/concurrency_networks.py @@ -0,0 +1,458 @@ +from testflows.core import * +from s3.tests.export_part.steps import * +from helpers.create import * +from helpers.queries import * +from s3.requirements.export_part import * +from alter.stress.tests.tc_netem import * + + +@TestScenario +def basic_concurrent_export(self, threads): + """Check concurrent exports from different sources to the same S3 table.""" + + with Given(f"I create {threads} populated source tables and an empty S3 table"): + for i in range(threads): + partitioned_merge_tree_table( + table_name=f"source{i}", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I export parts from all sources concurrently to the S3 table"): + for i in range(threads): + Step(test=export_parts, parallel=True)( + source_table=f"source{i}", + destination_table=s3_table_name, + node=self.context.node, + ) + join() + + with And("I read data from all tables"): + source_data = [] + for i in range(threads): + data = select_all_ordered(table_name=f"source{i}", node=self.context.node) + source_data.extend(data) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + + with Then("All data should be present in the S3 table"): + assert set(source_data) == set(destination_data), error() + + +@TestScenario +def packet_delay(self, delay_ms): + """Check that exports work correctly with packet delay.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I apply a 
packet delay"): + network_packet_delay(node=self.context.node, delay_ms=delay_ms) + + with And("I export parts from the source table to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + for retry in retries(timeout=30, delay=1): + with retry: + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + +@TestScenario +def packet_loss(self, percent_loss): + """Check that exports work correctly with packet loss.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I apply packet loss"): + network_packet_loss(node=self.context.node, percent_loss=percent_loss) + + with And("I export parts from the source table to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + for retry in retries(timeout=30, delay=1): + with retry: + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + +@TestScenario +def packet_loss_gemodel(self, interruption_probability, recovery_probability): + """Check that exports work correctly with packet loss using the GE model.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I apply packet loss using the GE model"): + network_packet_loss_gemodel( + node=self.context.node, + interruption_probability=interruption_probability, + recovery_probability=recovery_probability, + ) + + with And("I export parts from the source table to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + for retry in retries(timeout=30, delay=1): + with retry: + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + +@TestScenario +def packet_corruption(self, percent_corrupt): + """Check that exports work correctly with packet corruption.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I apply packet corruption"): + network_packet_corruption( + node=self.context.node, percent_corrupt=percent_corrupt + ) + + with And("I export parts from the source table to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + for retry in retries(timeout=30, delay=1): + with retry: + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + +@TestScenario +def packet_duplication(self, percent_duplicated): + """Check that exports work correctly with packet corruption.""" + + with Given("I create a populated source table and empty S3 table"): + 
partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I apply packet duplication"): + network_packet_duplication( + node=self.context.node, percent_duplicated=percent_duplicated + ) + + with And("I export parts from the source table to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + for retry in retries(timeout=30, delay=1): + with retry: + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + +@TestScenario +def packet_reordering(self, delay_ms, percent_reordered): + """Check that exports work correctly with packet corruption.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I apply packet reordering"): + network_packet_reordering( + node=self.context.node, + delay_ms=delay_ms, + percent_reordered=percent_reordered, + ) + + with And("I export parts from the source table to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + for retry in retries(timeout=30, delay=1): + with retry: + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + +@TestScenario +def packet_rate_limit(self, rate_mbit): + """Check that exports work correctly with packet corruption.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I apply packet rate limit"): + network_packet_rate_limit(node=self.context.node, rate_mbit=rate_mbit) + + with And("I export parts from the source table to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + for retry in retries(timeout=30, delay=1): + with retry: + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + +@TestScenario +def concurrent_insert(self): + """Check that exports work correctly with concurrent inserts of source data.""" + + with Given("I create an empty source and S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + populate=False, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When( + "I insert data and export it in parallel", + description=""" + 5 partitions with 1 part each are inserted. + The export is queued in parallel and usually behaves by exporting + a snapshot of the source data, often getting just the first partition + which means the export happens right after the first INSERT query completes. 
+ """, + ): + Step(test=create_partitions_with_random_uint64, parallel=True)( + table_name="source", + number_of_partitions=5, + number_of_parts=1, + ) + Step(test=export_parts, parallel=True)( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + join() + + with Then("Destination data should be a subset of source data"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + assert set(source_data) >= set(destination_data), error() + + with And("Inserts should have completed successfully"): + assert len(source_data) == 15, error() + + +@TestScenario +def export_and_drop(self): + """Check that dropping a column immediately after export works correctly.""" + pause() + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + # s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + # drop_column( + # node=self.context.node, + # table_name="source", + # column_name="i", + # ) + + # with When("I export data then drop a column"): + # export_parts( + # source_table="source", + # destination_table=s3_table_name, + # node=self.context.node, + # ) + # drop_column( + # node=self.context.node, + # table_name="source", + # column_name="i", + # ) + # This drop freezes the test ☠️☠️☠️ + + +@TestStep(When) +def kill_minio(self, cluster=None, container_name="s3_env-minio1-1", signal="KILL"): + """Forcefully kill MinIO container to simulate network crash.""" + + if cluster is None: + cluster = self.context.cluster + + retry(cluster.command, 5)( + None, + f"docker kill --signal={signal} {container_name}", + timeout=60, + exitcode=0, + steps=False, + ) + + +@TestStep(When) +def start_minio(self, cluster=None, container_name="s3_env-minio1-1", timeout=300): + """Start MinIO container and wait for it to be ready.""" + + if cluster is None: + cluster = self.context.cluster + + with By("starting MinIO container"): + retry(cluster.command, 5)( + None, + f"docker start {container_name}", + timeout=timeout, + exitcode=0, + steps=True, + ) + + with And("waiting for MinIO to be ready"): + for attempt in retries(timeout=timeout, delay=1): + with attempt: + result = cluster.command( + None, + f"docker exec {container_name} curl -f http://localhost:9001/minio/health/live", + timeout=10, + steps=False, + no_checks=True, + ) + if result.exitcode != 0: + fail("MinIO health check failed") + + +@TestScenario +def restart_minio(self): + """Check that restarting MinIO after exporting data works correctly.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with And("I kill MinIO"): + kill_minio() + + with When("I export data"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + with And("I restart MinIO"): + start_minio() + + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + +@TestFeature +@Requirements(RQ_ClickHouse_ExportPart_Concurrency("1.0")) +@Name("concurrency and networks") +def feature(self): + """Check that exports work correctly with 
concurrency and various network conditions.""" + + # TODO corruption (bit flipping) + + Scenario(test=basic_concurrent_export)(threads=5) + Scenario(test=packet_delay)(delay_ms=100) + Scenario(test=packet_loss)(percent_loss=50) + Scenario(test=packet_loss_gemodel)( + interruption_probability=40, recovery_probability=70 + ) + Scenario(test=packet_corruption)(percent_corrupt=50) + Scenario(test=packet_duplication)(percent_duplicated=50) + Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) + Scenario(test=packet_rate_limit)(rate_mbit=0.05) + Scenario(run=concurrent_insert) + Scenario(run=restart_minio) + + # Scenario(run=export_and_drop) diff --git a/s3/tests/export_partition/datatypes.py b/s3/tests/export_partition/datatypes.py new file mode 100644 index 000000000..f1ee4584b --- /dev/null +++ b/s3/tests/export_partition/datatypes.py @@ -0,0 +1,123 @@ +from testflows.core import * +from s3.tests.export_part.steps import * +from helpers.create import * +from helpers.queries import * +from helpers.common import getuid +from s3.requirements.export_part import * + + +@TestStep(When) +def insert_all_datatypes(self, table_name, rows_per_part=1, num_parts=1, node=None): + """Insert all datatypes into a MergeTree table.""" + + if node is None: + node = self.context.node + + for part in range(num_parts): + node.query( + f"INSERT INTO {table_name} (int8, int16, int32, int64, uint8, uint16, uint32, uint64, date, date32, datetime, datetime64, string, fixedstring) SELECT 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, '13', '14' FROM numbers({rows_per_part})" + ) + + +@TestStep(Given) +def create_merge_tree_all_valid_partition_key_types( + self, column_name, cluster=None, node=None, rows_per_part=1 +): + """Create a MergeTree table with all valid partition key types and both wide and compact parts.""" + + if node is None: + node = self.context.node + + with By("creating a MergeTree table with all data types"): + table_name = f"table_{getuid()}" + create_merge_tree_table( + table_name=table_name, + columns=valid_partition_key_types_columns(), + partition_by=column_name, + cluster=cluster, + stop_merges=True, + query_settings=f"min_rows_for_wide_part=10", + ) + + with And("I insert compact and wide parts into the table"): + insert_all_datatypes( + table_name=table_name, + rows_per_part=rows_per_part, + num_parts=self.context.num_parts, + node=node, + ) + + return table_name + + +@TestCheck +def valid_partition_key_table(self, partition_key_type, rows_per_part=1): + """Check exporting to a source table with specified valid partition key type and rows.""" + + with Given( + f"I create a source table with valid partition key type {partition_key_type} and empty S3 table" + ): + table_name = create_merge_tree_all_valid_partition_key_types( + column_name=partition_key_type, + rows_per_part=rows_per_part, + ) + s3_table_name = create_s3_table( + table_name="s3", + create_new_bucket=True, + columns=valid_partition_key_types_columns(), + partition_by=partition_key_type, + ) + + with When("I export parts to the S3 table"): + export_parts( + source_table=table_name, + destination_table=s3_table_name, + node=self.context.node, + ) + + with And("I read data from both tables"): + source_data = select_all_ordered( + table_name=table_name, node=self.context.node, order_by=partition_key_type + ) + destination_data = select_all_ordered( + table_name=s3_table_name, + node=self.context.node, + order_by=partition_key_type, + ) + + with Then("They should be the same"): + assert source_data == destination_data, error() + 
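For clarity, a minimal sketch (not part of the patch) of the single case that the TestSketch scenarios below expand to when either() yields the uint64 column; the concrete key type is an illustrative assumption, and self.context.num_parts plus the S3 credentials are assumed to have been set by the surrounding feature:

    # One concrete combination equivalent to what valid_partition_key_types_compact
    # below generates via either(); shown only to illustrate the pattern.
    valid_partition_key_table(partition_key_type="uint64", rows_per_part=1)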
+ +@TestSketch(Scenario) +@Flags(TE) +@Requirements(RQ_ClickHouse_ExportPart_PartitionKeyTypes("1.0")) +def valid_partition_key_types_compact(self): + """Check that all partition key data types are supported when exporting compact parts.""" + + key_types = [datatype["name"] for datatype in valid_partition_key_types_columns()] + valid_partition_key_table(partition_key_type=either(*key_types), rows_per_part=1) + + +@TestSketch(Scenario) +@Flags(TE) +def valid_partition_key_types_wide(self): + """Check that all partition key data types are supported when exporting wide parts.""" + + key_types = [datatype["name"] for datatype in valid_partition_key_types_columns()] + valid_partition_key_table(partition_key_type=either(*key_types), rows_per_part=100) + + +@TestFeature +@Name("datatypes") +@Requirements( + RQ_ClickHouse_ExportPart_PartitionKeyTypes("1.0"), + RQ_ClickHouse_ExportPart_PartTypes("1.0"), +) +def feature(self, num_parts=10): + """Check that all data types are supported when exporting parts.""" + + self.context.num_parts = num_parts + + Scenario(run=valid_partition_key_types_compact) + Scenario(run=valid_partition_key_types_wide) diff --git a/s3/tests/export_partition/engines_volumes.py b/s3/tests/export_partition/engines_volumes.py new file mode 100644 index 000000000..04e298c7b --- /dev/null +++ b/s3/tests/export_partition/engines_volumes.py @@ -0,0 +1,142 @@ +from testflows.core import * +from testflows.asserts import error +from s3.tests.export_part.steps import * +from s3.requirements.export_part import * +from helpers.queries import * + + +# TODO replicated merge tree tables (all types) + + +@TestCheck +def configured_table(self, table_engine, number_of_partitions, number_of_parts): + """Test a specific combination of table engine, number of partitions, and number of parts.""" + + with Given("I create a populated source table and empty S3 table"): + table_engine( + table_name="source", + partition_by="p", + stop_merges=True, + number_of_partitions=number_of_partitions, + number_of_parts=number_of_parts, + columns=default_columns(simple=False, partition_key_type="Int8"), + ) + s3_table_name = create_s3_table( + table_name="s3", + create_new_bucket=True, + columns=default_columns(simple=False, partition_key_type="Int8"), + ) + + with When("I export parts to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + with Then("Source and destination tables should match"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + +@TestSketch(Scenario) +@Flags(TE) +@Requirements(RQ_ClickHouse_ExportPart_SourceEngines("1.0")) +def table_combos(self): + """Test various combinations of table engines, number of partitions, and number of parts.""" + + tables = [ + partitioned_merge_tree_table, + partitioned_replacing_merge_tree_table, + partitioned_summing_merge_tree_table, + partitioned_collapsing_merge_tree_table, + partitioned_versioned_collapsing_merge_tree_table, + partitioned_aggregating_merge_tree_table, + partitioned_graphite_merge_tree_table, + ] + number_of_partitions = [5] if not self.context.stress else [1, 5, 10] + number_of_parts = [1] if not self.context.stress else [1, 5, 10] + + table_engine = either(*tables) + number_of_partitions = either(*number_of_partitions) + number_of_parts = either(*number_of_parts) + + Combination( + name=f"{table_engine.__name__} partitions={number_of_partitions} parts={number_of_parts}", + test=configured_table, + )( + 
table_engine=table_engine, + number_of_partitions=number_of_partitions, + number_of_parts=number_of_parts, + ) + + +@TestCheck +def configured_volume(self, volume): + """Test a specific combination of volume.""" + + with Given(f"I create an empty source table on volume {volume} and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + query_settings=f"storage_policy = '{volume}'", + populate=False, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with And("I populate the source table with parts exceeding 2KB each"): + create_partitions_with_random_uint64( + table_name="source", + node=self.context.node, + number_of_values=500, + ) + + with When("I export parts to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + with Then("Source and destination tables should match"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + +@TestSketch(Scenario) +@Flags(TE) +def volume_combos(self): + """Test exporting to various storage policies.""" + + volumes = [ + "jbod1", + "jbod2", + "jbod3", + "jbod4", + "external", + "external2", + "tiered_storage", + ] + volume = either(*volumes) + + Combination( + name=f"volume={volume}", + test=configured_volume, + )( + volume=volume, + ) + + +@TestFeature +@Name("engines and volumes") +def feature(self): + """Check exporting parts to S3 storage with different table engines and volumes.""" + + Scenario(run=table_combos) + Scenario(run=volume_combos) diff --git a/s3/tests/export_partition/error_handling.py b/s3/tests/export_partition/error_handling.py new file mode 100644 index 000000000..e40221bac --- /dev/null +++ b/s3/tests/export_partition/error_handling.py @@ -0,0 +1,170 @@ +from testflows.core import * +from testflows.asserts import error +from s3.tests.export_part.steps import * +from helpers.queries import * +from s3.requirements.export_part import * + + +@TestScenario +def invalid_part_name(self): + """Check that exporting a non-existent part returns the correct error.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + populate=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with And("I create an invalid part name"): + invalid_part_name = "in_va_lid_part" + + with When("I try to export the invalid part"): + results = export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + parts=[invalid_part_name], + exitcode=1, + ) + + with Then("I should see an error related to the invalid part name"): + assert results[0].exitcode == 233, error() + assert ( + f"Unexpected part name: {invalid_part_name}" in results[0].output + ), error() + + +@TestScenario +@Requirements(RQ_ClickHouse_ExportPart_Restrictions_SameTable("1.0")) +def same_table(self): + """Check exporting parts where source and destination tables are the same.""" + + with Given("I create a populated source table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + + with When("I try to export parts to itself"): + results = export_parts( + source_table="source", + destination_table="source", + node=self.context.node, + exitcode=1, + ) + + with Then("I should 
see an error related to same table exports"): + assert results[0].exitcode == 36, error() + assert ( + "Exporting to the same table is not allowed" in results[0].output + ), error() + + +@TestScenario +def local_table(self): + """Test exporting parts to a local table.""" + + with Given("I create a populated source table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + + with And("I create an empty local table"): + partitioned_merge_tree_table( + table_name="destination", + partition_by="p", + columns=default_columns(), + stop_merges=True, + populate=False, + ) + + with When("I export parts to the local table"): + results = export_parts( + source_table="source", + destination_table="destination", + node=self.context.node, + exitcode=1, + ) + + with Then("I should see an error related to local table exports"): + assert results[0].exitcode == 48, error() + assert ( + "Destination storage MergeTree does not support MergeTree parts or uses unsupported partitioning" + in results[0].output + ), error() + + +@TestScenario +@Requirements(RQ_ClickHouse_ExportPart_Settings_AllowExperimental("1.0")) +def disable_export_setting(self): + """Check that exporting parts without the export setting set returns the correct error.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I try to export the parts with the export setting disabled"): + results = export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + exitcode=1, + explicit_set=-1, + ) + + with Then("I should see an error related to the export setting"): + assert results[0].exitcode == 88, error() + assert "Exporting merge tree part is experimental" in results[0].output, error() + + +@TestScenario +def different_partition_key(self): + """Check exporting parts with a different partition key returns the correct error.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="i", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I try to export the parts"): + results = export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + exitcode=1, + ) + + with Then("I should see an error related to the different partition key"): + assert results[0].exitcode == 36, error() + assert "Tables have different partition key" in results[0].output, error() + + +@TestFeature +@Name("error handling") +@Requirements(RQ_ClickHouse_ExportPart_FailureHandling("1.0")) +def feature(self): + """Check correct error handling when exporting parts.""" + + Scenario(run=invalid_part_name) + Scenario(run=same_table) + Scenario(run=local_table) + Scenario(run=disable_export_setting) + Scenario(run=different_partition_key) diff --git a/s3/tests/export_partition/feature.py b/s3/tests/export_partition/feature.py new file mode 100644 index 000000000..fb96dc454 --- /dev/null +++ b/s3/tests/export_partition/feature.py @@ -0,0 +1,20 @@ +from testflows.core import * + + +@TestFeature +@Specifications() +@Requirements() +@Name("export partition") +def minio(self, uri, bucket_prefix): + """Export partition suite.""" 
+ + self.context.uri_base = uri + self.context.bucket_prefix = bucket_prefix + + Feature(run=load("s3.tests.export_partition.sanity", "feature")) + # Feature(run=load("s3.tests.export_part.error_handling", "feature")) + # Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) + # Feature(run=load("s3.tests.export_part.engines_volumes", "feature")) + # Feature(run=load("s3.tests.export_part.datatypes", "feature")) + # Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) + # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) diff --git a/s3/tests/export_partition/sanity.py b/s3/tests/export_partition/sanity.py new file mode 100644 index 000000000..98414e622 --- /dev/null +++ b/s3/tests/export_partition/sanity.py @@ -0,0 +1,244 @@ +from testflows.core import * +from testflows.asserts import error +from s3.tests.export_part.steps import * +from helpers.create import * +from helpers.common import getuid +from helpers.queries import * +from s3.requirements.export_part import * +from alter.table.replace_partition.partition_types import ( + table_with_compact_and_wide_parts, +) + + +@TestScenario +@Requirements(RQ_ClickHouse_ExportPart_Settings_AllowExperimental("1.0")) +def export_setting(self): + """Check that the export setting is settable in 2 ways when exporting parts.""" + + with Given("I create a populated source table and 2 empty S3 tables"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name1 = create_s3_table(table_name="s3_1", create_new_bucket=True) + s3_table_name2 = create_s3_table(table_name="s3_2") + + with When("I export parts to the first S3 table using the SET query"): + export_parts( + source_table="source", + destination_table=s3_table_name1, + node=self.context.node, + explicit_set=1, + ) + + with And("I export parts to the second S3 table using the settings argument"): + export_parts( + source_table="source", + destination_table=s3_table_name2, + node=self.context.node, + explicit_set=0, + ) + + with And("I read data from all tables"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data1 = select_all_ordered( + table_name=s3_table_name1, node=self.context.node + ) + destination_data2 = select_all_ordered( + table_name=s3_table_name2, node=self.context.node + ) + + with Then("All tables should have the same data"): + assert source_data == destination_data1, error() + assert source_data == destination_data2, error() + + +@TestScenario +@Requirements(RQ_ClickHouse_ExportPart_SchemaCompatibility("1.0")) +def mismatched_columns(self): + """Test exporting parts when source and destination tables have mismatched columns.""" + + with Given("I create a source table and S3 table with different columns"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table( + table_name="s3", + create_new_bucket=True, + columns=default_columns(simple=False), + ) + + with When("I export parts to the S3 table"): + results = export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + exitcode=1, + ) + + with Then("I should see an error related to mismatched columns"): + assert results[0].exitcode == 122, error() + assert "Tables have different structure" in results[0].output, error() + + +@TestScenario +@Requirements() +def basic_table(self): + """Test exporting 
partitions of a basic table.""" + + with Given("I create a populated source table and empty S3 table"): + source_table = partitioned_replicated_merge_tree_table( + table_name=f"source_{getuid()}", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I export partitions to the S3 table"): + export_partitions( + source_table=source_table, + destination_table=s3_table, + node=self.context.node, + ) + + with Then("Check source matches destination"): + source_matches_destination( + source_table=source_table, + destination_table=s3_table, + ) + + +@TestScenario +def empty_table(self): + """Test exporting parts from an empty table.""" + + with Given("I create empty source and S3 tables"): + partitioned_merge_tree_table( + table_name="empty_source", + partition_by="p", + columns=default_columns(), + stop_merges=False, + populate=False, + ) + s3_table_name = create_s3_table(table_name="empty_s3", create_new_bucket=True) + + with When("I export parts to the S3 table"): + export_parts( + source_table="empty_source", + destination_table=s3_table_name, + node=self.context.node, + ) + + with And("I read data from both tables"): + source_data = select_all_ordered( + table_name="empty_source", node=self.context.node + ) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + + with Then("They should be empty"): + assert source_data == [], error() + assert destination_data == [], error() + + +@TestScenario +@Requirements(RQ_ClickHouse_ExportPart_PartitionKeyTypes("1.0")) +def no_partition_by(self): + """Test exporting parts when the source table has no PARTITION BY type.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="tuple()", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table( + table_name="s3", create_new_bucket=True, partition_by="tuple()" + ) + + with When("I export parts to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + +@TestScenario +@Requirements(RQ_ClickHouse_ExportPart_PartTypes("1.0")) +def wide_and_compact_parts(self): + """Check that exporting with both wide and compact parts is supported.""" + + with Given("I create a source table with wide and compact parts"): + table_with_compact_and_wide_parts(table_name="source") + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I export parts to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + +@TestScenario +def large_export(self): + """Test exporting a large part.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + number_of_parts=100, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I export parts to the S3 table"): + export_parts( + source_table="source", + destination_table=s3_table_name, + 
node=self.context.node, + ) + + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + +@TestFeature +@Name("sanity") +def feature(self): + """Check basic functionality of exporting data parts to S3 storage.""" + + # Scenario(run=empty_table) + Scenario(run=basic_table) + # Scenario(run=no_partition_by) + # Scenario(run=mismatched_columns) + # Scenario(run=wide_and_compact_parts) + # if self.context.stress: + # Scenario(run=large_export) + # Scenario(run=export_setting) # This test fails because of an actual bug in the export setting diff --git a/s3/tests/export_partition/steps.py b/s3/tests/export_partition/steps.py new file mode 100644 index 000000000..2e3abb8b2 --- /dev/null +++ b/s3/tests/export_partition/steps.py @@ -0,0 +1,203 @@ +from testflows.core import * +from testflows.asserts import error +from helpers.common import getuid +from helpers.create import * +from helpers.queries import * +from s3.tests.common import temporary_bucket_path +import json + + +def default_columns(simple=True, partition_key_type="UInt8"): + columns = [ + {"name": "p", "type": partition_key_type}, + {"name": "i", "type": "UInt64"}, + {"name": "Path", "type": "String"}, + {"name": "Time", "type": "DateTime"}, + {"name": "Value", "type": "Float64"}, + {"name": "Timestamp", "type": "Int64"}, + ] + + if simple: + return columns[:2] + else: + return columns + + +def valid_partition_key_types_columns(): + return [ + {"name": "int8", "type": "Int8"}, + {"name": "int16", "type": "Int16"}, + {"name": "int32", "type": "Int32"}, + {"name": "int64", "type": "Int64"}, + {"name": "uint8", "type": "UInt8"}, + {"name": "uint16", "type": "UInt16"}, + {"name": "uint32", "type": "UInt32"}, + {"name": "uint64", "type": "UInt64"}, + {"name": "date", "type": "Date"}, + {"name": "date32", "type": "Date32"}, + {"name": "datetime", "type": "DateTime"}, + {"name": "datetime64", "type": "DateTime64"}, + {"name": "string", "type": "String"}, + {"name": "fixedstring", "type": "FixedString(10)"}, + ] + + +@TestStep(Given) +def create_temp_bucket(self): + """Create temporary S3 bucket.""" + + temp_s3_path = temporary_bucket_path( + bucket_prefix=f"{self.context.bucket_prefix}/export_part" + ) + + self.context.uri = f"{self.context.uri_base}export_part/{temp_s3_path}/" + + +@TestStep(Given) +def create_s3_table( + self, + table_name, + cluster=None, + create_new_bucket=False, + columns=None, + partition_by="p", +): + """Create a destination S3 table.""" + + if create_new_bucket: + create_temp_bucket() + + if columns is None: + columns = default_columns(simple=True) + + table_name = f"{table_name}_{getuid()}" + engine = f""" + S3( + '{self.context.uri}', + '{self.context.access_key_id}', + '{self.context.secret_access_key}', + filename='{table_name}', + format='Parquet', + compression='auto', + partition_strategy='hive' + ) + """ + + create_table( + table_name=table_name, + columns=columns, + partition_by=partition_by, + engine=engine, + cluster=cluster, + ) + + return table_name + + +@TestStep(When) +def get_parts(self, table_name, node): + """Get all parts for a table on a given node.""" + + output = node.query( + f"SELECT name FROM system.parts WHERE table = '{table_name}'", + exitcode=0, + steps=True, + ).output + return [row.strip() for row in output.splitlines()] + + +@TestStep(When) +def export_parts( + self, + source_table, + destination_table, + node, + parts=None, + exitcode=0, + explicit_set=1, +): + """Export parts from a source table to 
a destination table on the same node. If parts are not provided, all parts will be exported.""" + + if parts is None: + parts = get_parts(table_name=source_table, node=node) + + no_checks = exitcode != 0 + + output = [] + + for part in parts: + if explicit_set == 1: + output.append( + node.query( + f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", + exitcode=exitcode, + no_checks=no_checks, + steps=True, + ) + ) + elif explicit_set == 0: + output.append( + node.query( + f"ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", + settings=[("allow_experimental_export_merge_tree_part", 1)], + exitcode=exitcode, + no_checks=no_checks, + steps=True, + ) + ) + elif explicit_set == -1: + output.append( + node.query( + f"SET allow_experimental_export_merge_tree_part = 0; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", + exitcode=exitcode, + no_checks=no_checks, + steps=True, + ) + ) + + return output + + +@TestStep(When) +def get_export_events(self, node): + """Get the export data from the system.events table of a given node.""" + + output = node.query( + "SELECT name, value FROM system.events WHERE name LIKE '%%Export%%' FORMAT JSONEachRow", + exitcode=0, + steps=True, + ).output + + events = {} + for line in output.strip().splitlines(): + event = json.loads(line) + events[event["name"]] = int(event["value"]) + + return events + + +@TestStep(When) +def drop_column(self, node, table_name, column_name): + """Drop a column from a table.""" + + node.query( + f"ALTER TABLE {table_name} DROP COLUMN {column_name}", exitcode=0, steps=True + ) + + +@TestStep(Then) +def source_matches_destination( + self, source_table, destination_table, source_node=None, destination_node=None +): + """Check that source and destination table data matches.""" + + if source_node is None: + source_node = self.context.node + if destination_node is None: + destination_node = self.context.node + + source_data = select_all_ordered(table_name=source_table, node=source_node) + destination_data = select_all_ordered( + table_name=destination_table, node=destination_node + ) + assert source_data == destination_data, error() diff --git a/s3/tests/export_partition/system_monitoring.py b/s3/tests/export_partition/system_monitoring.py new file mode 100644 index 000000000..51fdce972 --- /dev/null +++ b/s3/tests/export_partition/system_monitoring.py @@ -0,0 +1,101 @@ +from testflows.core import * +from testflows.asserts import error +from s3.tests.export_part.steps import * +from s3.requirements.export_part import * + + +# TODO +# part_log is where to look +# overwrite file +# max bandwidth +# some of system.events stuff wont appear unless i set this maybe? just a guess +# system.events +# Export row in system.metrics?? 
+# partsexports incrementing correctly +# duplicates incrementing correctly + + +@TestScenario +def part_exports(self): + """Check part exports are properly tracked in system.part_log.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with And("I read the initial logged number of part exports"): + initial_exports = get_export_events( + node=self.context.node + ) # .get("PartsExports", 0) + note(f"Initial exports: {initial_exports}") + + # with When("I export parts to the S3 table"): + # export_parts( + # source_table="source", + # destination_table=s3_table_name, + # node=self.context.node, + # ) + + # with And("I read the final logged number of part exports"): + # final_exports = get_export_events(node=self.context.node).get("PartsExports", 0) + + # with Then("I check that the number of part exports is correct"): + + # with By("Reading the number of parts for the source table"): + # num_parts = len(get_parts(table_name="source", node=self.context.node)) + + # with And("Checking that the before and after difference is correct"): + # assert final_exports - initial_exports == num_parts, error() + + +@TestScenario +def duplicate_exports(self): + """Check duplicate exports are ignored and not exported again.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=True, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I try to export the parts twice"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + # with And("I read the initial export events"): + + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + + with And("Check logs for duplicate exports"): + export_events = get_export_events(node=self.context.node) + note(export_events["PartsExports"]) + + +@TestFeature +@Name("system monitoring") +@Requirements(RQ_ClickHouse_ExportPart_Logging("1.0")) +def feature(self): + """Check system monitoring of export events.""" + + # Scenario(run=part_exports) + Scenario(run=duplicate_exports) From c5a3e01b8df40493629ee01983824a6ece7e75e7 Mon Sep 17 00:00:00 2001 From: julian Date: Wed, 5 Nov 2025 13:43:32 -0500 Subject: [PATCH 63/99] changes --- s3/tests/export_part/concurrency_networks.py | 74 ++++++++++---------- s3/tests/export_part/feature.py | 10 +-- 2 files changed, 41 insertions(+), 43 deletions(-) diff --git a/s3/tests/export_part/concurrency_networks.py b/s3/tests/export_part/concurrency_networks.py index 2f99368f0..53a389135 100644 --- a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -335,24 +335,19 @@ def export_and_drop(self): columns=default_columns(), stop_merges=True, ) - # s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) - # drop_column( - # node=self.context.node, - # table_name="source", - # column_name="i", - # ) - - # with When("I export data then drop a column"): - # export_parts( - # source_table="source", - # destination_table=s3_table_name, - # 
node=self.context.node, - # ) - # drop_column( - # node=self.context.node, - # table_name="source", - # column_name="i", - # ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I export data then drop a column"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + drop_column( + node=self.context.node, + table_name="source", + column_name="i", + ) # This drop freezes the test ☠️☠️☠️ @@ -426,12 +421,14 @@ def restart_minio(self): with And("I restart MinIO"): start_minio() - - with Then("Check source matches destination"): - source_matches_destination( - source_table="source", - destination_table=s3_table_name, - ) + pause() + for retry in retries(timeout=30, delay=1): + with retry: + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) @TestFeature @@ -442,17 +439,18 @@ def feature(self): # TODO corruption (bit flipping) - Scenario(test=basic_concurrent_export)(threads=5) - Scenario(test=packet_delay)(delay_ms=100) - Scenario(test=packet_loss)(percent_loss=50) - Scenario(test=packet_loss_gemodel)( - interruption_probability=40, recovery_probability=70 - ) - Scenario(test=packet_corruption)(percent_corrupt=50) - Scenario(test=packet_duplication)(percent_duplicated=50) - Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) - Scenario(test=packet_rate_limit)(rate_mbit=0.05) - Scenario(run=concurrent_insert) - Scenario(run=restart_minio) - - # Scenario(run=export_and_drop) + # Scenario(test=basic_concurrent_export)(threads=5) + # Scenario(test=packet_delay)(delay_ms=100) + # Scenario(test=packet_loss)(percent_loss=50) + # Scenario(test=packet_loss_gemodel)( + # interruption_probability=40, recovery_probability=70 + # ) + # Scenario(test=packet_corruption)(percent_corrupt=50) + # Scenario(test=packet_duplication)(percent_duplicated=50) + # Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) + # Scenario(test=packet_rate_limit)(rate_mbit=0.05) + # Scenario(run=concurrent_insert) + + # Scenario(run=restart_minio) + + Scenario(run=export_and_drop) diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 783e661e5..eaa39c05d 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -16,10 +16,10 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix - Feature(run=load("s3.tests.export_part.sanity", "feature")) - Feature(run=load("s3.tests.export_part.error_handling", "feature")) - Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) - Feature(run=load("s3.tests.export_part.engines_volumes", "feature")) - Feature(run=load("s3.tests.export_part.datatypes", "feature")) + # Feature(run=load("s3.tests.export_part.sanity", "feature")) + # Feature(run=load("s3.tests.export_part.error_handling", "feature")) + # Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) + # Feature(run=load("s3.tests.export_part.engines_volumes", "feature")) + # Feature(run=load("s3.tests.export_part.datatypes", "feature")) Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) From 1bb65dd08a482c6da08df1dcd1b14e97b4aa9b6a Mon Sep 17 00:00:00 2001 From: vzakaznikov <41681088+vzakaznikov@users.noreply.github.com> Date: Wed, 5 Nov 2025 14:03:06 -0500 Subject: [PATCH 64/99] Update steps.py 
--- s3/tests/export_part/steps.py | 49 +++++++++++++---------------------- 1 file changed, 18 insertions(+), 31 deletions(-) diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 2e3abb8b2..6047fe1fb 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -1,11 +1,13 @@ +import json + from testflows.core import * from testflows.asserts import error from helpers.common import getuid from helpers.create import * from helpers.queries import * from s3.tests.common import temporary_bucket_path -import json +default_settings = [("allow_experimental_export_merge_tree_part", 1)] def default_columns(simple=True, partition_key_type="UInt8"): columns = [ @@ -114,46 +116,31 @@ def export_parts( node, parts=None, exitcode=0, - explicit_set=1, + settings=None, + inline_settings=True ): """Export parts from a source table to a destination table on the same node. If parts are not provided, all parts will be exported.""" if parts is None: parts = get_parts(table_name=source_table, node=node) + if inline_settings is True: + inline_settings = default_settings + no_checks = exitcode != 0 - output = [] - + for part in parts: - if explicit_set == 1: - output.append( - node.query( - f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", - exitcode=exitcode, - no_checks=no_checks, - steps=True, - ) - ) - elif explicit_set == 0: - output.append( - node.query( - f"ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", - settings=[("allow_experimental_export_merge_tree_part", 1)], - exitcode=exitcode, - no_checks=no_checks, - steps=True, - ) - ) - elif explicit_set == -1: - output.append( - node.query( - f"SET allow_experimental_export_merge_tree_part = 0; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", - exitcode=exitcode, - no_checks=no_checks, - steps=True, - ) + output.append( + node.query( + f"ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", + exitcode=exitcode, + no_checks=no_checks, + steps=True, + settings=settings, + inline_settings=inline_settings, ) + ) return output From e3138149f61e2e94e1310fd98158081e0bf0b784 Mon Sep 17 00:00:00 2001 From: vzakaznikov <41681088+vzakaznikov@users.noreply.github.com> Date: Wed, 5 Nov 2025 14:11:27 -0500 Subject: [PATCH 65/99] Update cluster.py --- helpers/cluster.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/helpers/cluster.py b/helpers/cluster.py index 3c70861c8..e3b976846 100755 --- a/helpers/cluster.py +++ b/helpers/cluster.py @@ -1012,6 +1012,7 @@ def query( ignore_exception=False, step=By, settings=None, + inline_settings=None, retry_count=5, messages_to_retry=None, retry_delay=5, @@ -1035,6 +1036,7 @@ def query( :param no_check: disable exitcode and message checks, default: False :param step: wrapping step class, default: By :param settings: list of settings to be used for the query in the form [(name, value),...], default: None + :param inline_settings: list of inline settings to be used for the query in the form [(name, value),...], default: None :param retry_count: number of retries, default: 5 :param messages_to_retry: list of messages in the query output for which retry should be triggered, default: MESSAGES_TO_RETRY @@ -1057,6 +1059,7 @@ def query( retry_count = max(0, int(retry_count)) retry_delay = max(0, float(retry_delay)) settings = list(settings or []) + inline_settings = list(inline_settings or []) query_settings = 
list(settings) if raise_on_exception: @@ -1077,6 +1080,9 @@ def query( if query_id is not None: query_settings += [("query_id", f"{query_id}")] + if inline_settings: + sql = "; ".join([f"SET {name} = {value}" for name, value in inline_settings]) + sql + client = "clickhouse client -n" if secure: client += ( From 0a69b76f102c3280e26efd6e29f48990960c859c Mon Sep 17 00:00:00 2001 From: julian Date: Wed, 5 Nov 2025 19:14:54 -0500 Subject: [PATCH 66/99] clickhouse node and minio network interruptions, safe TERM and unsafe KILL --- s3/tests/export_part/concurrency_networks.py | 126 ++++++++++++------- s3/tests/export_part/feature.py | 2 +- s3/tests/export_part/sanity.py | 40 ++++++ s3/tests/export_part/steps.py | 7 ++ 4 files changed, 129 insertions(+), 46 deletions(-) diff --git a/s3/tests/export_part/concurrency_networks.py b/s3/tests/export_part/concurrency_networks.py index 53a389135..336ed3941 100644 --- a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -324,33 +324,6 @@ def concurrent_insert(self): assert len(source_data) == 15, error() -@TestScenario -def export_and_drop(self): - """Check that dropping a column immediately after export works correctly.""" - pause() - with Given("I create a populated source table and empty S3 table"): - partitioned_merge_tree_table( - table_name="source", - partition_by="p", - columns=default_columns(), - stop_merges=True, - ) - s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) - - with When("I export data then drop a column"): - export_parts( - source_table="source", - destination_table=s3_table_name, - node=self.context.node, - ) - drop_column( - node=self.context.node, - table_name="source", - column_name="i", - ) - # This drop freezes the test ☠️☠️☠️ - - @TestStep(When) def kill_minio(self, cluster=None, container_name="s3_env-minio1-1", signal="KILL"): """Forcefully kill MinIO container to simulate network crash.""" @@ -366,6 +339,21 @@ def kill_minio(self, cluster=None, container_name="s3_env-minio1-1", signal="KIL steps=False, ) + if signal == "TERM": + with And("waiting for MinIO container to stop"): + for attempt in retries(timeout=30, delay=1): + with attempt: + result = cluster.command( + None, + f"docker ps --filter name={container_name} --format '{{{{.Names}}}}'", + timeout=10, + steps=False, + no_checks=True, + ) + if container_name not in result.output: + break + fail("MinIO container still running") + @TestStep(When) def start_minio(self, cluster=None, container_name="s3_env-minio1-1", timeout=300): @@ -398,37 +386,85 @@ def start_minio(self, cluster=None, container_name="s3_env-minio1-1", timeout=30 @TestScenario -def restart_minio(self): - """Check that restarting MinIO after exporting data works correctly.""" +def minio_network_interruption(self, number_of_values=3, signal="KILL"): + """Check that restarting MinIO while exporting parts inbetween works correctly.""" with Given("I create a populated source table and empty S3 table"): partitioned_merge_tree_table( table_name="source", partition_by="p", columns=default_columns(), + number_of_values=number_of_values, ) s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) - with And("I kill MinIO"): - kill_minio() + with And("I stop MinIO"): + kill_minio(signal=signal) + + with When("I read export events"): + initial_events = get_export_events(node=self.context.node) - with When("I export data"): + with And("I export data"): export_parts( source_table="source", destination_table=s3_table_name, 
node=self.context.node, ) - with And("I restart MinIO"): + with And("I start MinIO"): start_minio() - pause() - for retry in retries(timeout=30, delay=1): - with retry: - with Then("Check source matches destination"): - source_matches_destination( - source_table="source", - destination_table=s3_table_name, - ) + + with Then("Destination data should be a subset of source data"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + assert set(source_data) >= set(destination_data), error() + + with And("Failed exports should be logged in the system.events table"): + final_events = get_export_events(node=self.context.node) + assert final_events["PartsExportFailures"] - initial_events["PartsExportFailures"] == (len(source_data) - len(destination_data)) / number_of_values, error() + + +@TestScenario +def clickhouse_network_interruption(self, safe=False): + """Check that exports work correctly with a clickhouse network outage.""" + + with Given("I create a populated source table and empty S3 table"): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with And("I get parts before the interruption"): + parts = get_parts(table_name="source", node=self.context.node) + + with When("I queue exports and restart the node in parallel"): + Step(test=export_parts, parallel=True)( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + exitcode=1, + parts=parts, + ) + self.context.node.restart(safe=safe) + join() + + if safe: + with Then("Check source matches destination"): + source_matches_destination( + source_table="source", + destination_table=s3_table_name, + ) + else: + with Then("Destination data should be a subset of source data"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + assert set(source_data) >= set(destination_data), error() @TestFeature @@ -450,7 +486,7 @@ def feature(self): # Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) # Scenario(test=packet_rate_limit)(rate_mbit=0.05) # Scenario(run=concurrent_insert) - - # Scenario(run=restart_minio) - - Scenario(run=export_and_drop) + Scenario(test=minio_network_interruption)(signal="TERM") + Scenario(test=minio_network_interruption)(signal="KILL") + Scenario(test=clickhouse_network_interruption)(safe=True) + Scenario(test=clickhouse_network_interruption)(safe=False) \ No newline at end of file diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index eaa39c05d..60bd0657e 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -9,7 +9,7 @@ @Requirements( RQ_ClickHouse_ExportPart_S3("1.0"), ) -@Name("export parts") +@Name("export part") def minio(self, uri, bucket_prefix): """Run features from the export parts suite.""" diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 15777e6d7..c56846728 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -203,6 +203,45 @@ def wide_and_compact_parts(self): ) +@TestScenario +def export_and_drop(self): + """Check that dropping a column immediately after export doesn't affect exported data.""" + + with Given("I create a populated source table and empty S3 
table", description=""" + Stop merges must be false to allow for mutations like dropping a column. + """): + partitioned_merge_tree_table( + table_name="source", + partition_by="p", + columns=default_columns(), + stop_merges=False, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I export data"): + export_parts( + source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) + + with And("I read the source before dropping a column"): + source_data = select_all_ordered(table_name="source", node=self.context.node) + + with And("I drop a source column"): + drop_column( + node=self.context.node, + table_name="source", + column_name="i", + ) + + with Then("Check source before drop matches destination"): + destination_data = select_all_ordered( + table_name=s3_table_name, node=self.context.node + ) + assert source_data == destination_data, error() + + @TestScenario def large_export(self): """Test exporting a large part.""" @@ -241,6 +280,7 @@ def feature(self): Scenario(run=no_partition_by) Scenario(run=mismatched_columns) Scenario(run=wide_and_compact_parts) + Scenario(run=export_and_drop) if self.context.stress: Scenario(run=large_export) # Scenario(run=export_setting) # This test fails because of an actual bug in the export setting diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 6047fe1fb..c2f0fc196 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -160,6 +160,13 @@ def get_export_events(self, node): event = json.loads(line) events[event["name"]] = int(event["value"]) + if "PartsExportFailures" not in events: + events["PartsExportFailures"] = 0 + if "PartsExports" not in events: + events["PartsExports"] = 0 + if "PartsExportDuplicated" not in events: + events["PartsExportDuplicated"] = 0 + return events From 1fc939c387d01f1c8ce773f73d92f0136cb24c25 Mon Sep 17 00:00:00 2001 From: julian Date: Thu, 6 Nov 2025 11:12:43 -0500 Subject: [PATCH 67/99] Working suite --- helpers/cluster.py | 2 +- helpers/create.py | 2 ++ s3/tests/export_part/concurrency_networks.py | 22 ++++++++++---------- s3/tests/export_part/error_handling.py | 2 +- s3/tests/export_part/feature.py | 11 +++++----- s3/tests/export_part/sanity.py | 13 +++++++----- s3/tests/export_part/steps.py | 3 +-- 7 files changed, 30 insertions(+), 25 deletions(-) diff --git a/helpers/cluster.py b/helpers/cluster.py index e3b976846..71f1e4f12 100755 --- a/helpers/cluster.py +++ b/helpers/cluster.py @@ -1081,7 +1081,7 @@ def query( query_settings += [("query_id", f"{query_id}")] if inline_settings: - sql = "; ".join([f"SET {name} = {value}" for name, value in inline_settings]) + sql + sql = "; ".join([f"SET {name} = {value}" for name, value in inline_settings]) + "; " + sql client = "clickhouse client -n" if secure: diff --git a/helpers/create.py b/helpers/create.py index d07d36937..b085821f6 100644 --- a/helpers/create.py +++ b/helpers/create.py @@ -408,6 +408,7 @@ def partitioned_merge_tree_table( populate=True, number_of_partitions=5, number_of_parts=1, + number_of_values=3, query_settings=None, ): """Create a MergeTree table partitioned by a specific column.""" @@ -427,6 +428,7 @@ def partitioned_merge_tree_table( table_name=table_name, number_of_partitions=number_of_partitions, number_of_parts=number_of_parts, + number_of_values=number_of_values, ) diff --git a/s3/tests/export_part/concurrency_networks.py b/s3/tests/export_part/concurrency_networks.py index 336ed3941..d6b4cd9ed 100644 --- 
a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -475,17 +475,17 @@ def feature(self): # TODO corruption (bit flipping) - # Scenario(test=basic_concurrent_export)(threads=5) - # Scenario(test=packet_delay)(delay_ms=100) - # Scenario(test=packet_loss)(percent_loss=50) - # Scenario(test=packet_loss_gemodel)( - # interruption_probability=40, recovery_probability=70 - # ) - # Scenario(test=packet_corruption)(percent_corrupt=50) - # Scenario(test=packet_duplication)(percent_duplicated=50) - # Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) - # Scenario(test=packet_rate_limit)(rate_mbit=0.05) - # Scenario(run=concurrent_insert) + Scenario(test=basic_concurrent_export)(threads=5) + Scenario(test=packet_delay)(delay_ms=100) + Scenario(test=packet_loss)(percent_loss=50) + Scenario(test=packet_loss_gemodel)( + interruption_probability=40, recovery_probability=70 + ) + Scenario(test=packet_corruption)(percent_corrupt=50) + Scenario(test=packet_duplication)(percent_duplicated=50) + Scenario(test=packet_reordering)(delay_ms=100, percent_reordered=90) + Scenario(test=packet_rate_limit)(rate_mbit=0.05) + Scenario(run=concurrent_insert) Scenario(test=minio_network_interruption)(signal="TERM") Scenario(test=minio_network_interruption)(signal="KILL") Scenario(test=clickhouse_network_interruption)(safe=True) diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index e40221bac..b4da3cb31 100644 --- a/s3/tests/export_part/error_handling.py +++ b/s3/tests/export_part/error_handling.py @@ -123,7 +123,7 @@ def disable_export_setting(self): destination_table=s3_table_name, node=self.context.node, exitcode=1, - explicit_set=-1, + inline_settings=[("allow_experimental_export_merge_tree_part", 0)], ) with Then("I should see an error related to the export setting"): diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 60bd0657e..800858ded 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -15,11 +15,12 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix + self.context.default_settings = [("allow_experimental_export_merge_tree_part", 1)] - # Feature(run=load("s3.tests.export_part.sanity", "feature")) - # Feature(run=load("s3.tests.export_part.error_handling", "feature")) - # Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) - # Feature(run=load("s3.tests.export_part.engines_volumes", "feature")) - # Feature(run=load("s3.tests.export_part.datatypes", "feature")) + Feature(run=load("s3.tests.export_part.sanity", "feature")) + Feature(run=load("s3.tests.export_part.error_handling", "feature")) + Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) + Feature(run=load("s3.tests.export_part.engines_volumes", "feature")) + Feature(run=load("s3.tests.export_part.datatypes", "feature")) Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index c56846728..2131102d3 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -29,7 +29,7 @@ def export_setting(self): source_table="source", destination_table=s3_table_name1, node=self.context.node, - explicit_set=1, + inline_settings=True, ) with And("I export parts to the second S3 table using the settings argument"): @@ 
-37,7 +37,8 @@ def export_setting(self): source_table="source", destination_table=s3_table_name2, node=self.context.node, - explicit_set=0, + inline_settings=False, + settings=self.context.default_settings, ) with And("I read data from all tables"): @@ -243,7 +244,7 @@ def export_and_drop(self): @TestScenario -def large_export(self): +def large_part(self): """Test exporting a large part.""" with Given("I create a populated source table and empty S3 table"): @@ -252,7 +253,9 @@ def large_export(self): partition_by="p", columns=default_columns(), stop_merges=True, - number_of_parts=100, + number_of_values=100000000, + number_of_parts=1, + number_of_partitions=1, ) s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) @@ -282,5 +285,5 @@ def feature(self): Scenario(run=wide_and_compact_parts) Scenario(run=export_and_drop) if self.context.stress: - Scenario(run=large_export) + Scenario(run=large_part) # Scenario(run=export_setting) # This test fails because of an actual bug in the export setting diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index c2f0fc196..e01788ed7 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -7,7 +7,6 @@ from helpers.queries import * from s3.tests.common import temporary_bucket_path -default_settings = [("allow_experimental_export_merge_tree_part", 1)] def default_columns(simple=True, partition_key_type="UInt8"): columns = [ @@ -125,7 +124,7 @@ def export_parts( parts = get_parts(table_name=source_table, node=node) if inline_settings is True: - inline_settings = default_settings + inline_settings = self.context.default_settings no_checks = exitcode != 0 output = [] From ee0f4ecaf11367d820a43f672eb4617aa4131efe Mon Sep 17 00:00:00 2001 From: Selfeer Date: Thu, 6 Nov 2025 20:51:22 +0400 Subject: [PATCH 68/99] change step from And to By --- engines/tests/replacing_merge_tree/replacing_merge_tree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engines/tests/replacing_merge_tree/replacing_merge_tree.py b/engines/tests/replacing_merge_tree/replacing_merge_tree.py index 495dcc10e..cc4f3929e 100644 --- a/engines/tests/replacing_merge_tree/replacing_merge_tree.py +++ b/engines/tests/replacing_merge_tree/replacing_merge_tree.py @@ -533,7 +533,7 @@ def incorrect_data_insert_with_disabled_optimize_on_insert(self, node=None): ): node.query(f"SELECT * FROM {name} FORMAT JSONEachRow;") - with And("I optimize table"): + with By("optimizing a table"): node.query( f"OPTIMIZE TABLE {name} FINAL;", message="DB::Exception:", From 5497bd727677d4e83ea94ce052bff095cf1ab9af Mon Sep 17 00:00:00 2001 From: julian Date: Thu, 6 Nov 2025 12:40:27 -0500 Subject: [PATCH 69/99] Requirements update --- s3/requirements/export_part.md | 288 ++++--- s3/requirements/export_part.py | 766 +++++++++++++------ s3/tests/export_part/clusters_nodes.py | 2 + s3/tests/export_part/concurrency_networks.py | 12 +- s3/tests/export_part/datatypes.py | 1 - s3/tests/export_part/engines_volumes.py | 6 +- s3/tests/export_part/error_handling.py | 3 + s3/tests/export_part/sanity.py | 3 + 8 files changed, 719 insertions(+), 362 deletions(-) diff --git a/s3/requirements/export_part.md b/s3/requirements/export_part.md index 218003e40..eac61e0d2 100644 --- a/s3/requirements/export_part.md +++ b/s3/requirements/export_part.md @@ -6,59 +6,69 @@ * 1 [Introduction](#introduction) * 2 [Exporting Parts to S3](#exporting-parts-to-s3) * 2.1 [RQ.ClickHouse.ExportPart.S3](#rqclickhouseexportparts3) + * 2.2 
[RQ.ClickHouse.ExportPart.EmptyTable](#rqclickhouseexportpartemptytable) * 3 [SQL command support](#sql-command-support) * 3.1 [RQ.ClickHouse.ExportPart.SQLCommand](#rqclickhouseexportpartsqlcommand) * 4 [Supported source table engines](#supported-source-table-engines) * 4.1 [RQ.ClickHouse.ExportPart.SourceEngines](#rqclickhouseexportpartsourceengines) -* 5 [Supported source part storage types](#supported-source-part-storage-types) - * 5.1 [RQ.ClickHouse.ExportPart.SourcePartStorage](#rqclickhouseexportpartsourcepartstorage) -* 6 [Supported destination table engines](#supported-destination-table-engines) - * 6.1 [RQ.ClickHouse.ExportPart.DestinationEngines](#rqclickhouseexportpartdestinationengines) -* 7 [Destination setup and file management](#destination-setup-and-file-management) - * 7.1 [RQ.ClickHouse.ExportPart.DestinationSetup](#rqclickhouseexportpartdestinationsetup) -* 8 [Export data preparation](#export-data-preparation) - * 8.1 [RQ.ClickHouse.ExportPart.DataPreparation](#rqclickhouseexportpartdatapreparation) -* 9 [Schema compatibility](#schema-compatibility) - * 9.1 [RQ.ClickHouse.ExportPart.SchemaCompatibility](#rqclickhouseexportpartschemacompatibility) -* 10 [Partition key types support](#partition-key-types-support) - * 10.1 [RQ.ClickHouse.ExportPart.PartitionKeyTypes](#rqclickhouseexportpartpartitionkeytypes) -* 11 [Part types and content support](#part-types-and-content-support) - * 11.1 [RQ.ClickHouse.ExportPart.PartTypes](#rqclickhouseexportpartparttypes) -* 12 [Export operation failure handling](#export-operation-failure-handling) - * 12.1 [RQ.ClickHouse.ExportPart.FailureHandling](#rqclickhouseexportpartfailurehandling) -* 13 [Export operation restrictions](#export-operation-restrictions) - * 13.1 [Preventing same table exports](#preventing-same-table-exports) - * 13.1.1 [RQ.ClickHouse.ExportPart.Restrictions.SameTable](#rqclickhouseexportpartrestrictionssametable) - * 13.2 [Destination table compatibility](#destination-table-compatibility) - * 13.2.1 [RQ.ClickHouse.ExportPart.Restrictions.DestinationSupport](#rqclickhouseexportpartrestrictionsdestinationsupport) - * 13.3 [Source part availability](#source-part-availability) - * 13.3.1 [RQ.ClickHouse.ExportPart.Restrictions.SourcePart](#rqclickhouseexportpartrestrictionssourcepart) -* 14 [Export operation concurrency](#export-operation-concurrency) - * 14.1 [RQ.ClickHouse.ExportPart.Concurrency](#rqclickhouseexportpartconcurrency) -* 15 [Export operation idempotency](#export-operation-idempotency) - * 15.1 [RQ.ClickHouse.ExportPart.Idempotency](#rqclickhouseexportpartidempotency) -* 16 [Export operation error recovery](#export-operation-error-recovery) - * 16.1 [Graceful failure handling](#graceful-failure-handling) - * 16.1.1 [RQ.ClickHouse.ExportPart.ErrorRecovery.GracefulFailure](#rqclickhouseexportparterrorrecoverygracefulfailure) - * 16.2 [Automatic cleanup on failure](#automatic-cleanup-on-failure) - * 16.2.1 [RQ.ClickHouse.ExportPart.ErrorRecovery.AutomaticCleanup](#rqclickhouseexportparterrorrecoveryautomaticcleanup) -* 17 [Export operation logging](#export-operation-logging) - * 17.1 [RQ.ClickHouse.ExportPart.Logging](#rqclickhouseexportpartlogging) -* 18 [Monitoring export operations](#monitoring-export-operations) - * 18.1 [RQ.ClickHouse.ExportPart.SystemTables.Exports](#rqclickhouseexportpartsystemtablesexports) -* 19 [Enabling export functionality](#enabling-export-functionality) - * 19.1 [RQ.ClickHouse.ExportPart.Settings.AllowExperimental](#rqclickhouseexportpartsettingsallowexperimental) -* 20 [Handling 
file conflicts during export](#handling-file-conflicts-during-export) - * 20.1 [RQ.ClickHouse.ExportPart.Settings.OverwriteFile](#rqclickhouseexportpartsettingsoverwritefile) -* 21 [Export operation configuration](#export-operation-configuration) - * 21.1 [RQ.ClickHouse.ExportPart.ParallelFormatting](#rqclickhouseexportpartparallelformatting) -* 22 [Controlling export performance](#controlling-export-performance) - * 22.1 [RQ.ClickHouse.ExportPart.ServerSettings.MaxBandwidth](#rqclickhouseexportpartserversettingsmaxbandwidth) -* 23 [Monitoring export performance metrics](#monitoring-export-performance-metrics) - * 23.1 [RQ.ClickHouse.ExportPart.Events](#rqclickhouseexportpartevents) - * 23.2 [RQ.ClickHouse.ExportPart.Metrics.Export](#rqclickhouseexportpartmetricsexport) -* 24 [Export operation security](#export-operation-security) - * 24.1 [RQ.ClickHouse.ExportPart.Security](#rqclickhouseexportpartsecurity) +* 5 [Cluster and node support](#cluster-and-node-support) + * 5.1 [RQ.ClickHouse.ExportPart.ClustersNodes](#rqclickhouseexportpartclustersnodes) +* 6 [Supported source part storage types](#supported-source-part-storage-types) + * 6.1 [RQ.ClickHouse.ExportPart.SourcePartStorage](#rqclickhouseexportpartsourcepartstorage) +* 7 [Storage policies and volumes](#storage-policies-and-volumes) + * 7.1 [RQ.ClickHouse.ExportPart.StoragePolicies](#rqclickhouseexportpartstoragepolicies) +* 8 [Supported destination table engines](#supported-destination-table-engines) + * 8.1 [RQ.ClickHouse.ExportPart.DestinationEngines](#rqclickhouseexportpartdestinationengines) +* 9 [Destination setup and file management](#destination-setup-and-file-management) + * 9.1 [RQ.ClickHouse.ExportPart.DestinationSetup](#rqclickhouseexportpartdestinationsetup) +* 10 [Export data preparation](#export-data-preparation) + * 10.1 [RQ.ClickHouse.ExportPart.DataPreparation](#rqclickhouseexportpartdatapreparation) +* 11 [Schema compatibility](#schema-compatibility) + * 11.1 [RQ.ClickHouse.ExportPart.SchemaCompatibility](#rqclickhouseexportpartschemacompatibility) +* 12 [Partition key types support](#partition-key-types-support) + * 12.1 [RQ.ClickHouse.ExportPart.PartitionKeyTypes](#rqclickhouseexportpartpartitionkeytypes) +* 13 [Part types and content support](#part-types-and-content-support) + * 13.1 [RQ.ClickHouse.ExportPart.PartTypes](#rqclickhouseexportpartparttypes) + * 13.2 [RQ.ClickHouse.ExportPart.SchemaChangeIsolation](#rqclickhouseexportpartschemachangeisolation) + * 13.3 [RQ.ClickHouse.ExportPart.LargeParts](#rqclickhouseexportpartlargeparts) +* 14 [Export operation failure handling](#export-operation-failure-handling) + * 14.1 [RQ.ClickHouse.ExportPart.FailureHandling](#rqclickhouseexportpartfailurehandling) +* 15 [Network resilience](#network-resilience) + * 15.1 [RQ.ClickHouse.ExportPart.NetworkResilience.PacketIssues](#rqclickhouseexportpartnetworkresiliencepacketissues) + * 15.2 [RQ.ClickHouse.ExportPart.NetworkResilience.DestinationInterruption](#rqclickhouseexportpartnetworkresiliencedestinationinterruption) + * 15.3 [RQ.ClickHouse.ExportPart.NetworkResilience.NodeInterruption](#rqclickhouseexportpartnetworkresiliencenodeinterruption) +* 16 [Export operation restrictions](#export-operation-restrictions) + * 16.1 [Preventing same table exports](#preventing-same-table-exports) + * 16.1.1 [RQ.ClickHouse.ExportPart.Restrictions.SameTable](#rqclickhouseexportpartrestrictionssametable) + * 16.2 [Destination table compatibility](#destination-table-compatibility) + * 16.2.1 
[RQ.ClickHouse.ExportPart.Restrictions.DestinationSupport](#rqclickhouseexportpartrestrictionsdestinationsupport) + * 16.3 [Local table restriction](#local-table-restriction) + * 16.3.1 [RQ.ClickHouse.ExportPart.Restrictions.LocalTable](#rqclickhouseexportpartrestrictionslocaltable) + * 16.4 [Partition key compatibility](#partition-key-compatibility) + * 16.4.1 [RQ.ClickHouse.ExportPart.Restrictions.PartitionKey](#rqclickhouseexportpartrestrictionspartitionkey) + * 16.5 [Source part availability](#source-part-availability) + * 16.5.1 [RQ.ClickHouse.ExportPart.Restrictions.SourcePart](#rqclickhouseexportpartrestrictionssourcepart) +* 17 [Export operation concurrency](#export-operation-concurrency) + * 17.1 [RQ.ClickHouse.ExportPart.Concurrency](#rqclickhouseexportpartconcurrency) +* 18 [Export operation idempotency](#export-operation-idempotency) + * 18.1 [RQ.ClickHouse.ExportPart.Idempotency](#rqclickhouseexportpartidempotency) +* 19 [Export operation logging](#export-operation-logging) + * 19.1 [RQ.ClickHouse.ExportPart.Logging](#rqclickhouseexportpartlogging) +* 20 [Monitoring export operations](#monitoring-export-operations) + * 20.1 [RQ.ClickHouse.ExportPart.SystemTables.Exports](#rqclickhouseexportpartsystemtablesexports) +* 21 [Enabling export functionality](#enabling-export-functionality) + * 21.1 [RQ.ClickHouse.ExportPart.Settings.AllowExperimental](#rqclickhouseexportpartsettingsallowexperimental) +* 22 [Handling file conflicts during export](#handling-file-conflicts-during-export) + * 22.1 [RQ.ClickHouse.ExportPart.Settings.OverwriteFile](#rqclickhouseexportpartsettingsoverwritefile) +* 23 [Export operation configuration](#export-operation-configuration) + * 23.1 [RQ.ClickHouse.ExportPart.ParallelFormatting](#rqclickhouseexportpartparallelformatting) +* 24 [Controlling export performance](#controlling-export-performance) + * 24.1 [RQ.ClickHouse.ExportPart.ServerSettings.MaxBandwidth](#rqclickhouseexportpartserversettingsmaxbandwidth) +* 25 [Monitoring export performance metrics](#monitoring-export-performance-metrics) + * 25.1 [RQ.ClickHouse.ExportPart.Events](#rqclickhouseexportpartevents) + * 25.2 [RQ.ClickHouse.ExportPart.Metrics.Export](#rqclickhouseexportpartmetricsexport) +* 26 [Export operation security](#export-operation-security) + * 26.1 [RQ.ClickHouse.ExportPart.Security](#rqclickhouseexportpartsecurity) ## Introduction @@ -71,6 +81,15 @@ version: 1.0 [ClickHouse] SHALL support exporting data parts from MergeTree engine tables to S3 object storage. 
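The sketch below is illustrative only and is not part of the requirement text. It shows, under stated assumptions, how a regression scenario in this suite could exercise the S3 export path: `export_parts`, `create_s3_table`, and `default_columns` are helpers from `s3/tests/export_part/steps.py`, while `create_source_table` is a hypothetical stand-in for the suite's own populated-source step.

```python
from testflows.core import *
from s3.tests.export_part.steps import *  # export_parts, create_s3_table, default_columns, ...


@TestScenario
def s3_export_sketch(self):
    """Minimal sketch: export all parts of a MergeTree table to an S3-backed table."""
    node = self.context.node

    with Given("a partitioned MergeTree source table with a few parts"):
        # `create_source_table` is hypothetical; the suite's own populated-table step
        # takes partition_by, columns, and stop_merges arguments.
        create_source_table(
            table_name="source",
            partition_by="p",
            columns=default_columns(),
            stop_merges=True,
        )

    with And("an empty S3 destination table in a fresh bucket"):
        s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True)

    with When("I export all parts with the experimental setting enabled inline"):
        export_parts(
            source_table="source",
            destination_table=s3_table_name,
            node=node,
            inline_settings=True,  # expands to allow_experimental_export_merge_tree_part = 1
        )

    with Then("source and destination row counts should match"):
        source_rows = node.query("SELECT count() FROM source").output
        exported_rows = node.query(f"SELECT count() FROM {s3_table_name}").output
        assert source_rows == exported_rows
```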
+### RQ.ClickHouse.ExportPart.EmptyTable +version: 1.0 + +[ClickHouse] SHALL support exporting from empty tables by: +* Completing export operations successfully when the source table contains no parts +* Resulting in an empty destination table when exporting from an empty source table +* Not creating any files in destination storage when there are no parts to export +* Handling empty tables gracefully without errors + ## SQL command support ### RQ.ClickHouse.ExportPart.SQLCommand @@ -104,6 +123,16 @@ version: 1.0 * `GraphiteMergeTree` - MergeTree optimized for Graphite data * All other MergeTree family engines that inherit from `MergeTreeData` +## Cluster and node support + +### RQ.ClickHouse.ExportPart.ClustersNodes +version: 1.0 + +[ClickHouse] SHALL support exporting parts from multiple nodes in a cluster to the same destination storage, ensuring that: +* Each node can independently export parts from its local storage to the shared destination +* Exported data from different nodes is correctly aggregated in the destination +* All nodes in the cluster can read the same exported data from the destination + ## Supported source part storage types ### RQ.ClickHouse.ExportPart.SourcePartStorage @@ -118,6 +147,19 @@ version: 1.0 * **Tiered Storage**: Parts stored across multiple storage tiers (hot/cold) * **Zero-Copy Replication Disks**: Parts stored with zero-copy replication enabled +## Storage policies and volumes + +### RQ.ClickHouse.ExportPart.StoragePolicies +version: 1.0 + +[ClickHouse] SHALL support exporting parts from tables using different storage policies, where storage policies are composed of volumes which are composed of disks, including: +* **JBOD Volumes**: Just a Bunch Of Disks volumes with multiple disks +* **External Volumes**: Volumes using external storage systems +* **Tiered Storage Policies**: Storage policies with multiple volumes for hot/cold data tiers +* **Custom Storage Policies**: Any storage policy configuration composed of volumes and disks +* Exporting parts regardless of which volume or disk within the storage policy contains the part +* Maintaining data integrity when exporting from parts stored on any volume or disk in the storage policy + ## Supported destination table engines ### RQ.ClickHouse.ExportPart.DestinationEngines @@ -141,7 +183,7 @@ version: 1.0 * Generating unique file names in the format `{part_name}_{checksum_hex}` to avoid conflicts * Allowing destination storage to determine the final file path based on Hive partitioning * Creating files in the destination storage that users can observe and access -* Providing the final destination file path in the `system.exports` table for monitoring +* Providing the final destination file path in the `system.part_log` table for monitoring ## Export data preparation @@ -193,33 +235,26 @@ version: 1.0 | **Wide Parts** | ✅ Yes | Data of each column stored in separate files with marks | Standard format for most parts | | **Compact Parts** | ✅ Yes | All column data stored in single file with single marks file | Optimized for small parts | -[ClickHouse] SHALL handle all special columns and metadata present in parts during export: - -| Column Type | Supported | Description | Export Behavior | -|-------------|------------|-------------|-----------------| -| **Physical Columns** | ✅ Yes | User-defined table columns | All physical columns exported | -| **RowExistsColumn (_row_exists)** | ✅ Yes | Lightweight delete mask showing row existence | Exported to maintain delete state | -| **BlockNumberColumn 
(_block_number)** | ✅ Yes | Original block number from insert | Exported for row identification | -| **BlockOffsetColumn (_block_offset)** | ✅ Yes | Original row offset within block | Exported for row identification | -| **PartDataVersionColumn (_part_data_version)** | ✅ Yes | Data version for mutations | Exported for version tracking | -| **Virtual Columns** | ✅ Yes | Runtime columns like _part, _partition_id | Generated during export | -| **System Metadata** | ✅ Yes | Checksums, compression info, serialization | Preserved in export | - -[ClickHouse] SHALL handle all mutation and schema change information present in parts: - -| Mutation/Schema Type | Supported | Description | Export Behavior | -|---------------------|------------|-------------|-----------------| -| **Mutation Commands** | ✅ Yes | DELETE, UPDATE, MATERIALIZE_INDEX, DROP_COLUMN, RENAME_COLUMN | Applied during export | -| **Alter Conversions** | ✅ Yes | Column renames, type changes, schema modifications | Applied during export | -| **Patch Parts** | ✅ Yes | Lightweight updates with only changed columns | Applied during export | -| **Mutation Versions** | ✅ Yes | Version tracking for applied mutations | Preserved in export | -| **Schema Changes** | ✅ Yes | ALTER MODIFY, ALTER DROP, ALTER RENAME | Applied during export | -| **TTL Information** | ✅ Yes | Time-to-live settings and expiration data | Preserved in export | -| **Index Information** | ✅ Yes | Primary key, secondary indices, projections | Preserved in export | -| **Statistics** | ✅ Yes | Column statistics and sampling information | Preserved in export | - [ClickHouse] SHALL automatically apply lightweight delete masks during export to ensure only non-deleted rows are exported, and SHALL handle all part metadata including checksums, compression information, serialization details, mutation history, schema changes, and structural modifications to maintain data integrity in the destination storage. 
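As an aside that is not part of the requirement text, the following hedged sketch (imports as in the previous sketch) shows one way a scenario could check the lightweight-delete behavior described above. The `export_parts` and `create_s3_table` helpers exist in `s3/tests/export_part/steps.py`; `create_populated_source` is a hypothetical name for the suite's populated-source step, and `p` is the partition column used by these tests.

```python
@TestScenario
def lightweight_delete_mask_sketch(self):
    """Exported data should contain only rows that survive a lightweight DELETE."""
    node = self.context.node

    with Given("a populated MergeTree source table and an empty S3 table"):
        # `create_populated_source` is a hypothetical stand-in for the suite's helper.
        create_populated_source(
            table_name="source", partition_by="p", columns=default_columns()
        )
        s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True)

    with When("I remove one partition's rows with a lightweight DELETE"):
        node.query("DELETE FROM source WHERE p = 0")

    with And("I export all parts"):
        export_parts(
            source_table="source",
            destination_table=s3_table_name,
            node=node,
            inline_settings=True,
        )

    with Then("only the surviving rows should be readable from the destination"):
        surviving = node.query("SELECT count() FROM source").output
        exported = node.query(f"SELECT count() FROM {s3_table_name}").output
        assert surviving == exported
```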
+### RQ.ClickHouse.ExportPart.SchemaChangeIsolation +version: 1.0 + +[ClickHouse] SHALL ensure exported data is isolated from subsequent schema changes by: +* Preserving exported data exactly as it was at the time of export +* Not being affected by schema changes (column drops, renames, type changes) that occur after export +* Maintaining data integrity in destination storage regardless of mutations applied to the source table after export +* Ensuring exported data reflects the source table state at the time of export, not the current state + +### RQ.ClickHouse.ExportPart.LargeParts +version: 1.0 + +[ClickHouse] SHALL support exporting large parts by: +* Handling parts with large numbers of rows (e.g., 100 million or more) +* Processing large data volumes efficiently during export +* Maintaining data integrity when exporting large parts +* Completing export operations successfully regardless of part size + ## Export operation failure handling ### RQ.ClickHouse.ExportPart.FailureHandling @@ -232,6 +267,40 @@ version: 1.0 * **Simple Failure**: Export operations either succeed completely or fail with an error message * **No Partial Exports**: Failed exports leave no partial or corrupted data in destination storage +## Network resilience + +### RQ.ClickHouse.ExportPart.NetworkResilience.PacketIssues +version: 1.0 + +[ClickHouse] SHALL handle network packet issues during export operations by: +* Tolerating packet delay without data corruption or loss +* Handling packet loss and retransmitting data as needed +* Detecting and handling packet corruption to ensure data integrity +* Managing packet duplication without data duplication in destination +* Handling packet reordering to maintain correct data sequence +* Operating correctly under packet rate limiting constraints +* Completing exports successfully despite network impairments + +### RQ.ClickHouse.ExportPart.NetworkResilience.DestinationInterruption +version: 1.0 + +[ClickHouse] SHALL handle destination storage interruptions during export operations by: +* Detecting when destination storage becomes unavailable during export +* Failing export operations gracefully when destination storage is unavailable +* Logging failed exports in the `system.events` table with `PartsExportFailures` counter +* Not leaving partial or corrupted data in destination storage when exports fail due to destination unavailability +* Allowing exports to complete successfully once destination storage becomes available again + +### RQ.ClickHouse.ExportPart.NetworkResilience.NodeInterruption +version: 1.0 + +[ClickHouse] SHALL handle ClickHouse node interruptions during export operations by: +* Handling node restarts gracefully during export operations +* Not leaving partial or corrupted data in destination storage when node restarts occur +* With safe shutdown, ensuring exports complete successfully before node shutdown +* With unsafe shutdown, allowing partial exports to complete successfully after node restart +* Maintaining data integrity in destination storage regardless of node interruption type + ## Export operation restrictions ### Preventing same table exports @@ -256,6 +325,26 @@ version: 1.0 * Throwing a `NOT_IMPLEMENTED` exception with message "Destination storage {} does not support MergeTree parts or uses unsupported partitioning" when requirements are not met * Performing this validation during the initial export setup phase +### Local table restriction + +#### RQ.ClickHouse.ExportPart.Restrictions.LocalTable +version: 1.0 + +[ClickHouse] SHALL prevent 
exporting parts to local MergeTree tables by: +* Rejecting export operations where the destination table uses a MergeTree engine +* Throwing a `NOT_IMPLEMENTED` exception (error code 48) with message "Destination storage MergeTree does not support MergeTree parts or uses unsupported partitioning" when attempting to export to a local table +* Performing this validation during the initial export setup phase + +### Partition key compatibility + +#### RQ.ClickHouse.ExportPart.Restrictions.PartitionKey +version: 1.0 + +[ClickHouse] SHALL validate that source and destination tables have the same partition key expression by: +* Checking that the partition key expression matches between source and destination tables +* Throwing a `BAD_ARGUMENTS` exception (error code 36) with message "Tables have different partition key" when partition keys differ +* Performing this validation during the initial export setup phase + ### Source part availability #### RQ.ClickHouse.ExportPart.Restrictions.SourcePart @@ -295,31 +384,6 @@ version: 1.0 * Generating unique file names using part name and checksum to avoid conflicts * Maintaining export state consistency across retries -## Export operation error recovery - -### Graceful failure handling - -#### RQ.ClickHouse.ExportPart.ErrorRecovery.GracefulFailure -version: 1.0 - -[ClickHouse] SHALL handle export failures gracefully by: -* Allowing users to retry failed export operations -* Maintaining system stability even when exports fail -* Not corrupting source data when export operations fail -* Continuing to process other export operations when one fails - -### Automatic cleanup on failure - -#### RQ.ClickHouse.ExportPart.ErrorRecovery.AutomaticCleanup -version: 1.0 - -[ClickHouse] SHALL automatically clean up failed export operations by: -* Removing export manifests from the system when operations fail -* Cleaning up any partial data written to destination storage -* Releasing system resources (memory, file handles) used by failed exports -* Updating export status to reflect the failure state -* Allowing the system to recover and process other export operations - ## Export operation logging ### RQ.ClickHouse.ExportPart.Logging @@ -330,14 +394,31 @@ version: 1.0 * Recording the specific part name and destination for all operations * Including execution time and progress information for all operations * Writing operation information to the `system.part_log` table with the following columns: + * `hostname` - Hostname of the server where the export operation occurred + * `query_id` - Query ID of the export operation * `event_type` - Set to `EXPORT_PART` for export operations + * `event_date` - Date when the export operation occurred * `event_time` - Timestamp when the export operation occurred + * `event_time_microseconds` - Timestamp with microsecond precision + * `duration_ms` - Execution time in milliseconds + * `database` - Source database name * `table` - Source table name + * `table_uuid` - UUID of the source table * `part_name` - Name of the part being exported + * `partition_id` - Partition ID of the part being exported + * `partition` - Partition name of the part being exported + * `part_type` - Type of the part (e.g., Wide, Compact) + * `disk_name` - Name of the disk where the part is stored * `path_on_disk` - Path to the part in source storage - * `duration_ms` - Execution time in milliseconds + * `rows` - Number of rows in the part + * `size_in_bytes` - Size of the part in bytes + * `bytes_uncompressed` - Uncompressed size of the part in bytes + * 
`read_rows` - Number of rows read during export + * `read_bytes` - Number of bytes read during export + * `peak_memory_usage` - Peak memory usage during the export operation * `error` - Error message if the export failed (empty for successful exports) - * `thread_id` - Thread ID performing the export + * `exception` - Exception details if the export failed + * `ProfileEvents` - Profile events collected during the export operation * Providing sufficient detail for monitoring and troubleshooting export operations ## Monitoring export operations @@ -399,9 +480,6 @@ version: 1.0 * `PartsExports` - Number of successful part exports * `PartsExportFailures` - Number of failed part exports * `PartsExportDuplicated` - Number of part exports that failed because target already exists -* `PartsExportTotalMilliseconds` - Total time spent on part export operations in milliseconds -* `ExportsThrottlerBytes` - Bytes passed through the exports throttler -* `ExportsThrottlerSleepMicroseconds` - Total time queries were sleeping to conform to export bandwidth throttling ### RQ.ClickHouse.ExportPart.Metrics.Export version: 1.0 diff --git a/s3/requirements/export_part.py b/s3/requirements/export_part.py index 722432bfc..f6072746d 100644 --- a/s3/requirements/export_part.py +++ b/s3/requirements/export_part.py @@ -24,6 +24,26 @@ num="2.1", ) +RQ_ClickHouse_ExportPart_EmptyTable = Requirement( + name="RQ.ClickHouse.ExportPart.EmptyTable", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting from empty tables by:\n" + "* Completing export operations successfully when the source table contains no parts\n" + "* Resulting in an empty destination table when exporting from an empty source table\n" + "* Not creating any files in destination storage when there are no parts to export\n" + "* Handling empty tables gracefully without errors\n" + "\n" + ), + link=None, + level=2, + num="2.2", +) + RQ_ClickHouse_ExportPart_SQLCommand = Requirement( name="RQ.ClickHouse.ExportPart.SQLCommand", version="1.0", @@ -75,6 +95,25 @@ num="4.1", ) +RQ_ClickHouse_ExportPart_ClustersNodes = Requirement( + name="RQ.ClickHouse.ExportPart.ClustersNodes", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting parts from multiple nodes in a cluster to the same destination storage, ensuring that:\n" + "* Each node can independently export parts from its local storage to the shared destination\n" + "* Exported data from different nodes is correctly aggregated in the destination\n" + "* All nodes in the cluster can read the same exported data from the destination\n" + "\n" + ), + link=None, + level=2, + num="5.1", +) + RQ_ClickHouse_ExportPart_SourcePartStorage = Requirement( name="RQ.ClickHouse.ExportPart.SourcePartStorage", version="1.0", @@ -95,7 +134,29 @@ ), link=None, level=2, - num="5.1", + num="6.1", +) + +RQ_ClickHouse_ExportPart_StoragePolicies = Requirement( + name="RQ.ClickHouse.ExportPart.StoragePolicies", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting parts from tables using different storage policies, where storage policies are composed of volumes which are composed of disks, including:\n" + "* **JBOD Volumes**: Just a Bunch Of Disks volumes with multiple disks\n" + "* **External Volumes**: Volumes using external storage systems\n" + "* **Tiered Storage Policies**: Storage policies with multiple volumes 
for hot/cold data tiers\n" + "* **Custom Storage Policies**: Any storage policy configuration composed of volumes and disks\n" + "* Exporting parts regardless of which volume or disk within the storage policy contains the part\n" + "* Maintaining data integrity when exporting from parts stored on any volume or disk in the storage policy\n" + "\n" + ), + link=None, + level=2, + num="7.1", ) RQ_ClickHouse_ExportPart_DestinationEngines = Requirement( @@ -117,7 +178,7 @@ ), link=None, level=2, - num="6.1", + num="8.1", ) RQ_ClickHouse_ExportPart_DestinationSetup = Requirement( @@ -133,12 +194,12 @@ "* Generating unique file names in the format `{part_name}_{checksum_hex}` to avoid conflicts\n" "* Allowing destination storage to determine the final file path based on Hive partitioning\n" "* Creating files in the destination storage that users can observe and access\n" - "* Providing the final destination file path in the `system.exports` table for monitoring\n" + "* Providing the final destination file path in the `system.part_log` table for monitoring\n" "\n" ), link=None, level=2, - num="7.1", + num="9.1", ) RQ_ClickHouse_ExportPart_DataPreparation = Requirement( @@ -156,7 +217,7 @@ ), link=None, level=2, - num="8.1", + num="10.1", ) RQ_ClickHouse_ExportPart_SchemaCompatibility = Requirement( @@ -176,7 +237,7 @@ ), link=None, level=2, - num="9.1", + num="11.1", ) RQ_ClickHouse_ExportPart_PartitionKeyTypes = Requirement( @@ -203,7 +264,7 @@ ), link=None, level=2, - num="10.1", + num="12.1", ) RQ_ClickHouse_ExportPart_PartTypes = Requirement( @@ -221,37 +282,52 @@ "| **Wide Parts** | ✅ Yes | Data of each column stored in separate files with marks | Standard format for most parts |\n" "| **Compact Parts** | ✅ Yes | All column data stored in single file with single marks file | Optimized for small parts |\n" "\n" - "[ClickHouse] SHALL handle all special columns and metadata present in parts during export:\n" - "\n" - "| Column Type | Supported | Description | Export Behavior |\n" - "|-------------|------------|-------------|-----------------|\n" - "| **Physical Columns** | ✅ Yes | User-defined table columns | All physical columns exported |\n" - "| **RowExistsColumn (_row_exists)** | ✅ Yes | Lightweight delete mask showing row existence | Exported to maintain delete state |\n" - "| **BlockNumberColumn (_block_number)** | ✅ Yes | Original block number from insert | Exported for row identification |\n" - "| **BlockOffsetColumn (_block_offset)** | ✅ Yes | Original row offset within block | Exported for row identification |\n" - "| **PartDataVersionColumn (_part_data_version)** | ✅ Yes | Data version for mutations | Exported for version tracking |\n" - "| **Virtual Columns** | ✅ Yes | Runtime columns like _part, _partition_id | Generated during export |\n" - "| **System Metadata** | ✅ Yes | Checksums, compression info, serialization | Preserved in export |\n" - "\n" - "[ClickHouse] SHALL handle all mutation and schema change information present in parts:\n" + "[ClickHouse] SHALL automatically apply lightweight delete masks during export to ensure only non-deleted rows are exported, and SHALL handle all part metadata including checksums, compression information, serialization details, mutation history, schema changes, and structural modifications to maintain data integrity in the destination storage.\n" "\n" - "| Mutation/Schema Type | Supported | Description | Export Behavior |\n" - "|---------------------|------------|-------------|-----------------|\n" - "| **Mutation Commands** | ✅ Yes | DELETE, 
UPDATE, MATERIALIZE_INDEX, DROP_COLUMN, RENAME_COLUMN | Applied during export |\n" - "| **Alter Conversions** | ✅ Yes | Column renames, type changes, schema modifications | Applied during export |\n" - "| **Patch Parts** | ✅ Yes | Lightweight updates with only changed columns | Applied during export |\n" - "| **Mutation Versions** | ✅ Yes | Version tracking for applied mutations | Preserved in export |\n" - "| **Schema Changes** | ✅ Yes | ALTER MODIFY, ALTER DROP, ALTER RENAME | Applied during export |\n" - "| **TTL Information** | ✅ Yes | Time-to-live settings and expiration data | Preserved in export |\n" - "| **Index Information** | ✅ Yes | Primary key, secondary indices, projections | Preserved in export |\n" - "| **Statistics** | ✅ Yes | Column statistics and sampling information | Preserved in export |\n" + ), + link=None, + level=2, + num="13.1", +) + +RQ_ClickHouse_ExportPart_SchemaChangeIsolation = Requirement( + name="RQ.ClickHouse.ExportPart.SchemaChangeIsolation", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL ensure exported data is isolated from subsequent schema changes by:\n" + "* Preserving exported data exactly as it was at the time of export\n" + "* Not being affected by schema changes (column drops, renames, type changes) that occur after export\n" + "* Maintaining data integrity in destination storage regardless of mutations applied to the source table after export\n" + "* Ensuring exported data reflects the source table state at the time of export, not the current state\n" "\n" - "[ClickHouse] SHALL automatically apply lightweight delete masks during export to ensure only non-deleted rows are exported, and SHALL handle all part metadata including checksums, compression information, serialization details, mutation history, schema changes, and structural modifications to maintain data integrity in the destination storage.\n" + ), + link=None, + level=2, + num="13.2", +) + +RQ_ClickHouse_ExportPart_LargeParts = Requirement( + name="RQ.ClickHouse.ExportPart.LargeParts", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting large parts by:\n" + "* Handling parts with large numbers of rows (e.g., 100 million or more)\n" + "* Processing large data volumes efficiently during export\n" + "* Maintaining data integrity when exporting large parts\n" + "* Completing export operations successfully regardless of part size\n" "\n" ), link=None, level=2, - num="11.1", + num="13.3", ) RQ_ClickHouse_ExportPart_FailureHandling = Requirement( @@ -272,7 +348,72 @@ ), link=None, level=2, - num="12.1", + num="14.1", +) + +RQ_ClickHouse_ExportPart_NetworkResilience_PacketIssues = Requirement( + name="RQ.ClickHouse.ExportPart.NetworkResilience.PacketIssues", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL handle network packet issues during export operations by:\n" + "* Tolerating packet delay without data corruption or loss\n" + "* Handling packet loss and retransmitting data as needed\n" + "* Detecting and handling packet corruption to ensure data integrity\n" + "* Managing packet duplication without data duplication in destination\n" + "* Handling packet reordering to maintain correct data sequence\n" + "* Operating correctly under packet rate limiting constraints\n" + "* Completing exports successfully despite network impairments\n" + "\n" + ), + link=None, + level=2, + num="15.1", +) + 
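# Illustrative aside, not part of the generated requirements above or below: a hedged
# sketch of the counter check a network-impairment scenario could reuse to verify the
# PacketIssues requirement. It relies only on the PartsExports and PartsExportFailures
# counters named in RQ.ClickHouse.ExportPart.Events; the helper itself is an
# assumption, not an existing part of this suite.
def sketch_export_counters(node):
    """Return the PartsExports and PartsExportFailures counters from system.events."""
    output = node.query(
        "SELECT name, value FROM system.events"
        " WHERE name IN ('PartsExports', 'PartsExportFailures') FORMAT TSV"
    ).output
    counters = {"PartsExports": 0, "PartsExportFailures": 0}
    for line in output.splitlines():
        name, value = line.split("\t")
        counters[name] = int(value)
    return counters
# Intended use (sketch): read the counters, run the export while the impairment
# (delay, loss, corruption, duplication, reordering, rate limiting) is active, read
# them again, and assert that PartsExports grew while PartsExportFailures stayed flat.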
+RQ_ClickHouse_ExportPart_NetworkResilience_DestinationInterruption = Requirement( + name="RQ.ClickHouse.ExportPart.NetworkResilience.DestinationInterruption", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL handle destination storage interruptions during export operations by:\n" + "* Detecting when destination storage becomes unavailable during export\n" + "* Failing export operations gracefully when destination storage is unavailable\n" + "* Logging failed exports in the `system.events` table with `PartsExportFailures` counter\n" + "* Not leaving partial or corrupted data in destination storage when exports fail due to destination unavailability\n" + "* Allowing exports to complete successfully once destination storage becomes available again\n" + "\n" + ), + link=None, + level=2, + num="15.2", +) + +RQ_ClickHouse_ExportPart_NetworkResilience_NodeInterruption = Requirement( + name="RQ.ClickHouse.ExportPart.NetworkResilience.NodeInterruption", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL handle ClickHouse node interruptions during export operations by:\n" + "* Handling node restarts gracefully during export operations\n" + "* Not leaving partial or corrupted data in destination storage when node restarts occur\n" + "* With safe shutdown, ensuring exports complete successfully before node shutdown\n" + "* With unsafe shutdown, allowing partial exports to complete successfully after node restart\n" + "* Maintaining data integrity in destination storage regardless of node interruption type\n" + "\n" + ), + link=None, + level=2, + num="15.3", ) RQ_ClickHouse_ExportPart_Restrictions_SameTable = Requirement( @@ -291,7 +432,7 @@ ), link=None, level=3, - num="13.1.1", + num="16.1.1", ) RQ_ClickHouse_ExportPart_Restrictions_DestinationSupport = Requirement( @@ -312,7 +453,45 @@ ), link=None, level=3, - num="13.2.1", + num="16.2.1", +) + +RQ_ClickHouse_ExportPart_Restrictions_LocalTable = Requirement( + name="RQ.ClickHouse.ExportPart.Restrictions.LocalTable", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL prevent exporting parts to local MergeTree tables by:\n" + "* Rejecting export operations where the destination table uses a MergeTree engine\n" + '* Throwing a `NOT_IMPLEMENTED` exception (error code 48) with message "Destination storage MergeTree does not support MergeTree parts or uses unsupported partitioning" when attempting to export to a local table\n' + "* Performing this validation during the initial export setup phase\n" + "\n" + ), + link=None, + level=3, + num="16.3.1", +) + +RQ_ClickHouse_ExportPart_Restrictions_PartitionKey = Requirement( + name="RQ.ClickHouse.ExportPart.Restrictions.PartitionKey", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL validate that source and destination tables have the same partition key expression by:\n" + "* Checking that the partition key expression matches between source and destination tables\n" + '* Throwing a `BAD_ARGUMENTS` exception (error code 36) with message "Tables have different partition key" when partition keys differ\n' + "* Performing this validation during the initial export setup phase\n" + "\n" + ), + link=None, + level=3, + num="16.4.1", ) RQ_ClickHouse_ExportPart_Restrictions_SourcePart = Requirement( @@ -333,7 +512,7 @@ ), link=None, level=3, - num="13.3.1", + num="16.5.1", ) 
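# Illustrative aside, not part of the generated requirements above or below: a hedged,
# commented-out sketch of a negative check for the PartitionKey restriction defined
# above. It assumes the export_parts and create_s3_table helpers from
# s3/tests/export_part/steps.py, a source table partitioned by `p`, and that
# create_s3_table accepts a partition_by argument (an assumption); the expected
# message and BAD_ARGUMENTS code 36 come from the requirement text.
#
#   with Given("a source table partitioned by p and an S3 table partitioned differently"):
#       s3_table_name = create_s3_table(
#           table_name="s3_mismatched", partition_by="i", create_new_bucket=True
#       )
#
#   with When("I try to export parts across mismatched partition keys"):
#       output = export_parts(
#           source_table="source",
#           destination_table=s3_table_name,
#           node=self.context.node,
#           inline_settings=True,
#           exitcode=36,  # any non-zero exitcode puts the helper into no-check mode
#       )
#
#   with Then("the export should be rejected with the partition key error"):
#       assert "Tables have different partition key" in str(output)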
RQ_ClickHouse_ExportPart_Concurrency = Requirement( @@ -357,7 +536,7 @@ ), link=None, level=2, - num="14.1", + num="17.1", ) RQ_ClickHouse_ExportPart_Idempotency = Requirement( @@ -378,48 +557,7 @@ ), link=None, level=2, - num="15.1", -) - -RQ_ClickHouse_ExportPart_ErrorRecovery_GracefulFailure = Requirement( - name="RQ.ClickHouse.ExportPart.ErrorRecovery.GracefulFailure", - version="1.0", - priority=None, - group=None, - type=None, - uid=None, - description=( - "[ClickHouse] SHALL handle export failures gracefully by:\n" - "* Allowing users to retry failed export operations\n" - "* Maintaining system stability even when exports fail\n" - "* Not corrupting source data when export operations fail\n" - "* Continuing to process other export operations when one fails\n" - "\n" - ), - link=None, - level=3, - num="16.1.1", -) - -RQ_ClickHouse_ExportPart_ErrorRecovery_AutomaticCleanup = Requirement( - name="RQ.ClickHouse.ExportPart.ErrorRecovery.AutomaticCleanup", - version="1.0", - priority=None, - group=None, - type=None, - uid=None, - description=( - "[ClickHouse] SHALL automatically clean up failed export operations by:\n" - "* Removing export manifests from the system when operations fail\n" - "* Cleaning up any partial data written to destination storage\n" - "* Releasing system resources (memory, file handles) used by failed exports\n" - "* Updating export status to reflect the failure state\n" - "* Allowing the system to recover and process other export operations\n" - "\n" - ), - link=None, - level=3, - num="16.2.1", + num="18.1", ) RQ_ClickHouse_ExportPart_Logging = Requirement( @@ -435,20 +573,37 @@ "* Recording the specific part name and destination for all operations\n" "* Including execution time and progress information for all operations\n" "* Writing operation information to the `system.part_log` table with the following columns:\n" + " * `hostname` - Hostname of the server where the export operation occurred\n" + " * `query_id` - Query ID of the export operation\n" " * `event_type` - Set to `EXPORT_PART` for export operations\n" + " * `event_date` - Date when the export operation occurred\n" " * `event_time` - Timestamp when the export operation occurred\n" + " * `event_time_microseconds` - Timestamp with microsecond precision\n" + " * `duration_ms` - Execution time in milliseconds\n" + " * `database` - Source database name\n" " * `table` - Source table name\n" + " * `table_uuid` - UUID of the source table\n" " * `part_name` - Name of the part being exported\n" + " * `partition_id` - Partition ID of the part being exported\n" + " * `partition` - Partition name of the part being exported\n" + " * `part_type` - Type of the part (e.g., Wide, Compact)\n" + " * `disk_name` - Name of the disk where the part is stored\n" " * `path_on_disk` - Path to the part in source storage\n" - " * `duration_ms` - Execution time in milliseconds\n" + " * `rows` - Number of rows in the part\n" + " * `size_in_bytes` - Size of the part in bytes\n" + " * `bytes_uncompressed` - Uncompressed size of the part in bytes\n" + " * `read_rows` - Number of rows read during export\n" + " * `read_bytes` - Number of bytes read during export\n" + " * `peak_memory_usage` - Peak memory usage during the export operation\n" " * `error` - Error message if the export failed (empty for successful exports)\n" - " * `thread_id` - Thread ID performing the export\n" + " * `exception` - Exception details if the export failed\n" + " * `ProfileEvents` - Profile events collected during the export operation\n" "* Providing 
sufficient detail for monitoring and troubleshooting export operations\n" "\n" ), link=None, level=2, - num="17.1", + num="19.1", ) RQ_ClickHouse_ExportPart_SystemTables_Exports = Requirement( @@ -475,7 +630,7 @@ ), link=None, level=2, - num="18.1", + num="20.1", ) RQ_ClickHouse_ExportPart_Settings_AllowExperimental = Requirement( @@ -491,7 +646,7 @@ ), link=None, level=2, - num="19.1", + num="21.1", ) RQ_ClickHouse_ExportPart_Settings_OverwriteFile = Requirement( @@ -507,7 +662,7 @@ ), link=None, level=2, - num="20.1", + num="22.1", ) RQ_ClickHouse_ExportPart_ParallelFormatting = Requirement( @@ -527,7 +682,7 @@ ), link=None, level=2, - num="21.1", + num="23.1", ) RQ_ClickHouse_ExportPart_ServerSettings_MaxBandwidth = Requirement( @@ -543,7 +698,7 @@ ), link=None, level=2, - num="22.1", + num="24.1", ) RQ_ClickHouse_ExportPart_Events = Requirement( @@ -558,14 +713,11 @@ "* `PartsExports` - Number of successful part exports\n" "* `PartsExportFailures` - Number of failed part exports \n" "* `PartsExportDuplicated` - Number of part exports that failed because target already exists\n" - "* `PartsExportTotalMilliseconds` - Total time spent on part export operations in milliseconds\n" - "* `ExportsThrottlerBytes` - Bytes passed through the exports throttler\n" - "* `ExportsThrottlerSleepMicroseconds` - Total time queries were sleeping to conform to export bandwidth throttling\n" "\n" ), link=None, level=2, - num="23.1", + num="25.1", ) RQ_ClickHouse_ExportPart_Metrics_Export = Requirement( @@ -581,7 +733,7 @@ ), link=None, level=2, - num="23.2", + num="25.2", ) RQ_ClickHouse_ExportPart_Security = Requirement( @@ -607,7 +759,7 @@ ), link=None, level=2, - num="24.1", + num="26.1", ) SRS_015_ClickHouse_Export_Part_to_S3 = Specification( @@ -630,115 +782,147 @@ Heading(name="Introduction", level=1, num="1"), Heading(name="Exporting Parts to S3", level=1, num="2"), Heading(name="RQ.ClickHouse.ExportPart.S3", level=2, num="2.1"), + Heading(name="RQ.ClickHouse.ExportPart.EmptyTable", level=2, num="2.2"), Heading(name="SQL command support", level=1, num="3"), Heading(name="RQ.ClickHouse.ExportPart.SQLCommand", level=2, num="3.1"), Heading(name="Supported source table engines", level=1, num="4"), Heading(name="RQ.ClickHouse.ExportPart.SourceEngines", level=2, num="4.1"), - Heading(name="Supported source part storage types", level=1, num="5"), - Heading(name="RQ.ClickHouse.ExportPart.SourcePartStorage", level=2, num="5.1"), - Heading(name="Supported destination table engines", level=1, num="6"), - Heading(name="RQ.ClickHouse.ExportPart.DestinationEngines", level=2, num="6.1"), - Heading(name="Destination setup and file management", level=1, num="7"), - Heading(name="RQ.ClickHouse.ExportPart.DestinationSetup", level=2, num="7.1"), - Heading(name="Export data preparation", level=1, num="8"), - Heading(name="RQ.ClickHouse.ExportPart.DataPreparation", level=2, num="8.1"), - Heading(name="Schema compatibility", level=1, num="9"), + Heading(name="Cluster and node support", level=1, num="5"), + Heading(name="RQ.ClickHouse.ExportPart.ClustersNodes", level=2, num="5.1"), + Heading(name="Supported source part storage types", level=1, num="6"), + Heading(name="RQ.ClickHouse.ExportPart.SourcePartStorage", level=2, num="6.1"), + Heading(name="Storage policies and volumes", level=1, num="7"), + Heading(name="RQ.ClickHouse.ExportPart.StoragePolicies", level=2, num="7.1"), + Heading(name="Supported destination table engines", level=1, num="8"), + Heading(name="RQ.ClickHouse.ExportPart.DestinationEngines", level=2, 
num="8.1"), + Heading(name="Destination setup and file management", level=1, num="9"), + Heading(name="RQ.ClickHouse.ExportPart.DestinationSetup", level=2, num="9.1"), + Heading(name="Export data preparation", level=1, num="10"), + Heading(name="RQ.ClickHouse.ExportPart.DataPreparation", level=2, num="10.1"), + Heading(name="Schema compatibility", level=1, num="11"), + Heading( + name="RQ.ClickHouse.ExportPart.SchemaCompatibility", level=2, num="11.1" + ), + Heading(name="Partition key types support", level=1, num="12"), + Heading(name="RQ.ClickHouse.ExportPart.PartitionKeyTypes", level=2, num="12.1"), + Heading(name="Part types and content support", level=1, num="13"), + Heading(name="RQ.ClickHouse.ExportPart.PartTypes", level=2, num="13.1"), Heading( - name="RQ.ClickHouse.ExportPart.SchemaCompatibility", level=2, num="9.1" + name="RQ.ClickHouse.ExportPart.SchemaChangeIsolation", level=2, num="13.2" ), - Heading(name="Partition key types support", level=1, num="10"), - Heading(name="RQ.ClickHouse.ExportPart.PartitionKeyTypes", level=2, num="10.1"), - Heading(name="Part types and content support", level=1, num="11"), - Heading(name="RQ.ClickHouse.ExportPart.PartTypes", level=2, num="11.1"), - Heading(name="Export operation failure handling", level=1, num="12"), - Heading(name="RQ.ClickHouse.ExportPart.FailureHandling", level=2, num="12.1"), - Heading(name="Export operation restrictions", level=1, num="13"), - Heading(name="Preventing same table exports", level=2, num="13.1"), + Heading(name="RQ.ClickHouse.ExportPart.LargeParts", level=2, num="13.3"), + Heading(name="Export operation failure handling", level=1, num="14"), + Heading(name="RQ.ClickHouse.ExportPart.FailureHandling", level=2, num="14.1"), + Heading(name="Network resilience", level=1, num="15"), + Heading( + name="RQ.ClickHouse.ExportPart.NetworkResilience.PacketIssues", + level=2, + num="15.1", + ), + Heading( + name="RQ.ClickHouse.ExportPart.NetworkResilience.DestinationInterruption", + level=2, + num="15.2", + ), + Heading( + name="RQ.ClickHouse.ExportPart.NetworkResilience.NodeInterruption", + level=2, + num="15.3", + ), + Heading(name="Export operation restrictions", level=1, num="16"), + Heading(name="Preventing same table exports", level=2, num="16.1"), Heading( name="RQ.ClickHouse.ExportPart.Restrictions.SameTable", level=3, - num="13.1.1", + num="16.1.1", ), - Heading(name="Destination table compatibility", level=2, num="13.2"), + Heading(name="Destination table compatibility", level=2, num="16.2"), Heading( name="RQ.ClickHouse.ExportPart.Restrictions.DestinationSupport", level=3, - num="13.2.1", + num="16.2.1", ), - Heading(name="Source part availability", level=2, num="13.3"), + Heading(name="Local table restriction", level=2, num="16.3"), Heading( - name="RQ.ClickHouse.ExportPart.Restrictions.SourcePart", + name="RQ.ClickHouse.ExportPart.Restrictions.LocalTable", level=3, - num="13.3.1", + num="16.3.1", ), - Heading(name="Export operation concurrency", level=1, num="14"), - Heading(name="RQ.ClickHouse.ExportPart.Concurrency", level=2, num="14.1"), - Heading(name="Export operation idempotency", level=1, num="15"), - Heading(name="RQ.ClickHouse.ExportPart.Idempotency", level=2, num="15.1"), - Heading(name="Export operation error recovery", level=1, num="16"), - Heading(name="Graceful failure handling", level=2, num="16.1"), + Heading(name="Partition key compatibility", level=2, num="16.4"), Heading( - name="RQ.ClickHouse.ExportPart.ErrorRecovery.GracefulFailure", + 
name="RQ.ClickHouse.ExportPart.Restrictions.PartitionKey", level=3, - num="16.1.1", + num="16.4.1", ), - Heading(name="Automatic cleanup on failure", level=2, num="16.2"), + Heading(name="Source part availability", level=2, num="16.5"), Heading( - name="RQ.ClickHouse.ExportPart.ErrorRecovery.AutomaticCleanup", + name="RQ.ClickHouse.ExportPart.Restrictions.SourcePart", level=3, - num="16.2.1", + num="16.5.1", ), - Heading(name="Export operation logging", level=1, num="17"), - Heading(name="RQ.ClickHouse.ExportPart.Logging", level=2, num="17.1"), - Heading(name="Monitoring export operations", level=1, num="18"), + Heading(name="Export operation concurrency", level=1, num="17"), + Heading(name="RQ.ClickHouse.ExportPart.Concurrency", level=2, num="17.1"), + Heading(name="Export operation idempotency", level=1, num="18"), + Heading(name="RQ.ClickHouse.ExportPart.Idempotency", level=2, num="18.1"), + Heading(name="Export operation logging", level=1, num="19"), + Heading(name="RQ.ClickHouse.ExportPart.Logging", level=2, num="19.1"), + Heading(name="Monitoring export operations", level=1, num="20"), Heading( - name="RQ.ClickHouse.ExportPart.SystemTables.Exports", level=2, num="18.1" + name="RQ.ClickHouse.ExportPart.SystemTables.Exports", level=2, num="20.1" ), - Heading(name="Enabling export functionality", level=1, num="19"), + Heading(name="Enabling export functionality", level=1, num="21"), Heading( name="RQ.ClickHouse.ExportPart.Settings.AllowExperimental", level=2, - num="19.1", + num="21.1", ), - Heading(name="Handling file conflicts during export", level=1, num="20"), + Heading(name="Handling file conflicts during export", level=1, num="22"), Heading( - name="RQ.ClickHouse.ExportPart.Settings.OverwriteFile", level=2, num="20.1" + name="RQ.ClickHouse.ExportPart.Settings.OverwriteFile", level=2, num="22.1" ), - Heading(name="Export operation configuration", level=1, num="21"), + Heading(name="Export operation configuration", level=1, num="23"), Heading( - name="RQ.ClickHouse.ExportPart.ParallelFormatting", level=2, num="21.1" + name="RQ.ClickHouse.ExportPart.ParallelFormatting", level=2, num="23.1" ), - Heading(name="Controlling export performance", level=1, num="22"), + Heading(name="Controlling export performance", level=1, num="24"), Heading( name="RQ.ClickHouse.ExportPart.ServerSettings.MaxBandwidth", level=2, - num="22.1", + num="24.1", ), - Heading(name="Monitoring export performance metrics", level=1, num="23"), - Heading(name="RQ.ClickHouse.ExportPart.Events", level=2, num="23.1"), - Heading(name="RQ.ClickHouse.ExportPart.Metrics.Export", level=2, num="23.2"), - Heading(name="Export operation security", level=1, num="24"), - Heading(name="RQ.ClickHouse.ExportPart.Security", level=2, num="24.1"), + Heading(name="Monitoring export performance metrics", level=1, num="25"), + Heading(name="RQ.ClickHouse.ExportPart.Events", level=2, num="25.1"), + Heading(name="RQ.ClickHouse.ExportPart.Metrics.Export", level=2, num="25.2"), + Heading(name="Export operation security", level=1, num="26"), + Heading(name="RQ.ClickHouse.ExportPart.Security", level=2, num="26.1"), ), requirements=( RQ_ClickHouse_ExportPart_S3, + RQ_ClickHouse_ExportPart_EmptyTable, RQ_ClickHouse_ExportPart_SQLCommand, RQ_ClickHouse_ExportPart_SourceEngines, + RQ_ClickHouse_ExportPart_ClustersNodes, RQ_ClickHouse_ExportPart_SourcePartStorage, + RQ_ClickHouse_ExportPart_StoragePolicies, RQ_ClickHouse_ExportPart_DestinationEngines, RQ_ClickHouse_ExportPart_DestinationSetup, RQ_ClickHouse_ExportPart_DataPreparation, 
RQ_ClickHouse_ExportPart_SchemaCompatibility, RQ_ClickHouse_ExportPart_PartitionKeyTypes, RQ_ClickHouse_ExportPart_PartTypes, + RQ_ClickHouse_ExportPart_SchemaChangeIsolation, + RQ_ClickHouse_ExportPart_LargeParts, RQ_ClickHouse_ExportPart_FailureHandling, + RQ_ClickHouse_ExportPart_NetworkResilience_PacketIssues, + RQ_ClickHouse_ExportPart_NetworkResilience_DestinationInterruption, + RQ_ClickHouse_ExportPart_NetworkResilience_NodeInterruption, RQ_ClickHouse_ExportPart_Restrictions_SameTable, RQ_ClickHouse_ExportPart_Restrictions_DestinationSupport, + RQ_ClickHouse_ExportPart_Restrictions_LocalTable, + RQ_ClickHouse_ExportPart_Restrictions_PartitionKey, RQ_ClickHouse_ExportPart_Restrictions_SourcePart, RQ_ClickHouse_ExportPart_Concurrency, RQ_ClickHouse_ExportPart_Idempotency, - RQ_ClickHouse_ExportPart_ErrorRecovery_GracefulFailure, - RQ_ClickHouse_ExportPart_ErrorRecovery_AutomaticCleanup, RQ_ClickHouse_ExportPart_Logging, RQ_ClickHouse_ExportPart_SystemTables_Exports, RQ_ClickHouse_ExportPart_Settings_AllowExperimental, @@ -758,59 +942,69 @@ * 1 [Introduction](#introduction) * 2 [Exporting Parts to S3](#exporting-parts-to-s3) * 2.1 [RQ.ClickHouse.ExportPart.S3](#rqclickhouseexportparts3) + * 2.2 [RQ.ClickHouse.ExportPart.EmptyTable](#rqclickhouseexportpartemptytable) * 3 [SQL command support](#sql-command-support) * 3.1 [RQ.ClickHouse.ExportPart.SQLCommand](#rqclickhouseexportpartsqlcommand) * 4 [Supported source table engines](#supported-source-table-engines) * 4.1 [RQ.ClickHouse.ExportPart.SourceEngines](#rqclickhouseexportpartsourceengines) -* 5 [Supported source part storage types](#supported-source-part-storage-types) - * 5.1 [RQ.ClickHouse.ExportPart.SourcePartStorage](#rqclickhouseexportpartsourcepartstorage) -* 6 [Supported destination table engines](#supported-destination-table-engines) - * 6.1 [RQ.ClickHouse.ExportPart.DestinationEngines](#rqclickhouseexportpartdestinationengines) -* 7 [Destination setup and file management](#destination-setup-and-file-management) - * 7.1 [RQ.ClickHouse.ExportPart.DestinationSetup](#rqclickhouseexportpartdestinationsetup) -* 8 [Export data preparation](#export-data-preparation) - * 8.1 [RQ.ClickHouse.ExportPart.DataPreparation](#rqclickhouseexportpartdatapreparation) -* 9 [Schema compatibility](#schema-compatibility) - * 9.1 [RQ.ClickHouse.ExportPart.SchemaCompatibility](#rqclickhouseexportpartschemacompatibility) -* 10 [Partition key types support](#partition-key-types-support) - * 10.1 [RQ.ClickHouse.ExportPart.PartitionKeyTypes](#rqclickhouseexportpartpartitionkeytypes) -* 11 [Part types and content support](#part-types-and-content-support) - * 11.1 [RQ.ClickHouse.ExportPart.PartTypes](#rqclickhouseexportpartparttypes) -* 12 [Export operation failure handling](#export-operation-failure-handling) - * 12.1 [RQ.ClickHouse.ExportPart.FailureHandling](#rqclickhouseexportpartfailurehandling) -* 13 [Export operation restrictions](#export-operation-restrictions) - * 13.1 [Preventing same table exports](#preventing-same-table-exports) - * 13.1.1 [RQ.ClickHouse.ExportPart.Restrictions.SameTable](#rqclickhouseexportpartrestrictionssametable) - * 13.2 [Destination table compatibility](#destination-table-compatibility) - * 13.2.1 [RQ.ClickHouse.ExportPart.Restrictions.DestinationSupport](#rqclickhouseexportpartrestrictionsdestinationsupport) - * 13.3 [Source part availability](#source-part-availability) - * 13.3.1 [RQ.ClickHouse.ExportPart.Restrictions.SourcePart](#rqclickhouseexportpartrestrictionssourcepart) -* 14 [Export operation 
concurrency](#export-operation-concurrency) - * 14.1 [RQ.ClickHouse.ExportPart.Concurrency](#rqclickhouseexportpartconcurrency) -* 15 [Export operation idempotency](#export-operation-idempotency) - * 15.1 [RQ.ClickHouse.ExportPart.Idempotency](#rqclickhouseexportpartidempotency) -* 16 [Export operation error recovery](#export-operation-error-recovery) - * 16.1 [Graceful failure handling](#graceful-failure-handling) - * 16.1.1 [RQ.ClickHouse.ExportPart.ErrorRecovery.GracefulFailure](#rqclickhouseexportparterrorrecoverygracefulfailure) - * 16.2 [Automatic cleanup on failure](#automatic-cleanup-on-failure) - * 16.2.1 [RQ.ClickHouse.ExportPart.ErrorRecovery.AutomaticCleanup](#rqclickhouseexportparterrorrecoveryautomaticcleanup) -* 17 [Export operation logging](#export-operation-logging) - * 17.1 [RQ.ClickHouse.ExportPart.Logging](#rqclickhouseexportpartlogging) -* 18 [Monitoring export operations](#monitoring-export-operations) - * 18.1 [RQ.ClickHouse.ExportPart.SystemTables.Exports](#rqclickhouseexportpartsystemtablesexports) -* 19 [Enabling export functionality](#enabling-export-functionality) - * 19.1 [RQ.ClickHouse.ExportPart.Settings.AllowExperimental](#rqclickhouseexportpartsettingsallowexperimental) -* 20 [Handling file conflicts during export](#handling-file-conflicts-during-export) - * 20.1 [RQ.ClickHouse.ExportPart.Settings.OverwriteFile](#rqclickhouseexportpartsettingsoverwritefile) -* 21 [Export operation configuration](#export-operation-configuration) - * 21.1 [RQ.ClickHouse.ExportPart.ParallelFormatting](#rqclickhouseexportpartparallelformatting) -* 22 [Controlling export performance](#controlling-export-performance) - * 22.1 [RQ.ClickHouse.ExportPart.ServerSettings.MaxBandwidth](#rqclickhouseexportpartserversettingsmaxbandwidth) -* 23 [Monitoring export performance metrics](#monitoring-export-performance-metrics) - * 23.1 [RQ.ClickHouse.ExportPart.Events](#rqclickhouseexportpartevents) - * 23.2 [RQ.ClickHouse.ExportPart.Metrics.Export](#rqclickhouseexportpartmetricsexport) -* 24 [Export operation security](#export-operation-security) - * 24.1 [RQ.ClickHouse.ExportPart.Security](#rqclickhouseexportpartsecurity) +* 5 [Cluster and node support](#cluster-and-node-support) + * 5.1 [RQ.ClickHouse.ExportPart.ClustersNodes](#rqclickhouseexportpartclustersnodes) +* 6 [Supported source part storage types](#supported-source-part-storage-types) + * 6.1 [RQ.ClickHouse.ExportPart.SourcePartStorage](#rqclickhouseexportpartsourcepartstorage) +* 7 [Storage policies and volumes](#storage-policies-and-volumes) + * 7.1 [RQ.ClickHouse.ExportPart.StoragePolicies](#rqclickhouseexportpartstoragepolicies) +* 8 [Supported destination table engines](#supported-destination-table-engines) + * 8.1 [RQ.ClickHouse.ExportPart.DestinationEngines](#rqclickhouseexportpartdestinationengines) +* 9 [Destination setup and file management](#destination-setup-and-file-management) + * 9.1 [RQ.ClickHouse.ExportPart.DestinationSetup](#rqclickhouseexportpartdestinationsetup) +* 10 [Export data preparation](#export-data-preparation) + * 10.1 [RQ.ClickHouse.ExportPart.DataPreparation](#rqclickhouseexportpartdatapreparation) +* 11 [Schema compatibility](#schema-compatibility) + * 11.1 [RQ.ClickHouse.ExportPart.SchemaCompatibility](#rqclickhouseexportpartschemacompatibility) +* 12 [Partition key types support](#partition-key-types-support) + * 12.1 [RQ.ClickHouse.ExportPart.PartitionKeyTypes](#rqclickhouseexportpartpartitionkeytypes) +* 13 [Part types and content support](#part-types-and-content-support) + * 13.1 
[RQ.ClickHouse.ExportPart.PartTypes](#rqclickhouseexportpartparttypes) + * 13.2 [RQ.ClickHouse.ExportPart.SchemaChangeIsolation](#rqclickhouseexportpartschemachangeisolation) + * 13.3 [RQ.ClickHouse.ExportPart.LargeParts](#rqclickhouseexportpartlargeparts) +* 14 [Export operation failure handling](#export-operation-failure-handling) + * 14.1 [RQ.ClickHouse.ExportPart.FailureHandling](#rqclickhouseexportpartfailurehandling) +* 15 [Network resilience](#network-resilience) + * 15.1 [RQ.ClickHouse.ExportPart.NetworkResilience.PacketIssues](#rqclickhouseexportpartnetworkresiliencepacketissues) + * 15.2 [RQ.ClickHouse.ExportPart.NetworkResilience.DestinationInterruption](#rqclickhouseexportpartnetworkresiliencedestinationinterruption) + * 15.3 [RQ.ClickHouse.ExportPart.NetworkResilience.NodeInterruption](#rqclickhouseexportpartnetworkresiliencenodeinterruption) +* 16 [Export operation restrictions](#export-operation-restrictions) + * 16.1 [Preventing same table exports](#preventing-same-table-exports) + * 16.1.1 [RQ.ClickHouse.ExportPart.Restrictions.SameTable](#rqclickhouseexportpartrestrictionssametable) + * 16.2 [Destination table compatibility](#destination-table-compatibility) + * 16.2.1 [RQ.ClickHouse.ExportPart.Restrictions.DestinationSupport](#rqclickhouseexportpartrestrictionsdestinationsupport) + * 16.3 [Local table restriction](#local-table-restriction) + * 16.3.1 [RQ.ClickHouse.ExportPart.Restrictions.LocalTable](#rqclickhouseexportpartrestrictionslocaltable) + * 16.4 [Partition key compatibility](#partition-key-compatibility) + * 16.4.1 [RQ.ClickHouse.ExportPart.Restrictions.PartitionKey](#rqclickhouseexportpartrestrictionspartitionkey) + * 16.5 [Source part availability](#source-part-availability) + * 16.5.1 [RQ.ClickHouse.ExportPart.Restrictions.SourcePart](#rqclickhouseexportpartrestrictionssourcepart) +* 17 [Export operation concurrency](#export-operation-concurrency) + * 17.1 [RQ.ClickHouse.ExportPart.Concurrency](#rqclickhouseexportpartconcurrency) +* 18 [Export operation idempotency](#export-operation-idempotency) + * 18.1 [RQ.ClickHouse.ExportPart.Idempotency](#rqclickhouseexportpartidempotency) +* 19 [Export operation logging](#export-operation-logging) + * 19.1 [RQ.ClickHouse.ExportPart.Logging](#rqclickhouseexportpartlogging) +* 20 [Monitoring export operations](#monitoring-export-operations) + * 20.1 [RQ.ClickHouse.ExportPart.SystemTables.Exports](#rqclickhouseexportpartsystemtablesexports) +* 21 [Enabling export functionality](#enabling-export-functionality) + * 21.1 [RQ.ClickHouse.ExportPart.Settings.AllowExperimental](#rqclickhouseexportpartsettingsallowexperimental) +* 22 [Handling file conflicts during export](#handling-file-conflicts-during-export) + * 22.1 [RQ.ClickHouse.ExportPart.Settings.OverwriteFile](#rqclickhouseexportpartsettingsoverwritefile) +* 23 [Export operation configuration](#export-operation-configuration) + * 23.1 [RQ.ClickHouse.ExportPart.ParallelFormatting](#rqclickhouseexportpartparallelformatting) +* 24 [Controlling export performance](#controlling-export-performance) + * 24.1 [RQ.ClickHouse.ExportPart.ServerSettings.MaxBandwidth](#rqclickhouseexportpartserversettingsmaxbandwidth) +* 25 [Monitoring export performance metrics](#monitoring-export-performance-metrics) + * 25.1 [RQ.ClickHouse.ExportPart.Events](#rqclickhouseexportpartevents) + * 25.2 [RQ.ClickHouse.ExportPart.Metrics.Export](#rqclickhouseexportpartmetricsexport) +* 26 [Export operation security](#export-operation-security) + * 26.1 
[RQ.ClickHouse.ExportPart.Security](#rqclickhouseexportpartsecurity) ## Introduction @@ -823,6 +1017,15 @@ [ClickHouse] SHALL support exporting data parts from MergeTree engine tables to S3 object storage. +### RQ.ClickHouse.ExportPart.EmptyTable +version: 1.0 + +[ClickHouse] SHALL support exporting from empty tables by: +* Completing export operations successfully when the source table contains no parts +* Resulting in an empty destination table when exporting from an empty source table +* Not creating any files in destination storage when there are no parts to export +* Handling empty tables gracefully without errors + ## SQL command support ### RQ.ClickHouse.ExportPart.SQLCommand @@ -856,6 +1059,16 @@ * `GraphiteMergeTree` - MergeTree optimized for Graphite data * All other MergeTree family engines that inherit from `MergeTreeData` +## Cluster and node support + +### RQ.ClickHouse.ExportPart.ClustersNodes +version: 1.0 + +[ClickHouse] SHALL support exporting parts from multiple nodes in a cluster to the same destination storage, ensuring that: +* Each node can independently export parts from its local storage to the shared destination +* Exported data from different nodes is correctly aggregated in the destination +* All nodes in the cluster can read the same exported data from the destination + ## Supported source part storage types ### RQ.ClickHouse.ExportPart.SourcePartStorage @@ -870,6 +1083,19 @@ * **Tiered Storage**: Parts stored across multiple storage tiers (hot/cold) * **Zero-Copy Replication Disks**: Parts stored with zero-copy replication enabled +## Storage policies and volumes + +### RQ.ClickHouse.ExportPart.StoragePolicies +version: 1.0 + +[ClickHouse] SHALL support exporting parts from tables using different storage policies, where storage policies are composed of volumes which are composed of disks, including: +* **JBOD Volumes**: Just a Bunch Of Disks volumes with multiple disks +* **External Volumes**: Volumes using external storage systems +* **Tiered Storage Policies**: Storage policies with multiple volumes for hot/cold data tiers +* **Custom Storage Policies**: Any storage policy configuration composed of volumes and disks +* Exporting parts regardless of which volume or disk within the storage policy contains the part +* Maintaining data integrity when exporting from parts stored on any volume or disk in the storage policy + ## Supported destination table engines ### RQ.ClickHouse.ExportPart.DestinationEngines @@ -893,7 +1119,7 @@ * Generating unique file names in the format `{part_name}_{checksum_hex}` to avoid conflicts * Allowing destination storage to determine the final file path based on Hive partitioning * Creating files in the destination storage that users can observe and access -* Providing the final destination file path in the `system.exports` table for monitoring +* Providing the final destination file path in the `system.part_log` table for monitoring ## Export data preparation @@ -945,33 +1171,26 @@ | **Wide Parts** | ✅ Yes | Data of each column stored in separate files with marks | Standard format for most parts | | **Compact Parts** | ✅ Yes | All column data stored in single file with single marks file | Optimized for small parts | -[ClickHouse] SHALL handle all special columns and metadata present in parts during export: - -| Column Type | Supported | Description | Export Behavior | -|-------------|------------|-------------|-----------------| -| **Physical Columns** | ✅ Yes | User-defined table columns | All physical columns exported | -| 
**RowExistsColumn (_row_exists)** | ✅ Yes | Lightweight delete mask showing row existence | Exported to maintain delete state | -| **BlockNumberColumn (_block_number)** | ✅ Yes | Original block number from insert | Exported for row identification | -| **BlockOffsetColumn (_block_offset)** | ✅ Yes | Original row offset within block | Exported for row identification | -| **PartDataVersionColumn (_part_data_version)** | ✅ Yes | Data version for mutations | Exported for version tracking | -| **Virtual Columns** | ✅ Yes | Runtime columns like _part, _partition_id | Generated during export | -| **System Metadata** | ✅ Yes | Checksums, compression info, serialization | Preserved in export | - -[ClickHouse] SHALL handle all mutation and schema change information present in parts: - -| Mutation/Schema Type | Supported | Description | Export Behavior | -|---------------------|------------|-------------|-----------------| -| **Mutation Commands** | ✅ Yes | DELETE, UPDATE, MATERIALIZE_INDEX, DROP_COLUMN, RENAME_COLUMN | Applied during export | -| **Alter Conversions** | ✅ Yes | Column renames, type changes, schema modifications | Applied during export | -| **Patch Parts** | ✅ Yes | Lightweight updates with only changed columns | Applied during export | -| **Mutation Versions** | ✅ Yes | Version tracking for applied mutations | Preserved in export | -| **Schema Changes** | ✅ Yes | ALTER MODIFY, ALTER DROP, ALTER RENAME | Applied during export | -| **TTL Information** | ✅ Yes | Time-to-live settings and expiration data | Preserved in export | -| **Index Information** | ✅ Yes | Primary key, secondary indices, projections | Preserved in export | -| **Statistics** | ✅ Yes | Column statistics and sampling information | Preserved in export | - [ClickHouse] SHALL automatically apply lightweight delete masks during export to ensure only non-deleted rows are exported, and SHALL handle all part metadata including checksums, compression information, serialization details, mutation history, schema changes, and structural modifications to maintain data integrity in the destination storage. 
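+
+For illustration, a minimal sketch of this behavior (table, part, and destination names are hypothetical; the command and setting follow the syntax exercised by this test suite):
+
+```sql
+-- lightweight delete: rows are masked via the _row_exists column rather than rewritten
+DELETE FROM source_table WHERE i % 2 = 0;
+
+-- exporting the part applies the delete mask, so only surviving rows
+-- are written to the destination storage
+SET allow_experimental_export_merge_tree_part = 1;
+ALTER TABLE source_table EXPORT PART 'all_1_1_0' TO TABLE s3_destination;
+```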
+### RQ.ClickHouse.ExportPart.SchemaChangeIsolation +version: 1.0 + +[ClickHouse] SHALL ensure exported data is isolated from subsequent schema changes by: +* Preserving exported data exactly as it was at the time of export +* Not being affected by schema changes (column drops, renames, type changes) that occur after export +* Maintaining data integrity in destination storage regardless of mutations applied to the source table after export +* Ensuring exported data reflects the source table state at the time of export, not the current state + +### RQ.ClickHouse.ExportPart.LargeParts +version: 1.0 + +[ClickHouse] SHALL support exporting large parts by: +* Handling parts with large numbers of rows (e.g., 100 million or more) +* Processing large data volumes efficiently during export +* Maintaining data integrity when exporting large parts +* Completing export operations successfully regardless of part size + ## Export operation failure handling ### RQ.ClickHouse.ExportPart.FailureHandling @@ -984,6 +1203,40 @@ * **Simple Failure**: Export operations either succeed completely or fail with an error message * **No Partial Exports**: Failed exports leave no partial or corrupted data in destination storage +## Network resilience + +### RQ.ClickHouse.ExportPart.NetworkResilience.PacketIssues +version: 1.0 + +[ClickHouse] SHALL handle network packet issues during export operations by: +* Tolerating packet delay without data corruption or loss +* Handling packet loss and retransmitting data as needed +* Detecting and handling packet corruption to ensure data integrity +* Managing packet duplication without data duplication in destination +* Handling packet reordering to maintain correct data sequence +* Operating correctly under packet rate limiting constraints +* Completing exports successfully despite network impairments + +### RQ.ClickHouse.ExportPart.NetworkResilience.DestinationInterruption +version: 1.0 + +[ClickHouse] SHALL handle destination storage interruptions during export operations by: +* Detecting when destination storage becomes unavailable during export +* Failing export operations gracefully when destination storage is unavailable +* Logging failed exports in the `system.events` table with `PartsExportFailures` counter +* Not leaving partial or corrupted data in destination storage when exports fail due to destination unavailability +* Allowing exports to complete successfully once destination storage becomes available again + +### RQ.ClickHouse.ExportPart.NetworkResilience.NodeInterruption +version: 1.0 + +[ClickHouse] SHALL handle ClickHouse node interruptions during export operations by: +* Handling node restarts gracefully during export operations +* Not leaving partial or corrupted data in destination storage when node restarts occur +* With safe shutdown, ensuring exports complete successfully before node shutdown +* With unsafe shutdown, allowing partial exports to complete successfully after node restart +* Maintaining data integrity in destination storage regardless of node interruption type + ## Export operation restrictions ### Preventing same table exports @@ -1008,6 +1261,26 @@ * Throwing a `NOT_IMPLEMENTED` exception with message "Destination storage {} does not support MergeTree parts or uses unsupported partitioning" when requirements are not met * Performing this validation during the initial export setup phase +### Local table restriction + +#### RQ.ClickHouse.ExportPart.Restrictions.LocalTable +version: 1.0 + +[ClickHouse] SHALL prevent exporting parts to local 
MergeTree tables by: +* Rejecting export operations where the destination table uses a MergeTree engine +* Throwing a `NOT_IMPLEMENTED` exception (error code 48) with message "Destination storage MergeTree does not support MergeTree parts or uses unsupported partitioning" when attempting to export to a local table +* Performing this validation during the initial export setup phase + +### Partition key compatibility + +#### RQ.ClickHouse.ExportPart.Restrictions.PartitionKey +version: 1.0 + +[ClickHouse] SHALL validate that source and destination tables have the same partition key expression by: +* Checking that the partition key expression matches between source and destination tables +* Throwing a `BAD_ARGUMENTS` exception (error code 36) with message "Tables have different partition key" when partition keys differ +* Performing this validation during the initial export setup phase + ### Source part availability #### RQ.ClickHouse.ExportPart.Restrictions.SourcePart @@ -1047,31 +1320,6 @@ * Generating unique file names using part name and checksum to avoid conflicts * Maintaining export state consistency across retries -## Export operation error recovery - -### Graceful failure handling - -#### RQ.ClickHouse.ExportPart.ErrorRecovery.GracefulFailure -version: 1.0 - -[ClickHouse] SHALL handle export failures gracefully by: -* Allowing users to retry failed export operations -* Maintaining system stability even when exports fail -* Not corrupting source data when export operations fail -* Continuing to process other export operations when one fails - -### Automatic cleanup on failure - -#### RQ.ClickHouse.ExportPart.ErrorRecovery.AutomaticCleanup -version: 1.0 - -[ClickHouse] SHALL automatically clean up failed export operations by: -* Removing export manifests from the system when operations fail -* Cleaning up any partial data written to destination storage -* Releasing system resources (memory, file handles) used by failed exports -* Updating export status to reflect the failure state -* Allowing the system to recover and process other export operations - ## Export operation logging ### RQ.ClickHouse.ExportPart.Logging @@ -1082,14 +1330,31 @@ * Recording the specific part name and destination for all operations * Including execution time and progress information for all operations * Writing operation information to the `system.part_log` table with the following columns: + * `hostname` - Hostname of the server where the export operation occurred + * `query_id` - Query ID of the export operation * `event_type` - Set to `EXPORT_PART` for export operations + * `event_date` - Date when the export operation occurred * `event_time` - Timestamp when the export operation occurred + * `event_time_microseconds` - Timestamp with microsecond precision + * `duration_ms` - Execution time in milliseconds + * `database` - Source database name * `table` - Source table name + * `table_uuid` - UUID of the source table * `part_name` - Name of the part being exported + * `partition_id` - Partition ID of the part being exported + * `partition` - Partition name of the part being exported + * `part_type` - Type of the part (e.g., Wide, Compact) + * `disk_name` - Name of the disk where the part is stored * `path_on_disk` - Path to the part in source storage - * `duration_ms` - Execution time in milliseconds + * `rows` - Number of rows in the part + * `size_in_bytes` - Size of the part in bytes + * `bytes_uncompressed` - Uncompressed size of the part in bytes + * `read_rows` - Number of rows read during export + * 
`read_bytes` - Number of bytes read during export + * `peak_memory_usage` - Peak memory usage during the export operation * `error` - Error message if the export failed (empty for successful exports) - * `thread_id` - Thread ID performing the export + * `exception` - Exception details if the export failed + * `ProfileEvents` - Profile events collected during the export operation * Providing sufficient detail for monitoring and troubleshooting export operations ## Monitoring export operations @@ -1151,9 +1416,6 @@ * `PartsExports` - Number of successful part exports * `PartsExportFailures` - Number of failed part exports * `PartsExportDuplicated` - Number of part exports that failed because target already exists -* `PartsExportTotalMilliseconds` - Total time spent on part export operations in milliseconds -* `ExportsThrottlerBytes` - Bytes passed through the exports throttler -* `ExportsThrottlerSleepMicroseconds` - Total time queries were sleeping to conform to export bandwidth throttling ### RQ.ClickHouse.ExportPart.Metrics.Export version: 1.0 diff --git a/s3/tests/export_part/clusters_nodes.py b/s3/tests/export_part/clusters_nodes.py index b7e280e93..cfd2ed209 100644 --- a/s3/tests/export_part/clusters_nodes.py +++ b/s3/tests/export_part/clusters_nodes.py @@ -4,6 +4,7 @@ from s3.tests.export_part.steps import * from helpers.queries import * from alter.table.replace_partition.common import create_partitions_with_random_uint64 +from s3.requirements.export_part import * @TestScenario @@ -53,6 +54,7 @@ def different_nodes_same_destination(self, cluster, node1, node2): @TestFeature +@Requirements(RQ_ClickHouse_ExportPart_ClustersNodes("1.0")) @Name("clusters and nodes") def feature(self): """Check functionality of exporting data parts to S3 storage from different clusters and nodes.""" diff --git a/s3/tests/export_part/concurrency_networks.py b/s3/tests/export_part/concurrency_networks.py index d6b4cd9ed..cc5884b54 100644 --- a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -7,6 +7,7 @@ @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_Concurrency("1.0")) def basic_concurrent_export(self, threads): """Check concurrent exports from different sources to the same S3 table.""" @@ -43,6 +44,7 @@ def basic_concurrent_export(self, threads): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_NetworkResilience_PacketIssues("1.0")) def packet_delay(self, delay_ms): """Check that exports work correctly with packet delay.""" @@ -75,6 +77,7 @@ def packet_delay(self, delay_ms): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_NetworkResilience_PacketIssues("1.0")) def packet_loss(self, percent_loss): """Check that exports work correctly with packet loss.""" @@ -107,6 +110,7 @@ def packet_loss(self, percent_loss): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_NetworkResilience_PacketIssues("1.0")) def packet_loss_gemodel(self, interruption_probability, recovery_probability): """Check that exports work correctly with packet loss using the GE model.""" @@ -143,6 +147,7 @@ def packet_loss_gemodel(self, interruption_probability, recovery_probability): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_NetworkResilience_PacketIssues("1.0")) def packet_corruption(self, percent_corrupt): """Check that exports work correctly with packet corruption.""" @@ -177,6 +182,7 @@ def packet_corruption(self, percent_corrupt): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_NetworkResilience_PacketIssues("1.0")) def packet_duplication(self, 
percent_duplicated): """Check that exports work correctly with packet corruption.""" @@ -211,6 +217,7 @@ def packet_duplication(self, percent_duplicated): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_NetworkResilience_PacketIssues("1.0")) def packet_reordering(self, delay_ms, percent_reordered): """Check that exports work correctly with packet corruption.""" @@ -247,6 +254,7 @@ def packet_reordering(self, delay_ms, percent_reordered): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_NetworkResilience_PacketIssues("1.0")) def packet_rate_limit(self, rate_mbit): """Check that exports work correctly with packet corruption.""" @@ -279,6 +287,7 @@ def packet_rate_limit(self, rate_mbit): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_Concurrency("1.0")) def concurrent_insert(self): """Check that exports work correctly with concurrent inserts of source data.""" @@ -386,6 +395,7 @@ def start_minio(self, cluster=None, container_name="s3_env-minio1-1", timeout=30 @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_NetworkResilience_DestinationInterruption("1.0")) def minio_network_interruption(self, number_of_values=3, signal="KILL"): """Check that restarting MinIO while exporting parts inbetween works correctly.""" @@ -427,6 +437,7 @@ def minio_network_interruption(self, number_of_values=3, signal="KILL"): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_NetworkResilience_NodeInterruption("1.0")) def clickhouse_network_interruption(self, safe=False): """Check that exports work correctly with a clickhouse network outage.""" @@ -468,7 +479,6 @@ def clickhouse_network_interruption(self, safe=False): @TestFeature -@Requirements(RQ_ClickHouse_ExportPart_Concurrency("1.0")) @Name("concurrency and networks") def feature(self): """Check that exports work correctly with concurrency and various network conditions.""" diff --git a/s3/tests/export_part/datatypes.py b/s3/tests/export_part/datatypes.py index f1ee4584b..a7f1af231 100644 --- a/s3/tests/export_part/datatypes.py +++ b/s3/tests/export_part/datatypes.py @@ -91,7 +91,6 @@ def valid_partition_key_table(self, partition_key_type, rows_per_part=1): @TestSketch(Scenario) @Flags(TE) -@Requirements(RQ_ClickHouse_ExportPart_PartitionKeyTypes("1.0")) def valid_partition_key_types_compact(self): """Check that all partition key data types are supported when exporting compact parts.""" diff --git a/s3/tests/export_part/engines_volumes.py b/s3/tests/export_part/engines_volumes.py index 04e298c7b..d944cc1d5 100644 --- a/s3/tests/export_part/engines_volumes.py +++ b/s3/tests/export_part/engines_volumes.py @@ -5,9 +5,6 @@ from helpers.queries import * -# TODO replicated merge tree tables (all types) - - @TestCheck def configured_table(self, table_engine, number_of_partitions, number_of_parts): """Test a specific combination of table engine, number of partitions, and number of parts.""" @@ -111,6 +108,7 @@ def configured_volume(self, volume): @TestSketch(Scenario) @Flags(TE) +@Requirements(RQ_ClickHouse_ExportPart_StoragePolicies("1.0")) def volume_combos(self): """Test exporting to various storage policies.""" @@ -138,5 +136,7 @@ def volume_combos(self): def feature(self): """Check exporting parts to S3 storage with different table engines and volumes.""" + # TODO replicated merge tree tables (all types) + Scenario(run=table_combos) Scenario(run=volume_combos) diff --git a/s3/tests/export_part/error_handling.py b/s3/tests/export_part/error_handling.py index b4da3cb31..f2160696d 100644 --- a/s3/tests/export_part/error_handling.py +++ 
b/s3/tests/export_part/error_handling.py @@ -6,6 +6,7 @@ @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_Restrictions_SourcePart("1.0")) def invalid_part_name(self): """Check that exporting a non-existent part returns the correct error.""" @@ -67,6 +68,7 @@ def same_table(self): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_Restrictions_LocalTable("1.0")) def local_table(self): """Test exporting parts to a local table.""" @@ -132,6 +134,7 @@ def disable_export_setting(self): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_Restrictions_PartitionKey("1.0")) def different_partition_key(self): """Check exporting parts with a different partition key returns the correct error.""" diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 2131102d3..0e4f20590 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -118,6 +118,7 @@ def basic_table(self): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_EmptyTable("1.0")) def empty_table(self): """Test exporting parts from an empty table.""" @@ -205,6 +206,7 @@ def wide_and_compact_parts(self): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_SchemaChangeIsolation("1.0")) def export_and_drop(self): """Check that dropping a column immediately after export doesn't affect exported data.""" @@ -244,6 +246,7 @@ def export_and_drop(self): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_LargeParts("1.0")) def large_part(self): """Test exporting a large part.""" From 8903b7c0f132bcf4fe6a9abbc604d060d7d5e19e Mon Sep 17 00:00:00 2001 From: Selfeer Date: Thu, 6 Nov 2025 21:40:49 +0400 Subject: [PATCH 70/99] small fix --- helpers/cluster.py | 8 ++++++-- helpers/tables.py | 10 ++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/helpers/cluster.py b/helpers/cluster.py index 71f1e4f12..8007221ac 100755 --- a/helpers/cluster.py +++ b/helpers/cluster.py @@ -1081,8 +1081,12 @@ def query( query_settings += [("query_id", f"{query_id}")] if inline_settings: - sql = "; ".join([f"SET {name} = {value}" for name, value in inline_settings]) + "; " + sql - + sql = ( + "; ".join([f"SET {name} = {value}" for name, value in inline_settings]) + + "; " + + sql + ) + client = "clickhouse client -n" if secure: client += ( diff --git a/helpers/tables.py b/helpers/tables.py index 8790936cb..19b6a73db 100644 --- a/helpers/tables.py +++ b/helpers/tables.py @@ -570,16 +570,18 @@ def create_partitioned_table_with_compact_and_wide_parts( min_rows_for_wide_part=10, min_bytes_for_wide_part=100, engine="MergeTree", - columns=[ - Column(name="p", datatype=UInt8()), - Column(name="i", datatype=UInt64()), - ], + columns=None, partition_by="p", cluster=None, stop_merges=False, ): """Create a partitioned table that has specific settings in order to get both wide and compact parts.""" + if columns is None: + columns = [ + Column(name="p", datatype=UInt8()), + Column(name="i", datatype=UInt64()), + ] create_table( name=table_name, engine=engine, From 0fd10d2dd465b8f02d1c9dc07aa2b319bce19aab Mon Sep 17 00:00:00 2001 From: Selfeer Date: Thu, 6 Nov 2025 21:43:02 +0400 Subject: [PATCH 71/99] get steps to True again --- helpers/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helpers/cluster.py b/helpers/cluster.py index 8007221ac..b9133ce3d 100755 --- a/helpers/cluster.py +++ b/helpers/cluster.py @@ -1006,7 +1006,7 @@ def query( sql, message=None, exitcode=None, - steps=False, + steps=True, no_checks=False, raise_on_exception=False, ignore_exception=False, From 
6cc638f39b31e2b11d61e602165166b707cb50f5 Mon Sep 17 00:00:00 2001 From: Selfeer Date: Thu, 6 Nov 2025 21:54:40 +0400 Subject: [PATCH 72/99] remove apt for now, need to move to Dockerfile --- s3/regression.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/s3/regression.py b/s3/regression.py index 661ca8a26..5a24598d3 100755 --- a/s3/regression.py +++ b/s3/regression.py @@ -552,9 +552,9 @@ def minio_regression( for node in nodes["clickhouse"]: experimental_analyzer(node=cluster.node(node), with_analyzer=with_analyzer) - with And("I install tc-netem on all clickhouse nodes"): - for node in self.context.nodes: - node.command("apt install --yes iproute2 procps") + # with And("I install tc-netem on all clickhouse nodes"): + # for node in self.context.nodes: + # node.command("apt install --yes iproute2 procps") # with And("allow higher cpu_wait_ratio "): # if check_clickhouse_version(">=25.4")(self): From 9d3676faeb279ed5b4060f72a8c92a622b403c6a Mon Sep 17 00:00:00 2001 From: Selfeer Date: Thu, 6 Nov 2025 22:01:52 +0400 Subject: [PATCH 73/99] sort the output --- s3/tests/export_part/steps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index e01788ed7..4a2a6c6da 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -104,7 +104,7 @@ def get_parts(self, table_name, node): exitcode=0, steps=True, ).output - return [row.strip() for row in output.splitlines()] + return sorted([row.strip() for row in output.splitlines()]) @TestStep(When) From 776b6a865635e91e6050797d41b3f8aa65bcc23d Mon Sep 17 00:00:00 2001 From: Selfeer Date: Thu, 6 Nov 2025 22:11:37 +0400 Subject: [PATCH 74/99] change the static table name to getuid --- s3/tests/export_part/concurrency_networks.py | 25 ++++--- s3/tests/export_part/sanity.py | 73 +++++++++++++------- s3/tests/export_part/steps.py | 6 +- 3 files changed, 65 insertions(+), 39 deletions(-) diff --git a/s3/tests/export_part/concurrency_networks.py b/s3/tests/export_part/concurrency_networks.py index cc5884b54..28e4d51bd 100644 --- a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -336,10 +336,10 @@ def concurrent_insert(self): @TestStep(When) def kill_minio(self, cluster=None, container_name="s3_env-minio1-1", signal="KILL"): """Forcefully kill MinIO container to simulate network crash.""" - + if cluster is None: cluster = self.context.cluster - + retry(cluster.command, 5)( None, f"docker kill --signal={signal} {container_name}", @@ -367,10 +367,10 @@ def kill_minio(self, cluster=None, container_name="s3_env-minio1-1", signal="KIL @TestStep(When) def start_minio(self, cluster=None, container_name="s3_env-minio1-1", timeout=300): """Start MinIO container and wait for it to be ready.""" - + if cluster is None: cluster = self.context.cluster - + with By("starting MinIO container"): retry(cluster.command, 5)( None, @@ -379,7 +379,7 @@ def start_minio(self, cluster=None, container_name="s3_env-minio1-1", timeout=30 exitcode=0, steps=True, ) - + with And("waiting for MinIO to be ready"): for attempt in retries(timeout=timeout, delay=1): with attempt: @@ -430,17 +430,20 @@ def minio_network_interruption(self, number_of_values=3, signal="KILL"): table_name=s3_table_name, node=self.context.node ) assert set(source_data) >= set(destination_data), error() - + with And("Failed exports should be logged in the system.events table"): final_events = get_export_events(node=self.context.node) - assert 
final_events["PartsExportFailures"] - initial_events["PartsExportFailures"] == (len(source_data) - len(destination_data)) / number_of_values, error() + assert ( + final_events["PartsExportFailures"] - initial_events["PartsExportFailures"] + == (len(source_data) - len(destination_data)) / number_of_values + ), error() @TestScenario @Requirements(RQ_ClickHouse_ExportPart_NetworkResilience_NodeInterruption("1.0")) def clickhouse_network_interruption(self, safe=False): """Check that exports work correctly with a clickhouse network outage.""" - + with Given("I create a populated source table and empty S3 table"): partitioned_merge_tree_table( table_name="source", @@ -471,7 +474,9 @@ def clickhouse_network_interruption(self, safe=False): ) else: with Then("Destination data should be a subset of source data"): - source_data = select_all_ordered(table_name="source", node=self.context.node) + source_data = select_all_ordered( + table_name="source", node=self.context.node + ) destination_data = select_all_ordered( table_name=s3_table_name, node=self.context.node ) @@ -499,4 +504,4 @@ def feature(self): Scenario(test=minio_network_interruption)(signal="TERM") Scenario(test=minio_network_interruption)(signal="KILL") Scenario(test=clickhouse_network_interruption)(safe=True) - Scenario(test=clickhouse_network_interruption)(safe=False) \ No newline at end of file + Scenario(test=clickhouse_network_interruption)(safe=False) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index 0e4f20590..c4532492a 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -15,8 +15,10 @@ def export_setting(self): """Check that the export setting is settable in 2 ways when exporting parts.""" with Given("I create a populated source table and 2 empty S3 tables"): + source_table = "source_" + getuid() + partitioned_merge_tree_table( - table_name="source", + table_name=source_table, partition_by="p", columns=default_columns(), stop_merges=True, @@ -26,7 +28,7 @@ def export_setting(self): with When("I export parts to the first S3 table using the SET query"): export_parts( - source_table="source", + source_table=source_table, destination_table=s3_table_name1, node=self.context.node, inline_settings=True, @@ -34,7 +36,7 @@ def export_setting(self): with And("I export parts to the second S3 table using the settings argument"): export_parts( - source_table="source", + source_table=source_table, destination_table=s3_table_name2, node=self.context.node, inline_settings=False, @@ -42,7 +44,9 @@ def export_setting(self): ) with And("I read data from all tables"): - source_data = select_all_ordered(table_name="source", node=self.context.node) + source_data = select_all_ordered( + table_name=source_table, node=self.context.node + ) destination_data1 = select_all_ordered( table_name=s3_table_name1, node=self.context.node ) @@ -61,8 +65,10 @@ def mismatched_columns(self): """Test exporting parts when source and destination tables have mismatched columns.""" with Given("I create a source table and S3 table with different columns"): + source_table = "source_" + getuid() + partitioned_merge_tree_table( - table_name="source", + table_name=source_table, partition_by="p", columns=default_columns(), stop_merges=True, @@ -75,7 +81,7 @@ def mismatched_columns(self): with When("I export parts to the S3 table"): results = export_parts( - source_table="source", + source_table=source_table, destination_table=s3_table_name, node=self.context.node, exitcode=1, @@ -95,8 +101,10 @@ def 
basic_table(self): """Test exporting parts of a basic table.""" with Given("I create a populated source table and empty S3 table"): + source_table = "source_" + getuid() + partitioned_merge_tree_table( - table_name="source", + table_name=source_table, partition_by="p", columns=default_columns(), stop_merges=True, @@ -105,14 +113,14 @@ def basic_table(self): with When("I export parts to the S3 table"): export_parts( - source_table="source", + source_table=source_table, destination_table=s3_table_name, node=self.context.node, ) with Then("Check source matches destination"): source_matches_destination( - source_table="source", + source_table=source_table, destination_table=s3_table_name, ) @@ -158,8 +166,10 @@ def no_partition_by(self): """Test exporting parts when the source table has no PARTITION BY type.""" with Given("I create a populated source table and empty S3 table"): + source_table = "source_" + getuid() + partitioned_merge_tree_table( - table_name="source", + table_name=source_table, partition_by="tuple()", columns=default_columns(), stop_merges=True, @@ -170,14 +180,14 @@ def no_partition_by(self): with When("I export parts to the S3 table"): export_parts( - source_table="source", + source_table=source_table, destination_table=s3_table_name, node=self.context.node, ) with Then("Check source matches destination"): source_matches_destination( - source_table="source", + source_table=source_table, destination_table=s3_table_name, ) @@ -188,19 +198,21 @@ def wide_and_compact_parts(self): """Check that exporting with both wide and compact parts is supported.""" with Given("I create a source table with wide and compact parts"): - table_with_compact_and_wide_parts(table_name="source") + source_table = "source_" + getuid() + + table_with_compact_and_wide_parts(table_name=source_table) s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) with When("I export parts to the S3 table"): export_parts( - source_table="source", + source_table=source_table, destination_table=s3_table_name, node=self.context.node, ) with Then("Check source matches destination"): source_matches_destination( - source_table="source", + source_table=source_table, destination_table=s3_table_name, ) @@ -209,12 +221,17 @@ def wide_and_compact_parts(self): @Requirements(RQ_ClickHouse_ExportPart_SchemaChangeIsolation("1.0")) def export_and_drop(self): """Check that dropping a column immediately after export doesn't affect exported data.""" - - with Given("I create a populated source table and empty S3 table", description=""" + + with Given( + "I create a populated source table and empty S3 table", + description=""" Stop merges must be false to allow for mutations like dropping a column. 
- """): + """, + ): + source_table = "source_" + getuid() + partitioned_merge_tree_table( - table_name="source", + table_name=source_table, partition_by="p", columns=default_columns(), stop_merges=False, @@ -223,18 +240,20 @@ def export_and_drop(self): with When("I export data"): export_parts( - source_table="source", + source_table=source_table, destination_table=s3_table_name, node=self.context.node, ) with And("I read the source before dropping a column"): - source_data = select_all_ordered(table_name="source", node=self.context.node) - + source_data = select_all_ordered( + table_name=source_table, node=self.context.node + ) + with And("I drop a source column"): drop_column( node=self.context.node, - table_name="source", + table_name=source_table, column_name="i", ) @@ -251,8 +270,10 @@ def large_part(self): """Test exporting a large part.""" with Given("I create a populated source table and empty S3 table"): + source_table = "source_" + getuid() + partitioned_merge_tree_table( - table_name="source", + table_name=source_table, partition_by="p", columns=default_columns(), stop_merges=True, @@ -264,14 +285,14 @@ def large_part(self): with When("I export parts to the S3 table"): export_parts( - source_table="source", + source_table=source_table, destination_table=s3_table_name, node=self.context.node, ) with Then("Check source matches destination"): source_matches_destination( - source_table="source", + source_table=source_table, destination_table=s3_table_name, ) diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 4a2a6c6da..8026baa39 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -116,7 +116,7 @@ def export_parts( parts=None, exitcode=0, settings=None, - inline_settings=True + inline_settings=True, ): """Export parts from a source table to a destination table on the same node. 
If parts are not provided, all parts will be exported.""" @@ -125,10 +125,10 @@ def export_parts( if inline_settings is True: inline_settings = self.context.default_settings - + no_checks = exitcode != 0 output = [] - + for part in parts: output.append( node.query( From b21319cf655477e8835e382d8305f6fdc72f94da Mon Sep 17 00:00:00 2001 From: Selfeer Date: Fri, 7 Nov 2025 00:59:54 +0400 Subject: [PATCH 75/99] fix temporary tables --- helpers/tables.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/helpers/tables.py b/helpers/tables.py index 19b6a73db..54fe10e1d 100644 --- a/helpers/tables.py +++ b/helpers/tables.py @@ -343,7 +343,7 @@ def generate_all_map_column_types(): class Table: - def __init__(self, name, columns, partition_by, engine): + def __init__(self, name, columns, engine, partition_by=None): self.name = name self.columns = columns self.partition_by = partition_by @@ -556,7 +556,7 @@ def create_temporary_table( settings=settings, ) - yield Table(name, columns, engine) + yield Table(name, columns, engine, partition_by=partition_by) finally: with Finally(f"drop the table {name}"): @@ -648,7 +648,9 @@ def attach_table(self, engine, columns, name=None, path=None, drop_sync=False): """ ) - yield Table(name, columns, engine) + yield Table( + name=name, columns=columns, partition_by=None, engine=engine + ) finally: with Finally(f"drop the table {name}"): From 0c5381510645769ddac0d0d395f082cf84719861 Mon Sep 17 00:00:00 2001 From: julian Date: Thu, 6 Nov 2025 16:16:56 -0500 Subject: [PATCH 76/99] Combinatorics cleanup and parallelization --- helpers/create.py | 16 ++++++ s3/tests/export_part/engines_volumes.py | 66 ++++++++++++++----------- 2 files changed, 54 insertions(+), 28 deletions(-) diff --git a/helpers/create.py b/helpers/create.py index b085821f6..16b19c8f3 100644 --- a/helpers/create.py +++ b/helpers/create.py @@ -431,6 +431,8 @@ def partitioned_merge_tree_table( number_of_values=number_of_values, ) + return table_name + @TestStep(Given) def partitioned_replicated_merge_tree_table( @@ -464,6 +466,8 @@ def partitioned_replicated_merge_tree_table( number_of_parts=number_of_parts, ) + return table_name + @TestStep(Given) def partitioned_replacing_merge_tree_table( @@ -497,6 +501,8 @@ def partitioned_replacing_merge_tree_table( number_of_parts=number_of_parts, ) + return table_name + @TestStep(Given) def partitioned_summing_merge_tree_table( @@ -530,6 +536,8 @@ def partitioned_summing_merge_tree_table( number_of_parts=number_of_parts, ) + return table_name + @TestStep(Given) def partitioned_collapsing_merge_tree_table( @@ -564,6 +572,8 @@ def partitioned_collapsing_merge_tree_table( number_of_parts=number_of_parts, ) + return table_name + @TestStep(Given) def partitioned_versioned_collapsing_merge_tree_table( @@ -599,6 +609,8 @@ def partitioned_versioned_collapsing_merge_tree_table( number_of_parts=number_of_parts, ) + return table_name + @TestStep(Given) def partitioned_aggregating_merge_tree_table( @@ -632,6 +644,8 @@ def partitioned_aggregating_merge_tree_table( number_of_parts=number_of_parts, ) + return table_name + @TestStep(Given) def partitioned_graphite_merge_tree_table( @@ -665,3 +679,5 @@ def partitioned_graphite_merge_tree_table( number_of_partitions=number_of_partitions, number_of_parts=number_of_parts, ) + + return table_name \ No newline at end of file diff --git a/s3/tests/export_part/engines_volumes.py b/s3/tests/export_part/engines_volumes.py index d944cc1d5..8f7eec45b 100644 --- a/s3/tests/export_part/engines_volumes.py +++ 
b/s3/tests/export_part/engines_volumes.py @@ -3,6 +3,8 @@ from s3.tests.export_part.steps import * from s3.requirements.export_part import * from helpers.queries import * +from helpers.common import getuid +from testflows.combinatorics import product @TestCheck @@ -10,8 +12,8 @@ def configured_table(self, table_engine, number_of_partitions, number_of_parts): """Test a specific combination of table engine, number of partitions, and number of parts.""" with Given("I create a populated source table and empty S3 table"): - table_engine( - table_name="source", + source_table = table_engine( + table_name=f"source_{getuid()}", partition_by="p", stop_merges=True, number_of_partitions=number_of_partitions, @@ -26,14 +28,14 @@ def configured_table(self, table_engine, number_of_partitions, number_of_parts): with When("I export parts to the S3 table"): export_parts( - source_table="source", + source_table=source_table, destination_table=s3_table_name, node=self.context.node, ) with Then("Source and destination tables should match"): source_matches_destination( - source_table="source", + source_table=source_table, destination_table=s3_table_name, ) @@ -56,18 +58,21 @@ def table_combos(self): number_of_partitions = [5] if not self.context.stress else [1, 5, 10] number_of_parts = [1] if not self.context.stress else [1, 5, 10] - table_engine = either(*tables) - number_of_partitions = either(*number_of_partitions) - number_of_parts = either(*number_of_parts) + combinations = product(tables, number_of_partitions, number_of_parts) - Combination( - name=f"{table_engine.__name__} partitions={number_of_partitions} parts={number_of_parts}", - test=configured_table, - )( - table_engine=table_engine, - number_of_partitions=number_of_partitions, - number_of_parts=number_of_parts, - ) + with Pool(16) as executor: + for (table_engine, number_of_partitions, number_of_parts) in combinations: + Combination( + name=f"{table_engine.__name__} partitions={number_of_partitions} parts={number_of_parts}", + test=configured_table, + executor=executor, + parallel=True, + )( + table_engine=table_engine, + number_of_partitions=number_of_partitions, + number_of_parts=number_of_parts, + ) + join() @TestCheck @@ -75,8 +80,8 @@ def configured_volume(self, volume): """Test a specific combination of volume.""" with Given(f"I create an empty source table on volume {volume} and empty S3 table"): - partitioned_merge_tree_table( - table_name="source", + source_table = partitioned_merge_tree_table( + table_name=f"source_{getuid()}", partition_by="p", columns=default_columns(), stop_merges=True, @@ -87,21 +92,21 @@ def configured_volume(self, volume): with And("I populate the source table with parts exceeding 2KB each"): create_partitions_with_random_uint64( - table_name="source", + table_name=source_table, node=self.context.node, number_of_values=500, ) with When("I export parts to the S3 table"): export_parts( - source_table="source", + source_table=source_table, destination_table=s3_table_name, node=self.context.node, ) with Then("Source and destination tables should match"): source_matches_destination( - source_table="source", + source_table=source_table, destination_table=s3_table_name, ) @@ -121,14 +126,19 @@ def volume_combos(self): "external2", "tiered_storage", ] - volume = either(*volumes) - - Combination( - name=f"volume={volume}", - test=configured_volume, - )( - volume=volume, - ) + combinations = product(volumes) + + with Pool(16) as executor: + for (volume,) in combinations: + Combination( + name=f"volume={volume}", + 
test=configured_volume, + executor=executor, + parallel=True, + )( + volume=volume, + ) + join() @TestFeature From ac45d0dba25659aecc25e10488cbb9b331115615 Mon Sep 17 00:00:00 2001 From: Selfeer Date: Fri, 7 Nov 2025 01:43:53 +0400 Subject: [PATCH 77/99] remove storage.xml, move the logic to dynamically generate this file on the test run --- s3/s3_env/clickhouse-service.yml | 1 - s3/tests/export_part/engines_volumes.py | 6 ++- s3/tests/export_part/steps.py | 58 ++++++++++++++++++++++++- 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/s3/s3_env/clickhouse-service.yml b/s3/s3_env/clickhouse-service.yml index 0745cc3e1..c766d2085 100755 --- a/s3/s3_env/clickhouse-service.yml +++ b/s3/s3_env/clickhouse-service.yml @@ -21,7 +21,6 @@ services: - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/config.d/ssl.xml:/etc/clickhouse-server/config.d/ssl.xml" - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/config.d/system_unfreeze.xml:/etc/clickhouse-server/config.d/system_unfreeze.xml" - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/ssl:/etc/clickhouse-server/ssl" - - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/config.d/storage.xml:/etc/clickhouse-server/config.d/storage.xml" - "${CLICKHOUSE_TESTS_DIR}/configs/clickhouse/users.d/output_format_json_quote_64bit_integers.xml:/etc/clickhouse-server/users.d/output_format_json_quote_64bit_integers.xml" cap_add: - NET_ADMIN \ No newline at end of file diff --git a/s3/tests/export_part/engines_volumes.py b/s3/tests/export_part/engines_volumes.py index 8f7eec45b..150f409f8 100644 --- a/s3/tests/export_part/engines_volumes.py +++ b/s3/tests/export_part/engines_volumes.py @@ -2,6 +2,7 @@ from testflows.asserts import error from s3.tests.export_part.steps import * from s3.requirements.export_part import * +from s3.tests.common import s3_storage from helpers.queries import * from helpers.common import getuid from testflows.combinatorics import product @@ -61,7 +62,7 @@ def table_combos(self): combinations = product(tables, number_of_partitions, number_of_parts) with Pool(16) as executor: - for (table_engine, number_of_partitions, number_of_parts) in combinations: + for table_engine, number_of_partitions, number_of_parts in combinations: Combination( name=f"{table_engine.__name__} partitions={number_of_partitions} parts={number_of_parts}", test=configured_table, @@ -148,5 +149,8 @@ def feature(self): # TODO replicated merge tree tables (all types) + with Given("I set up MinIO storage configuration"): + minio_storage_configuration(restart=True) + Scenario(run=table_combos) Scenario(run=volume_combos) diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 8026baa39..5a3490670 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -5,7 +5,63 @@ from helpers.common import getuid from helpers.create import * from helpers.queries import * -from s3.tests.common import temporary_bucket_path +from s3.tests.common import temporary_bucket_path, s3_storage + + +@TestStep(Given) +def minio_storage_configuration(self, restart=True): + """Create storage configuration with jbod disks, MinIO S3 disk, and tiered storage policy.""" + with Given( + "I configure storage with jbod disks, MinIO S3 disk, and tiered storage" + ): + disks = { + "jbod1": {"path": "/jbod1/"}, + "jbod2": {"path": "/jbod2/"}, + "jbod3": {"path": "/jbod3/"}, + "jbod4": {"path": "/jbod4/"}, + "external": {"path": "/external/"}, + "external2": {"path": "/external2/"}, + "minio": { + "type": "s3", + "endpoint": "http://minio1:9001/root/data/", + 
"access_key_id": "minio_user", + "secret_access_key": "minio123", + }, + "s3_cache": { + "type": "cache", + "disk": "minio", + "path": "minio_cache/", + "max_size": "22548578304", + "cache_on_write_operations": "1", + }, + } + + policies = { + "jbod1": {"volumes": {"main": {"disk": "jbod1"}}}, + "jbod2": {"volumes": {"main": {"disk": "jbod2"}}}, + "jbod3": {"volumes": {"main": {"disk": "jbod3"}}}, + "jbod4": {"volumes": {"main": {"disk": "jbod4"}}}, + "external": {"volumes": {"main": {"disk": "external"}}}, + "external2": {"volumes": {"main": {"disk": "external2"}}}, + "tiered_storage": { + "volumes": { + "hot": [ + {"disk": "jbod1"}, + {"disk": "jbod2"}, + {"max_data_part_size_bytes": "2048"}, + ], + "cold": [ + {"disk": "external"}, + {"disk": "external2"}, + ], + }, + "move_factor": "0.7", + }, + "s3_cache": {"volumes": {"external": {"disk": "s3_cache"}}}, + "minio_external_nocache": {"volumes": {"external": {"disk": "minio"}}}, + } + + s3_storage(disks=disks, policies=policies, restart=restart) def default_columns(simple=True, partition_key_type="UInt8"): From 94603a0de2f6f66f572bb2b355891507316bb31e Mon Sep 17 00:00:00 2001 From: julian Date: Thu, 6 Nov 2025 16:49:55 -0500 Subject: [PATCH 78/99] Drop column query --- helpers/queries.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/helpers/queries.py b/helpers/queries.py index 42f0835f9..bed43009b 100644 --- a/helpers/queries.py +++ b/helpers/queries.py @@ -148,3 +148,12 @@ def get_column_string(self, node: ClickHouseNode, table_name: str, timeout=30) - timeout=timeout, ) return ",".join([l.strip() for l in r.output.splitlines()]) + + +@TestStep(When) +def drop_column(self, node, table_name, column_name): + """Drop a column from a table.""" + + node.query( + f"ALTER TABLE {table_name} DROP COLUMN {column_name}", exitcode=0, steps=True + ) \ No newline at end of file From 68e5e7cc86b024753fac4265711569d95fe55b55 Mon Sep 17 00:00:00 2001 From: julian Date: Thu, 6 Nov 2025 17:03:26 -0500 Subject: [PATCH 79/99] Clean up --- s3/requirements/export_part.md | 1 + s3/requirements/export_part.py | 2 + s3/tests/export_part/concurrency_networks.py | 2 + s3/tests/export_part/engines_volumes.py | 2 +- s3/tests/export_part/feature.py | 2 +- s3/tests/export_part/sanity.py | 1 - s3/tests/export_part/steps.py | 19 +++++-- s3/tests/export_part/system_monitoring.py | 56 +++++++++++--------- 8 files changed, 51 insertions(+), 34 deletions(-) diff --git a/s3/requirements/export_part.md b/s3/requirements/export_part.md index eac61e0d2..b65ae40b9 100644 --- a/s3/requirements/export_part.md +++ b/s3/requirements/export_part.md @@ -480,6 +480,7 @@ version: 1.0 * `PartsExports` - Number of successful part exports * `PartsExportFailures` - Number of failed part exports * `PartsExportDuplicated` - Number of part exports that failed because target already exists +* `PartsExportTotalMilliseconds` - Length of total time taken for parts to export ### RQ.ClickHouse.ExportPart.Metrics.Export version: 1.0 diff --git a/s3/requirements/export_part.py b/s3/requirements/export_part.py index f6072746d..44bc09d63 100644 --- a/s3/requirements/export_part.py +++ b/s3/requirements/export_part.py @@ -713,6 +713,7 @@ "* `PartsExports` - Number of successful part exports\n" "* `PartsExportFailures` - Number of failed part exports \n" "* `PartsExportDuplicated` - Number of part exports that failed because target already exists\n" + "* `PartsExportTotalMilliseconds` - Length of total time taken for parts to export\n" "\n" ), link=None, @@ -1416,6 +1417,7 @@ * 
`PartsExports` - Number of successful part exports * `PartsExportFailures` - Number of failed part exports * `PartsExportDuplicated` - Number of part exports that failed because target already exists +* `PartsExportTotalMilliseconds` - Length of total time taken for parts to export ### RQ.ClickHouse.ExportPart.Metrics.Export version: 1.0 diff --git a/s3/tests/export_part/concurrency_networks.py b/s3/tests/export_part/concurrency_networks.py index 28e4d51bd..e88dd42dc 100644 --- a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -405,6 +405,7 @@ def minio_network_interruption(self, number_of_values=3, signal="KILL"): partition_by="p", columns=default_columns(), number_of_values=number_of_values, + stop_merges=True, ) s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) @@ -449,6 +450,7 @@ def clickhouse_network_interruption(self, safe=False): table_name="source", partition_by="p", columns=default_columns(), + stop_merges=True, ) s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) diff --git a/s3/tests/export_part/engines_volumes.py b/s3/tests/export_part/engines_volumes.py index 150f409f8..8a3c4cf6a 100644 --- a/s3/tests/export_part/engines_volumes.py +++ b/s3/tests/export_part/engines_volumes.py @@ -130,7 +130,7 @@ def volume_combos(self): combinations = product(volumes) with Pool(16) as executor: - for (volume,) in combinations: + for volume, in combinations: Combination( name=f"volume={volume}", test=configured_volume, diff --git a/s3/tests/export_part/feature.py b/s3/tests/export_part/feature.py index 800858ded..c03045ffd 100644 --- a/s3/tests/export_part/feature.py +++ b/s3/tests/export_part/feature.py @@ -23,4 +23,4 @@ def minio(self, uri, bucket_prefix): Feature(run=load("s3.tests.export_part.engines_volumes", "feature")) Feature(run=load("s3.tests.export_part.datatypes", "feature")) Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) - # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) + Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) diff --git a/s3/tests/export_part/sanity.py b/s3/tests/export_part/sanity.py index c4532492a..22809de2e 100644 --- a/s3/tests/export_part/sanity.py +++ b/s3/tests/export_part/sanity.py @@ -94,7 +94,6 @@ def mismatched_columns(self): @TestScenario @Requirements( - RQ_ClickHouse_ExportPart_S3("1.0"), RQ_ClickHouse_ExportPart_SQLCommand("1.0"), ) def basic_table(self): diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 5a3490670..c1e809925 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -226,12 +226,21 @@ def get_export_events(self, node): @TestStep(When) -def drop_column(self, node, table_name, column_name): - """Drop a column from a table.""" +def get_part_log(self, node): + """Get the part log from the system.part_log table of a given node.""" + + output = node.query( + "SELECT name, value FROM system.part_log WHERE name LIKE '%%Export%%' FORMAT JSONEachRow", + exitcode=0, + steps=True, + ).output - node.query( - f"ALTER TABLE {table_name} DROP COLUMN {column_name}", exitcode=0, steps=True - ) + events = {} + for line in output.strip().splitlines(): + event = json.loads(line) + events[event["name"]] = int(event["value"]) + + return events @TestStep(Then) diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index 51fdce972..a134dbbcf 100644 --- a/s3/tests/export_part/system_monitoring.py 
+++ b/s3/tests/export_part/system_monitoring.py @@ -4,20 +4,9 @@ from s3.requirements.export_part import * -# TODO -# part_log is where to look -# overwrite file -# max bandwidth -# some of system.events stuff wont appear unless i set this maybe? just a guess -# system.events -# Export row in system.metrics?? -# partsexports incrementing correctly -# duplicates incrementing correctly - - @TestScenario -def part_exports(self): - """Check part exports are properly tracked in system.part_log.""" +def part_logging(self): + """Check part exports are logged correctly in both system.events and system.part_log.""" with Given("I create a populated source table and empty S3 table"): partitioned_merge_tree_table( @@ -29,10 +18,10 @@ def part_exports(self): s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) with And("I read the initial logged number of part exports"): - initial_exports = get_export_events( - node=self.context.node - ) # .get("PartsExports", 0) - note(f"Initial exports: {initial_exports}") + initial_events = get_export_events(node=self.context.node) + initial_part_log = get_part_log(node=self.context.node) + note(f"Initial events: {initial_events}") + note(f"Initial part log: {initial_part_log}") # with When("I export parts to the S3 table"): # export_parts( @@ -54,8 +43,8 @@ def part_exports(self): @TestScenario -def duplicate_exports(self): - """Check duplicate exports are ignored and not exported again.""" +def duplicate_logging(self): + """Check duplicate exports are logged correctly in system.events.""" with Given("I create a populated source table and empty S3 table"): partitioned_merge_tree_table( @@ -66,6 +55,9 @@ def duplicate_exports(self): ) s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + with And("I read the initial export events"): + initial_events = get_export_events(node=self.context.node) + with When("I try to export the parts twice"): export_parts( source_table="source", @@ -78,17 +70,19 @@ def duplicate_exports(self): node=self.context.node, ) - # with And("I read the initial export events"): - with Then("Check source matches destination"): source_matches_destination( source_table="source", destination_table=s3_table_name, ) - with And("Check logs for duplicate exports"): - export_events = get_export_events(node=self.context.node) - note(export_events["PartsExports"]) + with And("Check logs for correct number of duplicate exports"): + final_events = get_export_events(node=self.context.node) + assert ( + final_events["PartsExportDuplicated"] + - initial_events["PartsExportDuplicated"] + == 5 + ), error() @TestFeature @@ -97,5 +91,15 @@ def duplicate_exports(self): def feature(self): """Check system monitoring of export events.""" - # Scenario(run=part_exports) - Scenario(run=duplicate_exports) + # TODO + # part_log is where to look + # overwrite file + # max bandwidth + # some of system.events stuff wont appear unless i set this maybe? just a guess + # system.events + # Export row in system.metrics?? 
+ # partsexports incrementing correctly + # duplicates incrementing correctly + + Scenario(run=part_logging) + # Scenario(run=duplicate_logging) From cd192a50ec924019dc7fe8f3bd502dcc16a30567 Mon Sep 17 00:00:00 2001 From: julian Date: Thu, 6 Nov 2025 17:04:47 -0500 Subject: [PATCH 80/99] black --- s3/tests/export_part/engines_volumes.py | 2 +- s3/tests/export_part/steps.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/s3/tests/export_part/engines_volumes.py b/s3/tests/export_part/engines_volumes.py index 8a3c4cf6a..150f409f8 100644 --- a/s3/tests/export_part/engines_volumes.py +++ b/s3/tests/export_part/engines_volumes.py @@ -130,7 +130,7 @@ def volume_combos(self): combinations = product(volumes) with Pool(16) as executor: - for volume, in combinations: + for (volume,) in combinations: Combination( name=f"volume={volume}", test=configured_volume, diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index c1e809925..294a572ae 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -228,7 +228,7 @@ def get_export_events(self, node): @TestStep(When) def get_part_log(self, node): """Get the part log from the system.part_log table of a given node.""" - + output = node.query( "SELECT name, value FROM system.part_log WHERE name LIKE '%%Export%%' FORMAT JSONEachRow", exitcode=0, From 01228444e9f31bd690bde2208bca33f14014440d Mon Sep 17 00:00:00 2001 From: julian Date: Thu, 6 Nov 2025 17:24:34 -0500 Subject: [PATCH 81/99] Basic system logs --- s3/tests/export_part/steps.py | 11 ++----- s3/tests/export_part/system_monitoring.py | 40 ++++++++++------------- 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 294a572ae..21cd80537 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -230,17 +230,12 @@ def get_part_log(self, node): """Get the part log from the system.part_log table of a given node.""" output = node.query( - "SELECT name, value FROM system.part_log WHERE name LIKE '%%Export%%' FORMAT JSONEachRow", + "SELECT part_name FROM system.part_log WHERE event_type = 'ExportPart'", exitcode=0, steps=True, - ).output - - events = {} - for line in output.strip().splitlines(): - event = json.loads(line) - events[event["name"]] = int(event["value"]) + ).output.splitlines() - return events + return output @TestStep(Then) diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index a134dbbcf..14db26b42 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -17,29 +17,27 @@ def part_logging(self): ) s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) - with And("I read the initial logged number of part exports"): + with And("I read the initial logged export events"): initial_events = get_export_events(node=self.context.node) - initial_part_log = get_part_log(node=self.context.node) - note(f"Initial events: {initial_events}") - note(f"Initial part log: {initial_part_log}") - # with When("I export parts to the S3 table"): - # export_parts( - # source_table="source", - # destination_table=s3_table_name, - # node=self.context.node, - # ) - - # with And("I read the final logged number of part exports"): - # final_exports = get_export_events(node=self.context.node).get("PartsExports", 0) - - # with Then("I check that the number of part exports is correct"): + with When("I export parts to the S3 table"): + export_parts( + 
source_table="source", + destination_table=s3_table_name, + node=self.context.node, + ) - # with By("Reading the number of parts for the source table"): - # num_parts = len(get_parts(table_name="source", node=self.context.node)) + with And("I read the final logged export events and part log"): + final_events = get_export_events(node=self.context.node) + part_log = get_part_log(node=self.context.node) - # with And("Checking that the before and after difference is correct"): - # assert final_exports - initial_exports == num_parts, error() + with Then("I check that the number of part exports is correct"): + assert final_events["PartsExports"] - initial_events["PartsExports"] == 5, error() + + with And("I check that the part log contains the correct parts"): + parts = get_parts(table_name="source", node=self.context.node) + for part in parts: + assert part in part_log, error() @TestScenario @@ -98,8 +96,6 @@ def feature(self): # some of system.events stuff wont appear unless i set this maybe? just a guess # system.events # Export row in system.metrics?? - # partsexports incrementing correctly - # duplicates incrementing correctly Scenario(run=part_logging) - # Scenario(run=duplicate_logging) + Scenario(run=duplicate_logging) From d6ee3329a794322855b57d37344c3a7211072b75 Mon Sep 17 00:00:00 2001 From: julian Date: Thu, 6 Nov 2025 17:26:16 -0500 Subject: [PATCH 82/99] Clean up --- s3/tests/export_part/system_monitoring.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index 14db26b42..f69d6778a 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -89,13 +89,5 @@ def duplicate_logging(self): def feature(self): """Check system monitoring of export events.""" - # TODO - # part_log is where to look - # overwrite file - # max bandwidth - # some of system.events stuff wont appear unless i set this maybe? just a guess - # system.events - # Export row in system.metrics?? 
- Scenario(run=part_logging) Scenario(run=duplicate_logging) From b1076cba2d459a5c40fcc1ee0c76b4c50b56e96e Mon Sep 17 00:00:00 2001 From: julian Date: Thu, 6 Nov 2025 17:26:28 -0500 Subject: [PATCH 83/99] black --- s3/tests/export_part/system_monitoring.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index f69d6778a..f8cd333fb 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -32,8 +32,10 @@ def part_logging(self): part_log = get_part_log(node=self.context.node) with Then("I check that the number of part exports is correct"): - assert final_events["PartsExports"] - initial_events["PartsExports"] == 5, error() - + assert ( + final_events["PartsExports"] - initial_events["PartsExports"] == 5 + ), error() + with And("I check that the part log contains the correct parts"): parts = get_parts(table_name="source", node=self.context.node) for part in parts: From 58a3827356c5933ea6c20a266bcf68c422b8ac98 Mon Sep 17 00:00:00 2001 From: Selfeer Date: Fri, 7 Nov 2025 16:12:09 +0400 Subject: [PATCH 84/99] bring back cluster list for minio --- s3/regression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/s3/regression.py b/s3/regression.py index 5a24598d3..97f8c6cdf 100755 --- a/s3/regression.py +++ b/s3/regression.py @@ -566,8 +566,8 @@ def minio_regression( # with And("I add all possible clusters for nodes"): # add_clusters_for_nodes(nodes=nodes["clickhouse"], modify=True) - # with And("I get all possible clusters for nodes"): - # self.context.clusters = get_clusters_for_nodes(nodes=nodes["clickhouse"]) + with And("I get all possible clusters for nodes"): + self.context.clusters = get_clusters_for_nodes(nodes=nodes["clickhouse"]) with Feature("part 1"): Feature(test=load("s3.tests.sanity", "minio"))(uri=uri_bucket_file) From a8c07bea258380d6e63cd34523106b34d56be426 Mon Sep 17 00:00:00 2001 From: julian Date: Fri, 7 Nov 2025 09:45:51 -0500 Subject: [PATCH 85/99] Move minio steps --- s3/tests/export_part/concurrency_networks.py | 61 -------------------- s3/tests/export_part/steps.py | 61 ++++++++++++++++++++ s3/tests/export_part/system_monitoring.py | 10 ++++ 3 files changed, 71 insertions(+), 61 deletions(-) diff --git a/s3/tests/export_part/concurrency_networks.py b/s3/tests/export_part/concurrency_networks.py index e88dd42dc..503924f50 100644 --- a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -333,67 +333,6 @@ def concurrent_insert(self): assert len(source_data) == 15, error() -@TestStep(When) -def kill_minio(self, cluster=None, container_name="s3_env-minio1-1", signal="KILL"): - """Forcefully kill MinIO container to simulate network crash.""" - - if cluster is None: - cluster = self.context.cluster - - retry(cluster.command, 5)( - None, - f"docker kill --signal={signal} {container_name}", - timeout=60, - exitcode=0, - steps=False, - ) - - if signal == "TERM": - with And("waiting for MinIO container to stop"): - for attempt in retries(timeout=30, delay=1): - with attempt: - result = cluster.command( - None, - f"docker ps --filter name={container_name} --format '{{{{.Names}}}}'", - timeout=10, - steps=False, - no_checks=True, - ) - if container_name not in result.output: - break - fail("MinIO container still running") - - -@TestStep(When) -def start_minio(self, cluster=None, container_name="s3_env-minio1-1", timeout=300): - """Start MinIO container and wait 
for it to be ready.""" - - if cluster is None: - cluster = self.context.cluster - - with By("starting MinIO container"): - retry(cluster.command, 5)( - None, - f"docker start {container_name}", - timeout=timeout, - exitcode=0, - steps=True, - ) - - with And("waiting for MinIO to be ready"): - for attempt in retries(timeout=timeout, delay=1): - with attempt: - result = cluster.command( - None, - f"docker exec {container_name} curl -f http://localhost:9001/minio/health/live", - timeout=10, - steps=False, - no_checks=True, - ) - if result.exitcode != 0: - fail("MinIO health check failed") - - @TestScenario @Requirements(RQ_ClickHouse_ExportPart_NetworkResilience_DestinationInterruption("1.0")) def minio_network_interruption(self, number_of_values=3, signal="KILL"): diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index 21cd80537..cf483b67f 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -151,6 +151,67 @@ def create_s3_table( return table_name +@TestStep(When) +def kill_minio(self, cluster=None, container_name="s3_env-minio1-1", signal="KILL"): + """Forcefully kill MinIO container to simulate network crash.""" + + if cluster is None: + cluster = self.context.cluster + + retry(cluster.command, 5)( + None, + f"docker kill --signal={signal} {container_name}", + timeout=60, + exitcode=0, + steps=False, + ) + + if signal == "TERM": + with And("Waiting for MinIO container to stop"): + for attempt in retries(timeout=30, delay=1): + with attempt: + result = cluster.command( + None, + f"docker ps --filter name={container_name} --format '{{{{.Names}}}}'", + timeout=10, + steps=False, + no_checks=True, + ) + if container_name not in result.output: + break + fail("MinIO container still running") + + +@TestStep(When) +def start_minio(self, cluster=None, container_name="s3_env-minio1-1"): + """Start MinIO container and wait for it to be ready.""" + + if cluster is None: + cluster = self.context.cluster + + with By("Starting MinIO container"): + retry(cluster.command, 5)( + None, + f"docker start {container_name}", + timeout=60, + exitcode=0, + steps=True, + ) + + with And("Waiting for MinIO to be ready"): + for attempt in retries(timeout=30, delay=1): + with attempt: + result = cluster.command( + None, + f"docker exec {container_name} curl -f http://localhost:9001/minio/health/live", + timeout=10, + steps=False, + no_checks=True, + ) + if result.exitcode != 0: + fail("MinIO health check failed") + + @TestStep(When) def get_parts(self, table_name, node): """Get all parts for a table on a given node.""" diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index f8cd333fb..b396da234 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -85,6 +85,16 @@ def duplicate_logging(self): ), error() +@TestScenario +def background_move_pool_size(self): + pass + + +@TestScenario +def system_exports_logging(self): + pass + + @TestFeature @Name("system monitoring") @Requirements(RQ_ClickHouse_ExportPart_Logging("1.0")) From b15049bafbf494ee8dac4da1d5475c7298c416a8 Mon Sep 17 00:00:00 2001 From: julian Date: Fri, 7 Nov 2025 18:28:29 -0500 Subject: [PATCH 86/99] Concurrent export verification via part_log --- s3/tests/export_part/concurrency_networks.py | 29 ++++++++------- s3/tests/export_part/steps.py | 38 ++++++++++++++++++++ 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/s3/tests/export_part/concurrency_networks.py 
b/s3/tests/export_part/concurrency_networks.py index 503924f50..d5c8b67ff 100644 --- a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -8,23 +8,25 @@ @TestScenario @Requirements(RQ_ClickHouse_ExportPart_Concurrency("1.0")) -def basic_concurrent_export(self, threads): +def concurrent_export(self, num_tables): """Check concurrent exports from different sources to the same S3 table.""" - with Given(f"I create {threads} populated source tables and an empty S3 table"): - for i in range(threads): - partitioned_merge_tree_table( - table_name=f"source{i}", + with Given(f"I create {num_tables} populated source tables and an empty S3 table"): + source_tables = [] + for i in range(num_tables): + source_tables.append(partitioned_merge_tree_table( + table_name=f"source_{getuid()}", partition_by="p", - columns=default_columns(), - stop_merges=True, + columns=default_columns(), + stop_merges=True, + ) ) s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) with When("I export parts from all sources concurrently to the S3 table"): - for i in range(threads): + for i in range(num_tables): Step(test=export_parts, parallel=True)( - source_table=f"source{i}", + source_table=source_tables[i], destination_table=s3_table_name, node=self.context.node, ) @@ -32,8 +34,8 @@ def basic_concurrent_export(self, threads): with And("I read data from all tables"): source_data = [] - for i in range(threads): - data = select_all_ordered(table_name=f"source{i}", node=self.context.node) + for i in range(num_tables): + data = select_all_ordered(table_name=source_tables[i], node=self.context.node) source_data.extend(data) destination_data = select_all_ordered( table_name=s3_table_name, node=self.context.node @@ -42,6 +44,9 @@ def basic_concurrent_export(self, threads): with Then("All data should be present in the S3 table"): assert set(source_data) == set(destination_data), error() + with And("Exports should have run concurrently"): + verify_export_concurrency(node=self.context.node, source_tables=source_tables) + @TestScenario @Requirements(RQ_ClickHouse_ExportPart_NetworkResilience_PacketIssues("1.0")) @@ -431,7 +436,7 @@ def feature(self): # TODO corruption (bit flipping) - Scenario(test=basic_concurrent_export)(threads=5) + Scenario(test=concurrent_export)(num_tables=5) Scenario(test=packet_delay)(delay_ms=100) Scenario(test=packet_loss)(percent_loss=50) Scenario(test=packet_loss_gemodel)( diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index cf483b67f..dd9f98eb9 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -315,3 +315,41 @@ def source_matches_destination( table_name=destination_table, node=destination_node ) assert source_data == destination_data, error() + + +@TestStep(Then) +def verify_export_concurrency(self, node, source_tables): + """Verify exports from different tables ran concurrently by checking overlapping execution times. + + Checks that for each table, there's at least one pair of consecutive exports from that table + with an export from another table in between, confirming concurrent execution. 
+ """ + + table_filter = " OR ".join([f"table = '{table}'" for table in source_tables]) + + query = f""" + SELECT + table + FROM system.part_log + WHERE event_type = 'ExportPart' + AND ({table_filter}) + ORDER BY event_time_microseconds + """ + + result = node.query(query, exitcode=0, steps=True) + + exports = [line for line in result.output.strip().splitlines()] + + tables_done = set() + + for i in range(len(exports) - 1): + current_table = exports[i] + next_table = exports[i + 1] + + if current_table != next_table and current_table not in tables_done: + for j in range(i + 2, len(exports)): + if exports[j] == current_table: + tables_done.add(current_table) + break + + assert len(tables_done) == len(source_tables), error() \ No newline at end of file From af58375756ea6d27780583a72957c8bb244c2aa0 Mon Sep 17 00:00:00 2001 From: julian Date: Fri, 7 Nov 2025 20:12:57 -0500 Subject: [PATCH 87/99] System.exports logging --- s3/tests/export_part/concurrency_networks.py | 11 +++--- s3/tests/export_part/steps.py | 29 ++++++++++----- s3/tests/export_part/system_monitoring.py | 37 ++++++++++++++++++-- 3 files changed, 62 insertions(+), 15 deletions(-) diff --git a/s3/tests/export_part/concurrency_networks.py b/s3/tests/export_part/concurrency_networks.py index d5c8b67ff..31f731ee0 100644 --- a/s3/tests/export_part/concurrency_networks.py +++ b/s3/tests/export_part/concurrency_networks.py @@ -14,9 +14,10 @@ def concurrent_export(self, num_tables): with Given(f"I create {num_tables} populated source tables and an empty S3 table"): source_tables = [] for i in range(num_tables): - source_tables.append(partitioned_merge_tree_table( - table_name=f"source_{getuid()}", - partition_by="p", + source_tables.append( + partitioned_merge_tree_table( + table_name=f"source_{getuid()}", + partition_by="p", columns=default_columns(), stop_merges=True, ) @@ -35,7 +36,9 @@ def concurrent_export(self, num_tables): with And("I read data from all tables"): source_data = [] for i in range(num_tables): - data = select_all_ordered(table_name=source_tables[i], node=self.context.node) + data = select_all_ordered( + table_name=source_tables[i], node=self.context.node + ) source_data.extend(data) destination_data = select_all_ordered( table_name=s3_table_name, node=self.context.node diff --git a/s3/tests/export_part/steps.py b/s3/tests/export_part/steps.py index dd9f98eb9..0bea888c8 100644 --- a/s3/tests/export_part/steps.py +++ b/s3/tests/export_part/steps.py @@ -299,6 +299,19 @@ def get_part_log(self, node): return output +@TestStep(When) +def get_system_exports(self, node): + """Get the system.exports source and destination table columns for all ongoing exports.""" + + exports = node.query( + "SELECT source_table, destination_table FROM system.exports", + exitcode=0, + steps=True, + ).output.splitlines() + + return [line.strip().split("\t") for line in exports] + + @TestStep(Then) def source_matches_destination( self, source_table, destination_table, source_node=None, destination_node=None @@ -320,13 +333,13 @@ def source_matches_destination( @TestStep(Then) def verify_export_concurrency(self, node, source_tables): """Verify exports from different tables ran concurrently by checking overlapping execution times. - + Checks that for each table, there's at least one pair of consecutive exports from that table with an export from another table in between, confirming concurrent execution. 
""" table_filter = " OR ".join([f"table = '{table}'" for table in source_tables]) - + query = f""" SELECT table @@ -335,21 +348,21 @@ def verify_export_concurrency(self, node, source_tables): AND ({table_filter}) ORDER BY event_time_microseconds """ - + result = node.query(query, exitcode=0, steps=True) - + exports = [line for line in result.output.strip().splitlines()] - + tables_done = set() - + for i in range(len(exports) - 1): current_table = exports[i] next_table = exports[i + 1] - + if current_table != next_table and current_table not in tables_done: for j in range(i + 2, len(exports)): if exports[j] == current_table: tables_done.add(current_table) break - assert len(tables_done) == len(source_tables), error() \ No newline at end of file + assert len(tables_done) == len(source_tables), error() diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index b396da234..2b15c4d3e 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -2,9 +2,11 @@ from testflows.asserts import error from s3.tests.export_part.steps import * from s3.requirements.export_part import * +from time import sleep @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_Logging("1.0")) def part_logging(self): """Check part exports are logged correctly in both system.events and system.part_log.""" @@ -86,12 +88,40 @@ def duplicate_logging(self): @TestScenario -def background_move_pool_size(self): - pass +def system_exports_logging(self): + """Check that system.exports table tracks export operations before they complete.""" + + with Given( + "I create a populated source table with large enough parts and empty S3 table" + ): + source_table = partitioned_merge_tree_table( + table_name=f"source_{getuid()}", + partition_by="p", + columns=default_columns(), + stop_merges=True, + number_of_values=1000000, + ) + s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) + + with When("I export parts to the S3 table"): + export_parts( + source_table=source_table, + destination_table=s3_table_name, + node=self.context.node, + ) + + with Then("I check that system.exports contains some relevant parts"): + exports = get_system_exports(node=self.context.node) + assert len(exports) > 0, error() + assert [source_table, s3_table_name] in exports, error() + + with And("I verify that system.exports empties after exports complete"): + sleep(5) + assert len(get_system_exports(node=self.context.node)) == 0, error() @TestScenario -def system_exports_logging(self): +def background_move_pool_size(self): pass @@ -103,3 +133,4 @@ def feature(self): Scenario(run=part_logging) Scenario(run=duplicate_logging) + Scenario(run=system_exports_logging) From 35c13d772f1572c8a5c59c69f469492fb508826a Mon Sep 17 00:00:00 2001 From: julian Date: Fri, 7 Nov 2025 20:38:17 -0500 Subject: [PATCH 88/99] Requirements update --- s3/requirements/export_part.md | 63 +++-------- s3/requirements/export_part.py | 128 ++++++---------------- s3/tests/export_part/system_monitoring.py | 2 + 3 files changed, 51 insertions(+), 142 deletions(-) diff --git a/s3/requirements/export_part.md b/s3/requirements/export_part.md index b65ae40b9..910983769 100644 --- a/s3/requirements/export_part.md +++ b/s3/requirements/export_part.md @@ -351,10 +351,9 @@ version: 1.0 version: 1.0 [ClickHouse] SHALL validate source part availability by: - * Checking that the specified part exists in the source table * Verifying the part is in an active state (not detached or missing) -* Throwing 
a `NO_SUCH_DATA_PART` exception with message "No such data part '{}' to export in table '{}'" when the part is not found +* Throwing an exception with message containing "Unexpected part name" when the part is not found * Performing this validation before creating the export manifest ## Export operation concurrency @@ -377,12 +376,11 @@ version: 1.0 ### RQ.ClickHouse.ExportPart.Idempotency version: 1.0 -[ClickHouse] SHALL ensure export operations are idempotent by: - -* Allowing the same part to be exported multiple times safely without data corruption -* Supporting file overwrite control through the `export_merge_tree_part_overwrite_file_if_exists` setting -* Generating unique file names using part name and checksum to avoid conflicts -* Maintaining export state consistency across retries +[ClickHouse] SHALL handle duplicate export operations by: +* Preventing duplicate data from being exported when the same part is exported multiple times to the same destination +* Detecting when an export operation attempts to export a part that already exists in the destination +* Logging duplicate export attempts in the `system.events` table with the `PartsExportDuplicated` counter +* Ensuring that destination data matches source data without duplication when the same part is exported multiple times ## Export operation logging @@ -391,34 +389,12 @@ version: 1.0 [ClickHouse] SHALL provide detailed logging for export operations by: * Logging all export operations (both successful and failed) with timestamps and details -* Recording the specific part name and destination for all operations -* Including execution time and progress information for all operations -* Writing operation information to the `system.part_log` table with the following columns: - * `hostname` - Hostname of the server where the export operation occurred - * `query_id` - Query ID of the export operation - * `event_type` - Set to `EXPORT_PART` for export operations - * `event_date` - Date when the export operation occurred - * `event_time` - Timestamp when the export operation occurred - * `event_time_microseconds` - Timestamp with microsecond precision - * `duration_ms` - Execution time in milliseconds - * `database` - Source database name - * `table` - Source table name - * `table_uuid` - UUID of the source table - * `part_name` - Name of the part being exported - * `partition_id` - Partition ID of the part being exported - * `partition` - Partition name of the part being exported - * `part_type` - Type of the part (e.g., Wide, Compact) - * `disk_name` - Name of the disk where the part is stored - * `path_on_disk` - Path to the part in source storage - * `rows` - Number of rows in the part - * `size_in_bytes` - Size of the part in bytes - * `bytes_uncompressed` - Uncompressed size of the part in bytes - * `read_rows` - Number of rows read during export - * `read_bytes` - Number of bytes read during export - * `peak_memory_usage` - Peak memory usage during the export operation - * `error` - Error message if the export failed (empty for successful exports) - * `exception` - Exception details if the export failed - * `ProfileEvents` - Profile events collected during the export operation +* Recording the specific part name in the `system.part_log` table for all operations +* Logging export events in the `system.events` table, including: + * `PartsExports` - Number of successful part exports + * `PartsExportFailures` - Number of failed part exports + * `PartsExportDuplicated` - Number of part exports that failed because target already 
exists +* Writing operation information to the `system.part_log` table with `event_type` set to `EXPORT_PART` * Providing sufficient detail for monitoring and troubleshooting export operations ## Monitoring export operations @@ -426,18 +402,11 @@ version: 1.0 ### RQ.ClickHouse.ExportPart.SystemTables.Exports version: 1.0 -[ClickHouse] SHALL provide a `system.exports` table that allows users to monitor active and completed export operations, track progress metrics, performance statistics, and troubleshoot export issues with the following columns: +[ClickHouse] SHALL provide a `system.exports` table that allows users to monitor active export operations with at least the following columns: +* `source_table` - source table identifier +* `destination_table` - destination table identifier -* `source_database`, `source_table` - source table identifiers -* `destination_database`, `destination_table` - destination table identifiers -* `create_time` - when export was submitted -* `part_name` - name of the exported part -* `destination_file_path` - path in destination storage -* `elapsed` - execution time in seconds -* `rows_read`, `total_rows_to_read` - progress metrics -* `total_size_bytes_compressed`, `total_size_bytes_uncompressed` - size metrics -* `bytes_read_uncompressed` - bytes processed -* `memory_usage`, `peak_memory_usage` - memory consumption +The table SHALL track export operations before they complete and SHALL be empty after all exports complete. ## Enabling export functionality diff --git a/s3/requirements/export_part.py b/s3/requirements/export_part.py index 44bc09d63..9fc23bf81 100644 --- a/s3/requirements/export_part.py +++ b/s3/requirements/export_part.py @@ -503,10 +503,9 @@ uid=None, description=( "[ClickHouse] SHALL validate source part availability by:\n" - "\n" "* Checking that the specified part exists in the source table\n" "* Verifying the part is in an active state (not detached or missing)\n" - "* Throwing a `NO_SUCH_DATA_PART` exception with message \"No such data part '{}' to export in table '{}'\" when the part is not found\n" + '* Throwing an exception with message containing "Unexpected part name" when the part is not found\n' "* Performing this validation before creating the export manifest\n" "\n" ), @@ -547,12 +546,11 @@ type=None, uid=None, description=( - "[ClickHouse] SHALL ensure export operations are idempotent by:\n" - "\n" - "* Allowing the same part to be exported multiple times safely without data corruption\n" - "* Supporting file overwrite control through the `export_merge_tree_part_overwrite_file_if_exists` setting\n" - "* Generating unique file names using part name and checksum to avoid conflicts\n" - "* Maintaining export state consistency across retries\n" + "[ClickHouse] SHALL handle duplicate export operations by:\n" + "* Preventing duplicate data from being exported when the same part is exported multiple times to the same destination\n" + "* Detecting when an export operation attempts to export a part that already exists in the destination\n" + "* Logging duplicate export attempts in the `system.events` table with the `PartsExportDuplicated` counter\n" + "* Ensuring that destination data matches source data without duplication when the same part is exported multiple times\n" "\n" ), link=None, @@ -570,34 +568,12 @@ description=( "[ClickHouse] SHALL provide detailed logging for export operations by:\n" "* Logging all export operations (both successful and failed) with timestamps and details\n" - "* Recording the specific part name and 
destination for all operations\n" - "* Including execution time and progress information for all operations\n" - "* Writing operation information to the `system.part_log` table with the following columns:\n" - " * `hostname` - Hostname of the server where the export operation occurred\n" - " * `query_id` - Query ID of the export operation\n" - " * `event_type` - Set to `EXPORT_PART` for export operations\n" - " * `event_date` - Date when the export operation occurred\n" - " * `event_time` - Timestamp when the export operation occurred\n" - " * `event_time_microseconds` - Timestamp with microsecond precision\n" - " * `duration_ms` - Execution time in milliseconds\n" - " * `database` - Source database name\n" - " * `table` - Source table name\n" - " * `table_uuid` - UUID of the source table\n" - " * `part_name` - Name of the part being exported\n" - " * `partition_id` - Partition ID of the part being exported\n" - " * `partition` - Partition name of the part being exported\n" - " * `part_type` - Type of the part (e.g., Wide, Compact)\n" - " * `disk_name` - Name of the disk where the part is stored\n" - " * `path_on_disk` - Path to the part in source storage\n" - " * `rows` - Number of rows in the part\n" - " * `size_in_bytes` - Size of the part in bytes\n" - " * `bytes_uncompressed` - Uncompressed size of the part in bytes\n" - " * `read_rows` - Number of rows read during export\n" - " * `read_bytes` - Number of bytes read during export\n" - " * `peak_memory_usage` - Peak memory usage during the export operation\n" - " * `error` - Error message if the export failed (empty for successful exports)\n" - " * `exception` - Exception details if the export failed\n" - " * `ProfileEvents` - Profile events collected during the export operation\n" + "* Recording the specific part name in the `system.part_log` table for all operations\n" + "* Logging export events in the `system.events` table, including:\n" + " * `PartsExports` - Number of successful part exports\n" + " * `PartsExportFailures` - Number of failed part exports\n" + " * `PartsExportDuplicated` - Number of part exports that failed because target already exists\n" + "* Writing operation information to the `system.part_log` table with `event_type` set to `EXPORT_PART`\n" "* Providing sufficient detail for monitoring and troubleshooting export operations\n" "\n" ), @@ -614,18 +590,11 @@ type=None, uid=None, description=( - "[ClickHouse] SHALL provide a `system.exports` table that allows users to monitor active and completed export operations, track progress metrics, performance statistics, and troubleshoot export issues with the following columns:\n" + "[ClickHouse] SHALL provide a `system.exports` table that allows users to monitor active export operations with at least the following columns:\n" + "* `source_table` - source table identifier\n" + "* `destination_table` - destination table identifier\n" "\n" - "* `source_database`, `source_table` - source table identifiers\n" - "* `destination_database`, `destination_table` - destination table identifiers \n" - "* `create_time` - when export was submitted\n" - "* `part_name` - name of the exported part\n" - "* `destination_file_path` - path in destination storage\n" - "* `elapsed` - execution time in seconds\n" - "* `rows_read`, `total_rows_to_read` - progress metrics\n" - "* `total_size_bytes_compressed`, `total_size_bytes_uncompressed` - size metrics\n" - "* `bytes_read_uncompressed` - bytes processed\n" - "* `memory_usage`, `peak_memory_usage` - memory consumption\n" + "The table SHALL 
track export operations before they complete and SHALL be empty after all exports complete.\n" "\n" ), link=None, @@ -1288,10 +1257,9 @@ version: 1.0 [ClickHouse] SHALL validate source part availability by: - * Checking that the specified part exists in the source table * Verifying the part is in an active state (not detached or missing) -* Throwing a `NO_SUCH_DATA_PART` exception with message "No such data part '{}' to export in table '{}'" when the part is not found +* Throwing an exception with message containing "Unexpected part name" when the part is not found * Performing this validation before creating the export manifest ## Export operation concurrency @@ -1314,12 +1282,11 @@ ### RQ.ClickHouse.ExportPart.Idempotency version: 1.0 -[ClickHouse] SHALL ensure export operations are idempotent by: - -* Allowing the same part to be exported multiple times safely without data corruption -* Supporting file overwrite control through the `export_merge_tree_part_overwrite_file_if_exists` setting -* Generating unique file names using part name and checksum to avoid conflicts -* Maintaining export state consistency across retries +[ClickHouse] SHALL handle duplicate export operations by: +* Preventing duplicate data from being exported when the same part is exported multiple times to the same destination +* Detecting when an export operation attempts to export a part that already exists in the destination +* Logging duplicate export attempts in the `system.events` table with the `PartsExportDuplicated` counter +* Ensuring that destination data matches source data without duplication when the same part is exported multiple times ## Export operation logging @@ -1328,34 +1295,12 @@ [ClickHouse] SHALL provide detailed logging for export operations by: * Logging all export operations (both successful and failed) with timestamps and details -* Recording the specific part name and destination for all operations -* Including execution time and progress information for all operations -* Writing operation information to the `system.part_log` table with the following columns: - * `hostname` - Hostname of the server where the export operation occurred - * `query_id` - Query ID of the export operation - * `event_type` - Set to `EXPORT_PART` for export operations - * `event_date` - Date when the export operation occurred - * `event_time` - Timestamp when the export operation occurred - * `event_time_microseconds` - Timestamp with microsecond precision - * `duration_ms` - Execution time in milliseconds - * `database` - Source database name - * `table` - Source table name - * `table_uuid` - UUID of the source table - * `part_name` - Name of the part being exported - * `partition_id` - Partition ID of the part being exported - * `partition` - Partition name of the part being exported - * `part_type` - Type of the part (e.g., Wide, Compact) - * `disk_name` - Name of the disk where the part is stored - * `path_on_disk` - Path to the part in source storage - * `rows` - Number of rows in the part - * `size_in_bytes` - Size of the part in bytes - * `bytes_uncompressed` - Uncompressed size of the part in bytes - * `read_rows` - Number of rows read during export - * `read_bytes` - Number of bytes read during export - * `peak_memory_usage` - Peak memory usage during the export operation - * `error` - Error message if the export failed (empty for successful exports) - * `exception` - Exception details if the export failed - * `ProfileEvents` - Profile events collected during the export operation +* Recording the specific 
part name in the `system.part_log` table for all operations +* Logging export events in the `system.events` table, including: + * `PartsExports` - Number of successful part exports + * `PartsExportFailures` - Number of failed part exports + * `PartsExportDuplicated` - Number of part exports that failed because target already exists +* Writing operation information to the `system.part_log` table with `event_type` set to `EXPORT_PART` * Providing sufficient detail for monitoring and troubleshooting export operations ## Monitoring export operations @@ -1363,18 +1308,11 @@ ### RQ.ClickHouse.ExportPart.SystemTables.Exports version: 1.0 -[ClickHouse] SHALL provide a `system.exports` table that allows users to monitor active and completed export operations, track progress metrics, performance statistics, and troubleshoot export issues with the following columns: - -* `source_database`, `source_table` - source table identifiers -* `destination_database`, `destination_table` - destination table identifiers -* `create_time` - when export was submitted -* `part_name` - name of the exported part -* `destination_file_path` - path in destination storage -* `elapsed` - execution time in seconds -* `rows_read`, `total_rows_to_read` - progress metrics -* `total_size_bytes_compressed`, `total_size_bytes_uncompressed` - size metrics -* `bytes_read_uncompressed` - bytes processed -* `memory_usage`, `peak_memory_usage` - memory consumption +[ClickHouse] SHALL provide a `system.exports` table that allows users to monitor active export operations with at least the following columns: +* `source_table` - source table identifier +* `destination_table` - destination table identifier + +The table SHALL track export operations before they complete and SHALL be empty after all exports complete. 
## Enabling export functionality diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index 2b15c4d3e..0d69d2ae5 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -45,6 +45,7 @@ def part_logging(self): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_Idempotency("1.0")) def duplicate_logging(self): """Check duplicate exports are logged correctly in system.events.""" @@ -88,6 +89,7 @@ def duplicate_logging(self): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_SystemTables_Exports("1.0")) def system_exports_logging(self): """Check that system.exports table tracks export operations before they complete.""" From f90515e67e2dc2678b3987b848cff4577b3f614a Mon Sep 17 00:00:00 2001 From: julian Date: Fri, 7 Nov 2025 20:50:45 -0500 Subject: [PATCH 89/99] Revert commented out regression code --- s3/regression.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/s3/regression.py b/s3/regression.py index 97f8c6cdf..274a168dd 100755 --- a/s3/regression.py +++ b/s3/regression.py @@ -552,19 +552,19 @@ def minio_regression( for node in nodes["clickhouse"]: experimental_analyzer(node=cluster.node(node), with_analyzer=with_analyzer) - # with And("I install tc-netem on all clickhouse nodes"): - # for node in self.context.nodes: - # node.command("apt install --yes iproute2 procps") - - # with And("allow higher cpu_wait_ratio "): - # if check_clickhouse_version(">=25.4")(self): - # allow_higher_cpu_wait_ratio( - # min_os_cpu_wait_time_ratio_to_throw=15, - # max_os_cpu_wait_time_ratio_to_throw=25, - # ) - - # with And("I add all possible clusters for nodes"): - # add_clusters_for_nodes(nodes=nodes["clickhouse"], modify=True) + with And("I install tc-netem on all clickhouse nodes"): + for node in self.context.nodes: + node.command("apt install --yes iproute2 procps") + + with And("allow higher cpu_wait_ratio "): + if check_clickhouse_version(">=25.4")(self): + allow_higher_cpu_wait_ratio( + min_os_cpu_wait_time_ratio_to_throw=15, + max_os_cpu_wait_time_ratio_to_throw=25, + ) + + with And("I add all possible clusters for nodes"): + add_clusters_for_nodes(nodes=nodes["clickhouse"], modify=True) with And("I get all possible clusters for nodes"): self.context.clusters = get_clusters_for_nodes(nodes=nodes["clickhouse"]) From af6c72bba1cd68c7e0269581865f8c76aa96ccc8 Mon Sep 17 00:00:00 2001 From: julian Date: Fri, 7 Nov 2025 21:02:39 -0500 Subject: [PATCH 90/99] Req update --- s3/requirements/export_part.md | 126 ++++----- s3/requirements/export_part.py | 328 +++++++++------------- s3/tests/export_part/system_monitoring.py | 1 + 3 files changed, 173 insertions(+), 282 deletions(-) diff --git a/s3/requirements/export_part.md b/s3/requirements/export_part.md index 910983769..c7675153a 100644 --- a/s3/requirements/export_part.md +++ b/s3/requirements/export_part.md @@ -19,56 +19,51 @@ * 7.1 [RQ.ClickHouse.ExportPart.StoragePolicies](#rqclickhouseexportpartstoragepolicies) * 8 [Supported destination table engines](#supported-destination-table-engines) * 8.1 [RQ.ClickHouse.ExportPart.DestinationEngines](#rqclickhouseexportpartdestinationengines) -* 9 [Destination setup and file management](#destination-setup-and-file-management) - * 9.1 [RQ.ClickHouse.ExportPart.DestinationSetup](#rqclickhouseexportpartdestinationsetup) -* 10 [Export data preparation](#export-data-preparation) - * 10.1 [RQ.ClickHouse.ExportPart.DataPreparation](#rqclickhouseexportpartdatapreparation) 
-* 11 [Schema compatibility](#schema-compatibility) - * 11.1 [RQ.ClickHouse.ExportPart.SchemaCompatibility](#rqclickhouseexportpartschemacompatibility) -* 12 [Partition key types support](#partition-key-types-support) - * 12.1 [RQ.ClickHouse.ExportPart.PartitionKeyTypes](#rqclickhouseexportpartpartitionkeytypes) -* 13 [Part types and content support](#part-types-and-content-support) - * 13.1 [RQ.ClickHouse.ExportPart.PartTypes](#rqclickhouseexportpartparttypes) - * 13.2 [RQ.ClickHouse.ExportPart.SchemaChangeIsolation](#rqclickhouseexportpartschemachangeisolation) - * 13.3 [RQ.ClickHouse.ExportPart.LargeParts](#rqclickhouseexportpartlargeparts) -* 14 [Export operation failure handling](#export-operation-failure-handling) - * 14.1 [RQ.ClickHouse.ExportPart.FailureHandling](#rqclickhouseexportpartfailurehandling) -* 15 [Network resilience](#network-resilience) - * 15.1 [RQ.ClickHouse.ExportPart.NetworkResilience.PacketIssues](#rqclickhouseexportpartnetworkresiliencepacketissues) - * 15.2 [RQ.ClickHouse.ExportPart.NetworkResilience.DestinationInterruption](#rqclickhouseexportpartnetworkresiliencedestinationinterruption) - * 15.3 [RQ.ClickHouse.ExportPart.NetworkResilience.NodeInterruption](#rqclickhouseexportpartnetworkresiliencenodeinterruption) -* 16 [Export operation restrictions](#export-operation-restrictions) - * 16.1 [Preventing same table exports](#preventing-same-table-exports) - * 16.1.1 [RQ.ClickHouse.ExportPart.Restrictions.SameTable](#rqclickhouseexportpartrestrictionssametable) - * 16.2 [Destination table compatibility](#destination-table-compatibility) - * 16.2.1 [RQ.ClickHouse.ExportPart.Restrictions.DestinationSupport](#rqclickhouseexportpartrestrictionsdestinationsupport) - * 16.3 [Local table restriction](#local-table-restriction) - * 16.3.1 [RQ.ClickHouse.ExportPart.Restrictions.LocalTable](#rqclickhouseexportpartrestrictionslocaltable) - * 16.4 [Partition key compatibility](#partition-key-compatibility) - * 16.4.1 [RQ.ClickHouse.ExportPart.Restrictions.PartitionKey](#rqclickhouseexportpartrestrictionspartitionkey) - * 16.5 [Source part availability](#source-part-availability) - * 16.5.1 [RQ.ClickHouse.ExportPart.Restrictions.SourcePart](#rqclickhouseexportpartrestrictionssourcepart) -* 17 [Export operation concurrency](#export-operation-concurrency) - * 17.1 [RQ.ClickHouse.ExportPart.Concurrency](#rqclickhouseexportpartconcurrency) -* 18 [Export operation idempotency](#export-operation-idempotency) - * 18.1 [RQ.ClickHouse.ExportPart.Idempotency](#rqclickhouseexportpartidempotency) -* 19 [Export operation logging](#export-operation-logging) - * 19.1 [RQ.ClickHouse.ExportPart.Logging](#rqclickhouseexportpartlogging) -* 20 [Monitoring export operations](#monitoring-export-operations) - * 20.1 [RQ.ClickHouse.ExportPart.SystemTables.Exports](#rqclickhouseexportpartsystemtablesexports) -* 21 [Enabling export functionality](#enabling-export-functionality) - * 21.1 [RQ.ClickHouse.ExportPart.Settings.AllowExperimental](#rqclickhouseexportpartsettingsallowexperimental) -* 22 [Handling file conflicts during export](#handling-file-conflicts-during-export) - * 22.1 [RQ.ClickHouse.ExportPart.Settings.OverwriteFile](#rqclickhouseexportpartsettingsoverwritefile) -* 23 [Export operation configuration](#export-operation-configuration) - * 23.1 [RQ.ClickHouse.ExportPart.ParallelFormatting](#rqclickhouseexportpartparallelformatting) -* 24 [Controlling export performance](#controlling-export-performance) - * 24.1 
[RQ.ClickHouse.ExportPart.ServerSettings.MaxBandwidth](#rqclickhouseexportpartserversettingsmaxbandwidth) -* 25 [Monitoring export performance metrics](#monitoring-export-performance-metrics) - * 25.1 [RQ.ClickHouse.ExportPart.Events](#rqclickhouseexportpartevents) - * 25.2 [RQ.ClickHouse.ExportPart.Metrics.Export](#rqclickhouseexportpartmetricsexport) -* 26 [Export operation security](#export-operation-security) - * 26.1 [RQ.ClickHouse.ExportPart.Security](#rqclickhouseexportpartsecurity) +* 9 [Schema compatibility](#schema-compatibility) + * 9.1 [RQ.ClickHouse.ExportPart.SchemaCompatibility](#rqclickhouseexportpartschemacompatibility) +* 10 [Partition key types support](#partition-key-types-support) + * 10.1 [RQ.ClickHouse.ExportPart.PartitionKeyTypes](#rqclickhouseexportpartpartitionkeytypes) +* 11 [Part types and content support](#part-types-and-content-support) + * 11.1 [RQ.ClickHouse.ExportPart.PartTypes](#rqclickhouseexportpartparttypes) + * 11.2 [RQ.ClickHouse.ExportPart.SchemaChangeIsolation](#rqclickhouseexportpartschemachangeisolation) + * 11.3 [RQ.ClickHouse.ExportPart.LargeParts](#rqclickhouseexportpartlargeparts) +* 12 [Export operation failure handling](#export-operation-failure-handling) + * 12.1 [RQ.ClickHouse.ExportPart.FailureHandling](#rqclickhouseexportpartfailurehandling) +* 13 [Network resilience](#network-resilience) + * 13.1 [RQ.ClickHouse.ExportPart.NetworkResilience.PacketIssues](#rqclickhouseexportpartnetworkresiliencepacketissues) + * 13.2 [RQ.ClickHouse.ExportPart.NetworkResilience.DestinationInterruption](#rqclickhouseexportpartnetworkresiliencedestinationinterruption) + * 13.3 [RQ.ClickHouse.ExportPart.NetworkResilience.NodeInterruption](#rqclickhouseexportpartnetworkresiliencenodeinterruption) +* 14 [Export operation restrictions](#export-operation-restrictions) + * 14.1 [Preventing same table exports](#preventing-same-table-exports) + * 14.1.1 [RQ.ClickHouse.ExportPart.Restrictions.SameTable](#rqclickhouseexportpartrestrictionssametable) + * 14.2 [Destination table compatibility](#destination-table-compatibility) + * 14.2.1 [RQ.ClickHouse.ExportPart.Restrictions.DestinationSupport](#rqclickhouseexportpartrestrictionsdestinationsupport) + * 14.3 [Local table restriction](#local-table-restriction) + * 14.3.1 [RQ.ClickHouse.ExportPart.Restrictions.LocalTable](#rqclickhouseexportpartrestrictionslocaltable) + * 14.4 [Partition key compatibility](#partition-key-compatibility) + * 14.4.1 [RQ.ClickHouse.ExportPart.Restrictions.PartitionKey](#rqclickhouseexportpartrestrictionspartitionkey) + * 14.5 [Source part availability](#source-part-availability) + * 14.5.1 [RQ.ClickHouse.ExportPart.Restrictions.SourcePart](#rqclickhouseexportpartrestrictionssourcepart) +* 15 [Export operation concurrency](#export-operation-concurrency) + * 15.1 [RQ.ClickHouse.ExportPart.Concurrency](#rqclickhouseexportpartconcurrency) +* 16 [Export operation idempotency](#export-operation-idempotency) + * 16.1 [RQ.ClickHouse.ExportPart.Idempotency](#rqclickhouseexportpartidempotency) +* 17 [Export operation logging](#export-operation-logging) + * 17.1 [RQ.ClickHouse.ExportPart.Logging](#rqclickhouseexportpartlogging) +* 18 [Monitoring export operations](#monitoring-export-operations) + * 18.1 [RQ.ClickHouse.ExportPart.SystemTables.Exports](#rqclickhouseexportpartsystemtablesexports) +* 19 [Enabling export functionality](#enabling-export-functionality) + * 19.1 [RQ.ClickHouse.ExportPart.Settings.AllowExperimental](#rqclickhouseexportpartsettingsallowexperimental) +* 20 [Handling file conflicts 
during export](#handling-file-conflicts-during-export) + * 20.1 [RQ.ClickHouse.ExportPart.Settings.OverwriteFile](#rqclickhouseexportpartsettingsoverwritefile) +* 21 [Export operation configuration](#export-operation-configuration) + * 21.1 [RQ.ClickHouse.ExportPart.ParallelFormatting](#rqclickhouseexportpartparallelformatting) +* 22 [Controlling export performance](#controlling-export-performance) + * 22.1 [RQ.ClickHouse.ExportPart.ServerSettings.MaxBandwidth](#rqclickhouseexportpartserversettingsmaxbandwidth) + * 22.2 [RQ.ClickHouse.ExportPart.ServerSettings.BackgroundMovePoolSize](#rqclickhouseexportpartserversettingsbackgroundmovepoolsize) + * 22.3 [RQ.ClickHouse.ExportPart.Metrics.Export](#rqclickhouseexportpartmetricsexport) +* 23 [Export operation security](#export-operation-security) + * 23.1 [RQ.ClickHouse.ExportPart.Security](#rqclickhouseexportpartsecurity) ## Introduction @@ -173,27 +168,6 @@ version: 1.0 * `Azure` - Microsoft Azure Blob Storage (with Hive partitioning) * `GCS` - Google Cloud Storage (with Hive partitioning) -## Destination setup and file management - -### RQ.ClickHouse.ExportPart.DestinationSetup -version: 1.0 - -[ClickHouse] SHALL handle destination setup and file management by: -* Creating appropriate import sinks for destination storage systems -* Generating unique file names in the format `{part_name}_{checksum_hex}` to avoid conflicts -* Allowing destination storage to determine the final file path based on Hive partitioning -* Creating files in the destination storage that users can observe and access -* Providing the final destination file path in the `system.part_log` table for monitoring - -## Export data preparation - -### RQ.ClickHouse.ExportPart.DataPreparation -version: 1.0 - -[ClickHouse] SHALL prepare data for export by: -* Automatically selecting all physical columns from source table metadata -* Extracting partition key values for proper Hive partitioning in destination - ## Schema compatibility ### RQ.ClickHouse.ExportPart.SchemaCompatibility @@ -440,16 +414,10 @@ version: 1.0 [ClickHouse] SHALL support the `max_exports_bandwidth_for_server` server setting to limit the maximum read speed of all exports on the server in bytes per second, with `0` meaning unlimited bandwidth. The default value SHALL be `0`. This is a server-level setting configured in the server configuration file. -## Monitoring export performance metrics - -### RQ.ClickHouse.ExportPart.Events +### RQ.ClickHouse.ExportPart.ServerSettings.BackgroundMovePoolSize version: 1.0 -[ClickHouse] SHALL provide the following export-related events in the `system.events` table: -* `PartsExports` - Number of successful part exports -* `PartsExportFailures` - Number of failed part exports -* `PartsExportDuplicated` - Number of part exports that failed because target already exists -* `PartsExportTotalMilliseconds` - Length of total time taken for parts to export +[ClickHouse] SHALL support the `background_move_pool_size` server setting to control the maximum number of threads that will be used for executing export operations in the background. The default value SHALL be `8`. This is a server-level setting configured in the server configuration file. 
### RQ.ClickHouse.ExportPart.Metrics.Export version: 1.0 diff --git a/s3/requirements/export_part.py b/s3/requirements/export_part.py index 9fc23bf81..69f8e25a5 100644 --- a/s3/requirements/export_part.py +++ b/s3/requirements/export_part.py @@ -181,45 +181,6 @@ num="8.1", ) -RQ_ClickHouse_ExportPart_DestinationSetup = Requirement( - name="RQ.ClickHouse.ExportPart.DestinationSetup", - version="1.0", - priority=None, - group=None, - type=None, - uid=None, - description=( - "[ClickHouse] SHALL handle destination setup and file management by:\n" - "* Creating appropriate import sinks for destination storage systems\n" - "* Generating unique file names in the format `{part_name}_{checksum_hex}` to avoid conflicts\n" - "* Allowing destination storage to determine the final file path based on Hive partitioning\n" - "* Creating files in the destination storage that users can observe and access\n" - "* Providing the final destination file path in the `system.part_log` table for monitoring\n" - "\n" - ), - link=None, - level=2, - num="9.1", -) - -RQ_ClickHouse_ExportPart_DataPreparation = Requirement( - name="RQ.ClickHouse.ExportPart.DataPreparation", - version="1.0", - priority=None, - group=None, - type=None, - uid=None, - description=( - "[ClickHouse] SHALL prepare data for export by:\n" - "* Automatically selecting all physical columns from source table metadata\n" - "* Extracting partition key values for proper Hive partitioning in destination\n" - "\n" - ), - link=None, - level=2, - num="10.1", -) - RQ_ClickHouse_ExportPart_SchemaCompatibility = Requirement( name="RQ.ClickHouse.ExportPart.SchemaCompatibility", version="1.0", @@ -237,7 +198,7 @@ ), link=None, level=2, - num="11.1", + num="9.1", ) RQ_ClickHouse_ExportPart_PartitionKeyTypes = Requirement( @@ -264,7 +225,7 @@ ), link=None, level=2, - num="12.1", + num="10.1", ) RQ_ClickHouse_ExportPart_PartTypes = Requirement( @@ -287,7 +248,7 @@ ), link=None, level=2, - num="13.1", + num="11.1", ) RQ_ClickHouse_ExportPart_SchemaChangeIsolation = Requirement( @@ -307,7 +268,7 @@ ), link=None, level=2, - num="13.2", + num="11.2", ) RQ_ClickHouse_ExportPart_LargeParts = Requirement( @@ -327,7 +288,7 @@ ), link=None, level=2, - num="13.3", + num="11.3", ) RQ_ClickHouse_ExportPart_FailureHandling = Requirement( @@ -348,7 +309,7 @@ ), link=None, level=2, - num="14.1", + num="12.1", ) RQ_ClickHouse_ExportPart_NetworkResilience_PacketIssues = Requirement( @@ -371,7 +332,7 @@ ), link=None, level=2, - num="15.1", + num="13.1", ) RQ_ClickHouse_ExportPart_NetworkResilience_DestinationInterruption = Requirement( @@ -392,7 +353,7 @@ ), link=None, level=2, - num="15.2", + num="13.2", ) RQ_ClickHouse_ExportPart_NetworkResilience_NodeInterruption = Requirement( @@ -413,7 +374,7 @@ ), link=None, level=2, - num="15.3", + num="13.3", ) RQ_ClickHouse_ExportPart_Restrictions_SameTable = Requirement( @@ -432,7 +393,7 @@ ), link=None, level=3, - num="16.1.1", + num="14.1.1", ) RQ_ClickHouse_ExportPart_Restrictions_DestinationSupport = Requirement( @@ -453,7 +414,7 @@ ), link=None, level=3, - num="16.2.1", + num="14.2.1", ) RQ_ClickHouse_ExportPart_Restrictions_LocalTable = Requirement( @@ -472,7 +433,7 @@ ), link=None, level=3, - num="16.3.1", + num="14.3.1", ) RQ_ClickHouse_ExportPart_Restrictions_PartitionKey = Requirement( @@ -491,7 +452,7 @@ ), link=None, level=3, - num="16.4.1", + num="14.4.1", ) RQ_ClickHouse_ExportPart_Restrictions_SourcePart = Requirement( @@ -511,7 +472,7 @@ ), link=None, level=3, - num="16.5.1", + num="14.5.1", ) 
RQ_ClickHouse_ExportPart_Concurrency = Requirement( @@ -535,7 +496,7 @@ ), link=None, level=2, - num="17.1", + num="15.1", ) RQ_ClickHouse_ExportPart_Idempotency = Requirement( @@ -555,7 +516,7 @@ ), link=None, level=2, - num="18.1", + num="16.1", ) RQ_ClickHouse_ExportPart_Logging = Requirement( @@ -579,7 +540,7 @@ ), link=None, level=2, - num="19.1", + num="17.1", ) RQ_ClickHouse_ExportPart_SystemTables_Exports = Requirement( @@ -599,7 +560,7 @@ ), link=None, level=2, - num="20.1", + num="18.1", ) RQ_ClickHouse_ExportPart_Settings_AllowExperimental = Requirement( @@ -615,7 +576,7 @@ ), link=None, level=2, - num="21.1", + num="19.1", ) RQ_ClickHouse_ExportPart_Settings_OverwriteFile = Requirement( @@ -631,7 +592,7 @@ ), link=None, level=2, - num="22.1", + num="20.1", ) RQ_ClickHouse_ExportPart_ParallelFormatting = Requirement( @@ -651,7 +612,7 @@ ), link=None, level=2, - num="23.1", + num="21.1", ) RQ_ClickHouse_ExportPart_ServerSettings_MaxBandwidth = Requirement( @@ -667,27 +628,23 @@ ), link=None, level=2, - num="24.1", + num="22.1", ) -RQ_ClickHouse_ExportPart_Events = Requirement( - name="RQ.ClickHouse.ExportPart.Events", +RQ_ClickHouse_ExportPart_ServerSettings_BackgroundMovePoolSize = Requirement( + name="RQ.ClickHouse.ExportPart.ServerSettings.BackgroundMovePoolSize", version="1.0", priority=None, group=None, type=None, uid=None, description=( - "[ClickHouse] SHALL provide the following export-related events in the `system.events` table:\n" - "* `PartsExports` - Number of successful part exports\n" - "* `PartsExportFailures` - Number of failed part exports \n" - "* `PartsExportDuplicated` - Number of part exports that failed because target already exists\n" - "* `PartsExportTotalMilliseconds` - Length of total time taken for parts to export\n" + "[ClickHouse] SHALL support the `background_move_pool_size` server setting to control the maximum number of threads that will be used for executing export operations in the background. The default value SHALL be `8`. 
This is a server-level setting configured in the server configuration file.\n" "\n" ), link=None, level=2, - num="25.1", + num="22.2", ) RQ_ClickHouse_ExportPart_Metrics_Export = Requirement( @@ -703,7 +660,7 @@ ), link=None, level=2, - num="25.2", + num="22.3", ) RQ_ClickHouse_ExportPart_Security = Requirement( @@ -729,7 +686,7 @@ ), link=None, level=2, - num="26.1", + num="23.1", ) SRS_015_ClickHouse_Export_Part_to_S3 = Specification( @@ -765,106 +722,105 @@ Heading(name="RQ.ClickHouse.ExportPart.StoragePolicies", level=2, num="7.1"), Heading(name="Supported destination table engines", level=1, num="8"), Heading(name="RQ.ClickHouse.ExportPart.DestinationEngines", level=2, num="8.1"), - Heading(name="Destination setup and file management", level=1, num="9"), - Heading(name="RQ.ClickHouse.ExportPart.DestinationSetup", level=2, num="9.1"), - Heading(name="Export data preparation", level=1, num="10"), - Heading(name="RQ.ClickHouse.ExportPart.DataPreparation", level=2, num="10.1"), - Heading(name="Schema compatibility", level=1, num="11"), + Heading(name="Schema compatibility", level=1, num="9"), Heading( - name="RQ.ClickHouse.ExportPart.SchemaCompatibility", level=2, num="11.1" + name="RQ.ClickHouse.ExportPart.SchemaCompatibility", level=2, num="9.1" ), - Heading(name="Partition key types support", level=1, num="12"), - Heading(name="RQ.ClickHouse.ExportPart.PartitionKeyTypes", level=2, num="12.1"), - Heading(name="Part types and content support", level=1, num="13"), - Heading(name="RQ.ClickHouse.ExportPart.PartTypes", level=2, num="13.1"), + Heading(name="Partition key types support", level=1, num="10"), + Heading(name="RQ.ClickHouse.ExportPart.PartitionKeyTypes", level=2, num="10.1"), + Heading(name="Part types and content support", level=1, num="11"), + Heading(name="RQ.ClickHouse.ExportPart.PartTypes", level=2, num="11.1"), Heading( - name="RQ.ClickHouse.ExportPart.SchemaChangeIsolation", level=2, num="13.2" + name="RQ.ClickHouse.ExportPart.SchemaChangeIsolation", level=2, num="11.2" ), - Heading(name="RQ.ClickHouse.ExportPart.LargeParts", level=2, num="13.3"), - Heading(name="Export operation failure handling", level=1, num="14"), - Heading(name="RQ.ClickHouse.ExportPart.FailureHandling", level=2, num="14.1"), - Heading(name="Network resilience", level=1, num="15"), + Heading(name="RQ.ClickHouse.ExportPart.LargeParts", level=2, num="11.3"), + Heading(name="Export operation failure handling", level=1, num="12"), + Heading(name="RQ.ClickHouse.ExportPart.FailureHandling", level=2, num="12.1"), + Heading(name="Network resilience", level=1, num="13"), Heading( name="RQ.ClickHouse.ExportPart.NetworkResilience.PacketIssues", level=2, - num="15.1", + num="13.1", ), Heading( name="RQ.ClickHouse.ExportPart.NetworkResilience.DestinationInterruption", level=2, - num="15.2", + num="13.2", ), Heading( name="RQ.ClickHouse.ExportPart.NetworkResilience.NodeInterruption", level=2, - num="15.3", + num="13.3", ), - Heading(name="Export operation restrictions", level=1, num="16"), - Heading(name="Preventing same table exports", level=2, num="16.1"), + Heading(name="Export operation restrictions", level=1, num="14"), + Heading(name="Preventing same table exports", level=2, num="14.1"), Heading( name="RQ.ClickHouse.ExportPart.Restrictions.SameTable", level=3, - num="16.1.1", + num="14.1.1", ), - Heading(name="Destination table compatibility", level=2, num="16.2"), + Heading(name="Destination table compatibility", level=2, num="14.2"), Heading( name="RQ.ClickHouse.ExportPart.Restrictions.DestinationSupport", 
level=3, - num="16.2.1", + num="14.2.1", ), - Heading(name="Local table restriction", level=2, num="16.3"), + Heading(name="Local table restriction", level=2, num="14.3"), Heading( name="RQ.ClickHouse.ExportPart.Restrictions.LocalTable", level=3, - num="16.3.1", + num="14.3.1", ), - Heading(name="Partition key compatibility", level=2, num="16.4"), + Heading(name="Partition key compatibility", level=2, num="14.4"), Heading( name="RQ.ClickHouse.ExportPart.Restrictions.PartitionKey", level=3, - num="16.4.1", + num="14.4.1", ), - Heading(name="Source part availability", level=2, num="16.5"), + Heading(name="Source part availability", level=2, num="14.5"), Heading( name="RQ.ClickHouse.ExportPart.Restrictions.SourcePart", level=3, - num="16.5.1", + num="14.5.1", ), - Heading(name="Export operation concurrency", level=1, num="17"), - Heading(name="RQ.ClickHouse.ExportPart.Concurrency", level=2, num="17.1"), - Heading(name="Export operation idempotency", level=1, num="18"), - Heading(name="RQ.ClickHouse.ExportPart.Idempotency", level=2, num="18.1"), - Heading(name="Export operation logging", level=1, num="19"), - Heading(name="RQ.ClickHouse.ExportPart.Logging", level=2, num="19.1"), - Heading(name="Monitoring export operations", level=1, num="20"), + Heading(name="Export operation concurrency", level=1, num="15"), + Heading(name="RQ.ClickHouse.ExportPart.Concurrency", level=2, num="15.1"), + Heading(name="Export operation idempotency", level=1, num="16"), + Heading(name="RQ.ClickHouse.ExportPart.Idempotency", level=2, num="16.1"), + Heading(name="Export operation logging", level=1, num="17"), + Heading(name="RQ.ClickHouse.ExportPart.Logging", level=2, num="17.1"), + Heading(name="Monitoring export operations", level=1, num="18"), Heading( - name="RQ.ClickHouse.ExportPart.SystemTables.Exports", level=2, num="20.1" + name="RQ.ClickHouse.ExportPart.SystemTables.Exports", level=2, num="18.1" ), - Heading(name="Enabling export functionality", level=1, num="21"), + Heading(name="Enabling export functionality", level=1, num="19"), Heading( name="RQ.ClickHouse.ExportPart.Settings.AllowExperimental", level=2, - num="21.1", + num="19.1", ), - Heading(name="Handling file conflicts during export", level=1, num="22"), + Heading(name="Handling file conflicts during export", level=1, num="20"), Heading( - name="RQ.ClickHouse.ExportPart.Settings.OverwriteFile", level=2, num="22.1" + name="RQ.ClickHouse.ExportPart.Settings.OverwriteFile", level=2, num="20.1" ), - Heading(name="Export operation configuration", level=1, num="23"), + Heading(name="Export operation configuration", level=1, num="21"), Heading( - name="RQ.ClickHouse.ExportPart.ParallelFormatting", level=2, num="23.1" + name="RQ.ClickHouse.ExportPart.ParallelFormatting", level=2, num="21.1" ), - Heading(name="Controlling export performance", level=1, num="24"), + Heading(name="Controlling export performance", level=1, num="22"), Heading( name="RQ.ClickHouse.ExportPart.ServerSettings.MaxBandwidth", level=2, - num="24.1", + num="22.1", ), - Heading(name="Monitoring export performance metrics", level=1, num="25"), - Heading(name="RQ.ClickHouse.ExportPart.Events", level=2, num="25.1"), - Heading(name="RQ.ClickHouse.ExportPart.Metrics.Export", level=2, num="25.2"), - Heading(name="Export operation security", level=1, num="26"), - Heading(name="RQ.ClickHouse.ExportPart.Security", level=2, num="26.1"), + Heading( + name="RQ.ClickHouse.ExportPart.ServerSettings.BackgroundMovePoolSize", + level=2, + num="22.2", + ), + 
Heading(name="RQ.ClickHouse.ExportPart.Metrics.Export", level=2, num="22.3"), + Heading(name="Export operation security", level=1, num="23"), + Heading(name="RQ.ClickHouse.ExportPart.Security", level=2, num="23.1"), ), requirements=( RQ_ClickHouse_ExportPart_S3, @@ -875,8 +831,6 @@ RQ_ClickHouse_ExportPart_SourcePartStorage, RQ_ClickHouse_ExportPart_StoragePolicies, RQ_ClickHouse_ExportPart_DestinationEngines, - RQ_ClickHouse_ExportPart_DestinationSetup, - RQ_ClickHouse_ExportPart_DataPreparation, RQ_ClickHouse_ExportPart_SchemaCompatibility, RQ_ClickHouse_ExportPart_PartitionKeyTypes, RQ_ClickHouse_ExportPart_PartTypes, @@ -899,7 +853,7 @@ RQ_ClickHouse_ExportPart_Settings_OverwriteFile, RQ_ClickHouse_ExportPart_ParallelFormatting, RQ_ClickHouse_ExportPart_ServerSettings_MaxBandwidth, - RQ_ClickHouse_ExportPart_Events, + RQ_ClickHouse_ExportPart_ServerSettings_BackgroundMovePoolSize, RQ_ClickHouse_ExportPart_Metrics_Export, RQ_ClickHouse_ExportPart_Security, ), @@ -925,56 +879,51 @@ * 7.1 [RQ.ClickHouse.ExportPart.StoragePolicies](#rqclickhouseexportpartstoragepolicies) * 8 [Supported destination table engines](#supported-destination-table-engines) * 8.1 [RQ.ClickHouse.ExportPart.DestinationEngines](#rqclickhouseexportpartdestinationengines) -* 9 [Destination setup and file management](#destination-setup-and-file-management) - * 9.1 [RQ.ClickHouse.ExportPart.DestinationSetup](#rqclickhouseexportpartdestinationsetup) -* 10 [Export data preparation](#export-data-preparation) - * 10.1 [RQ.ClickHouse.ExportPart.DataPreparation](#rqclickhouseexportpartdatapreparation) -* 11 [Schema compatibility](#schema-compatibility) - * 11.1 [RQ.ClickHouse.ExportPart.SchemaCompatibility](#rqclickhouseexportpartschemacompatibility) -* 12 [Partition key types support](#partition-key-types-support) - * 12.1 [RQ.ClickHouse.ExportPart.PartitionKeyTypes](#rqclickhouseexportpartpartitionkeytypes) -* 13 [Part types and content support](#part-types-and-content-support) - * 13.1 [RQ.ClickHouse.ExportPart.PartTypes](#rqclickhouseexportpartparttypes) - * 13.2 [RQ.ClickHouse.ExportPart.SchemaChangeIsolation](#rqclickhouseexportpartschemachangeisolation) - * 13.3 [RQ.ClickHouse.ExportPart.LargeParts](#rqclickhouseexportpartlargeparts) -* 14 [Export operation failure handling](#export-operation-failure-handling) - * 14.1 [RQ.ClickHouse.ExportPart.FailureHandling](#rqclickhouseexportpartfailurehandling) -* 15 [Network resilience](#network-resilience) - * 15.1 [RQ.ClickHouse.ExportPart.NetworkResilience.PacketIssues](#rqclickhouseexportpartnetworkresiliencepacketissues) - * 15.2 [RQ.ClickHouse.ExportPart.NetworkResilience.DestinationInterruption](#rqclickhouseexportpartnetworkresiliencedestinationinterruption) - * 15.3 [RQ.ClickHouse.ExportPart.NetworkResilience.NodeInterruption](#rqclickhouseexportpartnetworkresiliencenodeinterruption) -* 16 [Export operation restrictions](#export-operation-restrictions) - * 16.1 [Preventing same table exports](#preventing-same-table-exports) - * 16.1.1 [RQ.ClickHouse.ExportPart.Restrictions.SameTable](#rqclickhouseexportpartrestrictionssametable) - * 16.2 [Destination table compatibility](#destination-table-compatibility) - * 16.2.1 [RQ.ClickHouse.ExportPart.Restrictions.DestinationSupport](#rqclickhouseexportpartrestrictionsdestinationsupport) - * 16.3 [Local table restriction](#local-table-restriction) - * 16.3.1 [RQ.ClickHouse.ExportPart.Restrictions.LocalTable](#rqclickhouseexportpartrestrictionslocaltable) - * 16.4 [Partition key compatibility](#partition-key-compatibility) - * 
16.4.1 [RQ.ClickHouse.ExportPart.Restrictions.PartitionKey](#rqclickhouseexportpartrestrictionspartitionkey) - * 16.5 [Source part availability](#source-part-availability) - * 16.5.1 [RQ.ClickHouse.ExportPart.Restrictions.SourcePart](#rqclickhouseexportpartrestrictionssourcepart) -* 17 [Export operation concurrency](#export-operation-concurrency) - * 17.1 [RQ.ClickHouse.ExportPart.Concurrency](#rqclickhouseexportpartconcurrency) -* 18 [Export operation idempotency](#export-operation-idempotency) - * 18.1 [RQ.ClickHouse.ExportPart.Idempotency](#rqclickhouseexportpartidempotency) -* 19 [Export operation logging](#export-operation-logging) - * 19.1 [RQ.ClickHouse.ExportPart.Logging](#rqclickhouseexportpartlogging) -* 20 [Monitoring export operations](#monitoring-export-operations) - * 20.1 [RQ.ClickHouse.ExportPart.SystemTables.Exports](#rqclickhouseexportpartsystemtablesexports) -* 21 [Enabling export functionality](#enabling-export-functionality) - * 21.1 [RQ.ClickHouse.ExportPart.Settings.AllowExperimental](#rqclickhouseexportpartsettingsallowexperimental) -* 22 [Handling file conflicts during export](#handling-file-conflicts-during-export) - * 22.1 [RQ.ClickHouse.ExportPart.Settings.OverwriteFile](#rqclickhouseexportpartsettingsoverwritefile) -* 23 [Export operation configuration](#export-operation-configuration) - * 23.1 [RQ.ClickHouse.ExportPart.ParallelFormatting](#rqclickhouseexportpartparallelformatting) -* 24 [Controlling export performance](#controlling-export-performance) - * 24.1 [RQ.ClickHouse.ExportPart.ServerSettings.MaxBandwidth](#rqclickhouseexportpartserversettingsmaxbandwidth) -* 25 [Monitoring export performance metrics](#monitoring-export-performance-metrics) - * 25.1 [RQ.ClickHouse.ExportPart.Events](#rqclickhouseexportpartevents) - * 25.2 [RQ.ClickHouse.ExportPart.Metrics.Export](#rqclickhouseexportpartmetricsexport) -* 26 [Export operation security](#export-operation-security) - * 26.1 [RQ.ClickHouse.ExportPart.Security](#rqclickhouseexportpartsecurity) +* 9 [Schema compatibility](#schema-compatibility) + * 9.1 [RQ.ClickHouse.ExportPart.SchemaCompatibility](#rqclickhouseexportpartschemacompatibility) +* 10 [Partition key types support](#partition-key-types-support) + * 10.1 [RQ.ClickHouse.ExportPart.PartitionKeyTypes](#rqclickhouseexportpartpartitionkeytypes) +* 11 [Part types and content support](#part-types-and-content-support) + * 11.1 [RQ.ClickHouse.ExportPart.PartTypes](#rqclickhouseexportpartparttypes) + * 11.2 [RQ.ClickHouse.ExportPart.SchemaChangeIsolation](#rqclickhouseexportpartschemachangeisolation) + * 11.3 [RQ.ClickHouse.ExportPart.LargeParts](#rqclickhouseexportpartlargeparts) +* 12 [Export operation failure handling](#export-operation-failure-handling) + * 12.1 [RQ.ClickHouse.ExportPart.FailureHandling](#rqclickhouseexportpartfailurehandling) +* 13 [Network resilience](#network-resilience) + * 13.1 [RQ.ClickHouse.ExportPart.NetworkResilience.PacketIssues](#rqclickhouseexportpartnetworkresiliencepacketissues) + * 13.2 [RQ.ClickHouse.ExportPart.NetworkResilience.DestinationInterruption](#rqclickhouseexportpartnetworkresiliencedestinationinterruption) + * 13.3 [RQ.ClickHouse.ExportPart.NetworkResilience.NodeInterruption](#rqclickhouseexportpartnetworkresiliencenodeinterruption) +* 14 [Export operation restrictions](#export-operation-restrictions) + * 14.1 [Preventing same table exports](#preventing-same-table-exports) + * 14.1.1 [RQ.ClickHouse.ExportPart.Restrictions.SameTable](#rqclickhouseexportpartrestrictionssametable) + * 14.2 [Destination table 
compatibility](#destination-table-compatibility) + * 14.2.1 [RQ.ClickHouse.ExportPart.Restrictions.DestinationSupport](#rqclickhouseexportpartrestrictionsdestinationsupport) + * 14.3 [Local table restriction](#local-table-restriction) + * 14.3.1 [RQ.ClickHouse.ExportPart.Restrictions.LocalTable](#rqclickhouseexportpartrestrictionslocaltable) + * 14.4 [Partition key compatibility](#partition-key-compatibility) + * 14.4.1 [RQ.ClickHouse.ExportPart.Restrictions.PartitionKey](#rqclickhouseexportpartrestrictionspartitionkey) + * 14.5 [Source part availability](#source-part-availability) + * 14.5.1 [RQ.ClickHouse.ExportPart.Restrictions.SourcePart](#rqclickhouseexportpartrestrictionssourcepart) +* 15 [Export operation concurrency](#export-operation-concurrency) + * 15.1 [RQ.ClickHouse.ExportPart.Concurrency](#rqclickhouseexportpartconcurrency) +* 16 [Export operation idempotency](#export-operation-idempotency) + * 16.1 [RQ.ClickHouse.ExportPart.Idempotency](#rqclickhouseexportpartidempotency) +* 17 [Export operation logging](#export-operation-logging) + * 17.1 [RQ.ClickHouse.ExportPart.Logging](#rqclickhouseexportpartlogging) +* 18 [Monitoring export operations](#monitoring-export-operations) + * 18.1 [RQ.ClickHouse.ExportPart.SystemTables.Exports](#rqclickhouseexportpartsystemtablesexports) +* 19 [Enabling export functionality](#enabling-export-functionality) + * 19.1 [RQ.ClickHouse.ExportPart.Settings.AllowExperimental](#rqclickhouseexportpartsettingsallowexperimental) +* 20 [Handling file conflicts during export](#handling-file-conflicts-during-export) + * 20.1 [RQ.ClickHouse.ExportPart.Settings.OverwriteFile](#rqclickhouseexportpartsettingsoverwritefile) +* 21 [Export operation configuration](#export-operation-configuration) + * 21.1 [RQ.ClickHouse.ExportPart.ParallelFormatting](#rqclickhouseexportpartparallelformatting) +* 22 [Controlling export performance](#controlling-export-performance) + * 22.1 [RQ.ClickHouse.ExportPart.ServerSettings.MaxBandwidth](#rqclickhouseexportpartserversettingsmaxbandwidth) + * 22.2 [RQ.ClickHouse.ExportPart.ServerSettings.BackgroundMovePoolSize](#rqclickhouseexportpartserversettingsbackgroundmovepoolsize) + * 22.3 [RQ.ClickHouse.ExportPart.Metrics.Export](#rqclickhouseexportpartmetricsexport) +* 23 [Export operation security](#export-operation-security) + * 23.1 [RQ.ClickHouse.ExportPart.Security](#rqclickhouseexportpartsecurity) ## Introduction @@ -1079,27 +1028,6 @@ * `Azure` - Microsoft Azure Blob Storage (with Hive partitioning) * `GCS` - Google Cloud Storage (with Hive partitioning) -## Destination setup and file management - -### RQ.ClickHouse.ExportPart.DestinationSetup -version: 1.0 - -[ClickHouse] SHALL handle destination setup and file management by: -* Creating appropriate import sinks for destination storage systems -* Generating unique file names in the format `{part_name}_{checksum_hex}` to avoid conflicts -* Allowing destination storage to determine the final file path based on Hive partitioning -* Creating files in the destination storage that users can observe and access -* Providing the final destination file path in the `system.part_log` table for monitoring - -## Export data preparation - -### RQ.ClickHouse.ExportPart.DataPreparation -version: 1.0 - -[ClickHouse] SHALL prepare data for export by: -* Automatically selecting all physical columns from source table metadata -* Extracting partition key values for proper Hive partitioning in destination - ## Schema compatibility ### RQ.ClickHouse.ExportPart.SchemaCompatibility @@ -1346,16 
+1274,10 @@ [ClickHouse] SHALL support the `max_exports_bandwidth_for_server` server setting to limit the maximum read speed of all exports on the server in bytes per second, with `0` meaning unlimited bandwidth. The default value SHALL be `0`. This is a server-level setting configured in the server configuration file. -## Monitoring export performance metrics - -### RQ.ClickHouse.ExportPart.Events +### RQ.ClickHouse.ExportPart.ServerSettings.BackgroundMovePoolSize version: 1.0 -[ClickHouse] SHALL provide the following export-related events in the `system.events` table: -* `PartsExports` - Number of successful part exports -* `PartsExportFailures` - Number of failed part exports -* `PartsExportDuplicated` - Number of part exports that failed because target already exists -* `PartsExportTotalMilliseconds` - Length of total time taken for parts to export +[ClickHouse] SHALL support the `background_move_pool_size` server setting to control the maximum number of threads that will be used for executing export operations in the background. The default value SHALL be `8`. This is a server-level setting configured in the server configuration file. ### RQ.ClickHouse.ExportPart.Metrics.Export version: 1.0 diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index 0d69d2ae5..49054c420 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -123,6 +123,7 @@ def system_exports_logging(self): @TestScenario +@Requirements(RQ_ClickHouse_ExportPart_ServerSettings_BackgroundMovePoolSize("1.0")) def background_move_pool_size(self): pass From 971f29a88b3a2c5b71a4385d4b6ee3ef04e92fbd Mon Sep 17 00:00:00 2001 From: selfeer Date: Mon, 10 Nov 2025 16:31:25 +0400 Subject: [PATCH 91/99] add requirements --- s3/requirements/export_partition.md | 579 +++++++++ s3/requirements/export_partition.py | 1736 +++++++++++++++++++++++++++ 2 files changed, 2315 insertions(+) create mode 100644 s3/requirements/export_partition.md create mode 100644 s3/requirements/export_partition.py diff --git a/s3/requirements/export_partition.md b/s3/requirements/export_partition.md new file mode 100644 index 000000000..a671fcc73 --- /dev/null +++ b/s3/requirements/export_partition.md @@ -0,0 +1,579 @@ +# SRS-016 ClickHouse Export Partition to S3 +# Software Requirements Specification + +## Table of Contents + +* 1 [Introduction](#introduction) +* 2 [Exporting Partitions to S3](#exporting-partitions-to-s3) + * 2.1 [RQ.ClickHouse.ExportPartition.S3](#rqclickhouseexportpartitions3) + * 2.2 [RQ.ClickHouse.ExportPartition.EmptyPartition](#rqclickhouseexportpartitionemptypartition) +* 3 [SQL command support](#sql-command-support) + * 3.1 [RQ.ClickHouse.ExportPartition.SQLCommand](#rqclickhouseexportpartitionsqlcommand) +* 4 [Supported source table engines](#supported-source-table-engines) + * 4.1 [RQ.ClickHouse.ExportPartition.SourceEngines](#rqclickhouseexportpartitionsourceengines) +* 5 [Cluster and node support](#cluster-and-node-support) + * 5.1 [RQ.ClickHouse.ExportPartition.ClustersNodes](#rqclickhouseexportpartitionclustersnodes) +* 6 [Supported source part storage types](#supported-source-part-storage-types) + * 6.1 [RQ.ClickHouse.ExportPartition.SourcePartStorage](#rqclickhouseexportpartitionsourcepartstorage) +* 7 [Storage policies and volumes](#storage-policies-and-volumes) + * 7.1 [RQ.ClickHouse.ExportPartition.StoragePolicies](#rqclickhouseexportpartitionstoragepolicies) +* 8 [Supported destination table 
engines](#supported-destination-table-engines) + * 8.1 [RQ.ClickHouse.ExportPartition.DestinationEngines](#rqclickhouseexportpartitiondestinationengines) +* 9 [Schema compatibility](#schema-compatibility) + * 9.1 [RQ.ClickHouse.ExportPartition.SchemaCompatibility](#rqclickhouseexportpartitionschemacompatibility) +* 10 [Partition key types support](#partition-key-types-support) + * 10.1 [RQ.ClickHouse.ExportPartition.PartitionKeyTypes](#rqclickhouseexportpartitionpartitionkeytypes) +* 11 [Partition content support](#partition-content-support) + * 11.1 [RQ.ClickHouse.ExportPartition.PartitionContent](#rqclickhouseexportpartitionpartitioncontent) + * 11.2 [RQ.ClickHouse.ExportPartition.SchemaChangeIsolation](#rqclickhouseexportpartitionschemachangeisolation) + * 11.3 [RQ.ClickHouse.ExportPartition.LargePartitions](#rqclickhouseexportpartitionlargepartitions) +* 12 [Export operation failure handling](#export-operation-failure-handling) + * 12.1 [RQ.ClickHouse.ExportPartition.RetryMechanism](#rqclickhouseexportpartitionretrymechanism) + * 12.2 [RQ.ClickHouse.ExportPartition.Settings.MaxRetries](#rqclickhouseexportpartitionsettingsmaxretries) + * 12.3 [RQ.ClickHouse.ExportPartition.ResumeAfterFailure](#rqclickhouseexportpartitionresumeafterfailure) + * 12.4 [RQ.ClickHouse.ExportPartition.PartialProgress](#rqclickhouseexportpartitionpartialprogress) + * 12.5 [RQ.ClickHouse.ExportPartition.Cleanup](#rqclickhouseexportpartitioncleanup) + * 12.6 [RQ.ClickHouse.ExportPartition.Settings.ManifestTTL](#rqclickhouseexportpartitionsettingsmanifestttl) +* 13 [Network resilience](#network-resilience) + * 13.1 [RQ.ClickHouse.ExportPartition.NetworkResilience.PacketIssues](#rqclickhouseexportpartitionnetworkresiliencepacketissues) + * 13.2 [RQ.ClickHouse.ExportPartition.NetworkResilience.DestinationInterruption](#rqclickhouseexportpartitionnetworkresiliencedestinationinterruption) + * 13.3 [RQ.ClickHouse.ExportPartition.NetworkResilience.NodeInterruption](#rqclickhouseexportpartitionnetworkresiliencenodeinterruption) +* 14 [Export operation restrictions](#export-operation-restrictions) + * 14.1 [Preventing same table exports](#preventing-same-table-exports) + * 14.1.1 [RQ.ClickHouse.ExportPartition.Restrictions.SameTable](#rqclickhouseexportpartitionrestrictionssametable) + * 14.2 [Destination table compatibility](#destination-table-compatibility) + * 14.2.1 [RQ.ClickHouse.ExportPartition.Restrictions.DestinationSupport](#rqclickhouseexportpartitionrestrictionsdestinationsupport) + * 14.3 [Local table restriction](#local-table-restriction) + * 14.3.1 [RQ.ClickHouse.ExportPartition.Restrictions.LocalTable](#rqclickhouseexportpartitionrestrictionslocaltable) + * 14.4 [Partition key compatibility](#partition-key-compatibility) + * 14.4.1 [RQ.ClickHouse.ExportPartition.Restrictions.PartitionKey](#rqclickhouseexportpartitionrestrictionspartitionkey) + * 14.5 [Source partition availability](#source-partition-availability) + * 14.5.1 [RQ.ClickHouse.ExportPartition.Restrictions.SourcePartition](#rqclickhouseexportpartitionrestrictionssourcepartition) +* 15 [Export operation concurrency](#export-operation-concurrency) + * 15.1 [RQ.ClickHouse.ExportPartition.Concurrency](#rqclickhouseexportpartitionconcurrency) +* 16 [Export operation idempotency](#export-operation-idempotency) + * 16.1 [RQ.ClickHouse.ExportPartition.Idempotency](#rqclickhouseexportpartitionidempotency) + * 16.2 [RQ.ClickHouse.ExportPartition.Settings.ForceExport](#rqclickhouseexportpartitionsettingsforceexport) +* 17 [Export operation 
logging](#export-operation-logging) + * 17.1 [RQ.ClickHouse.ExportPartition.Logging](#rqclickhouseexportpartitionlogging) +* 18 [Monitoring export operations](#monitoring-export-operations) + * 18.1 [RQ.ClickHouse.ExportPartition.SystemTables.Exports](#rqclickhouseexportpartitionsystemtablesexports) +* 19 [Enabling export functionality](#enabling-export-functionality) + * 19.1 [RQ.ClickHouse.ExportPartition.Settings.AllowExperimental](#rqclickhouseexportpartitionsettingsallowexperimental) +* 20 [Handling file conflicts during export](#handling-file-conflicts-during-export) + * 20.1 [RQ.ClickHouse.ExportPartition.Settings.OverwriteFile](#rqclickhouseexportpartitionsettingsoverwritefile) +* 21 [Export operation configuration](#export-operation-configuration) + * 21.1 [RQ.ClickHouse.ExportPartition.ParallelFormatting](#rqclickhouseexportpartitionparallelformatting) +* 22 [Controlling export performance](#controlling-export-performance) + * 22.1 [RQ.ClickHouse.ExportPartition.ServerSettings.MaxBandwidth](#rqclickhouseexportpartitionserversettingsmaxbandwidth) + * 22.2 [RQ.ClickHouse.ExportPartition.ServerSettings.BackgroundMovePoolSize](#rqclickhouseexportpartitionserversettingsbackgroundmovepoolsize) + * 22.3 [RQ.ClickHouse.ExportPartition.Metrics.Export](#rqclickhouseexportpartitionmetricsexport) +* 23 [Export operation security](#export-operation-security) + * 23.1 [RQ.ClickHouse.ExportPartition.Security.RBAC](#rqclickhouseexportpartitionsecurityrbac) + * 23.2 [RQ.ClickHouse.ExportPartition.Security.DataEncryption](#rqclickhouseexportpartitionsecuritydataencryption) + * 23.3 [RQ.ClickHouse.ExportPartition.Security.Network](#rqclickhouseexportpartitionsecuritynetwork) + * 23.4 [RQ.ClickHouse.ExportPartition.Security.CredentialManagement](#rqclickhouseexportpartitionsecuritycredentialmanagement) + +## Introduction + +This specification defines requirements for exporting partitions (all parts within a partition) from ReplicatedMergeTree tables to S3-compatible object storage. This feature enables users to export entire partitions containing multiple data parts across cluster nodes. + +## Exporting Partitions to S3 + +### RQ.ClickHouse.ExportPartition.S3 +version: 1.0 + +[ClickHouse] SHALL support exporting partitions (all parts within a partition) from ReplicatedMergeTree engine tables to S3 object storage. The export operation SHALL export all parts that belong to the specified partition ID, ensuring complete partition data is transferred to the destination. 
+ +### RQ.ClickHouse.ExportPartition.EmptyPartition +version: 1.0 + +[ClickHouse] SHALL support exporting from empty partitions by: +* Completing export operations successfully when the specified partition contains no parts +* Resulting in an empty destination partition when exporting from an empty source partition +* Not creating any files in destination storage when there are no parts to export in the partition +* Handling empty partitions gracefully without errors + +## SQL command support + +### RQ.ClickHouse.ExportPartition.SQLCommand +version: 1.0 + +[ClickHouse] SHALL support the following SQL command syntax for exporting partitions from ReplicatedMergeTree tables to object storage tables: + +```sql +ALTER TABLE [database.]source_table_name +EXPORT PARTITION ID 'partition_id' +TO TABLE [database.]destination_table_name +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + +**Parameters:** +- `source_table_name`: Name of the source ReplicatedMergeTree table +- `partition_id`: The partition ID to export (string literal), which identifies all parts belonging to that partition +- `destination_table_name`: Name of the destination object storage table +- `allow_experimental_export_merge_tree_part`: Setting that must be set to `1` to enable this experimental feature + +This command allows users to export entire partitions in a single operation, which is more efficient than exporting individual parts and ensures all data for a partition is exported together. + +## Supported source table engines + +### RQ.ClickHouse.ExportPartition.SourceEngines +version: 1.0 + +[ClickHouse] SHALL support exporting partitions from the following source table engines: +* `ReplicatedMergeTree` - Replicated MergeTree engine (primary use case) +* `ReplicatedSummingMergeTree` - Replicated MergeTree with automatic summation +* `ReplicatedAggregatingMergeTree` - Replicated MergeTree with pre-aggregated data +* `ReplicatedCollapsingMergeTree` - Replicated MergeTree with row versioning +* `ReplicatedVersionedCollapsingMergeTree` - Replicated CollapsingMergeTree with version tracking +* `ReplicatedGraphiteMergeTree` - Replicated MergeTree optimized for Graphite data +* All other ReplicatedMergeTree family engines + +Export partition functionality manages export operations across multiple replicas in a cluster, ensuring that parts are exported correctly and avoiding conflicts. + +## Cluster and node support + +### RQ.ClickHouse.ExportPartition.ClustersNodes +version: 1.0 + +[ClickHouse] SHALL support exporting partitions from multiple nodes in a ReplicatedMergeTree cluster to the same destination storage, ensuring that: +* Each replica in the cluster can independently export parts from the partition that it owns locally +* All parts within a partition are exported exactly once, even when distributed across multiple replicas +* Exported data from different replicas is correctly aggregated in the destination storage +* All nodes in the cluster can read the same exported partition data from the destination +* Export operations continue to make progress even if some replicas are temporarily unavailable + +In a replicated cluster, different parts of the same partition may exist on different replicas. The system must coordinate exports across all replicas to ensure complete partition export without duplication. 
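+For illustration, a concrete instance of the `EXPORT PARTITION ID` command described above might look like the sketch below. The database, table names, and partition ID are placeholders; the source and destination tables are assumed to already exist with matching schemas and the same partition key (a compatible pair of table definitions is sketched after the schema compatibility section below).
+
+```sql
+-- Hypothetical example: export all parts of partition '42' from an assumed
+-- ReplicatedMergeTree table to an assumed S3-backed destination table.
+ALTER TABLE default.events
+EXPORT PARTITION ID '42'
+TO TABLE default.events_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1;
+```
+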
+ +## Supported source part storage types + +### RQ.ClickHouse.ExportPartition.SourcePartStorage +version: 1.0 + +[ClickHouse] SHALL support exporting partitions regardless of the underlying storage type where the source parts are stored, including: +* **Local Disks**: Parts stored on local filesystem +* **S3/Object Storage**: Parts stored on S3 or S3-compatible object storage +* **Encrypted Disks**: Parts stored on encrypted disks (disk-level encryption) +* **Cached Disks**: Parts stored with filesystem cache enabled +* **Remote Disks**: Parts stored on HDFS, Azure Blob Storage, or Google Cloud Storage +* **Tiered Storage**: Parts stored across multiple storage tiers (hot/cold) +* **Zero-Copy Replication Disks**: Parts stored with zero-copy replication enabled + +Users should be able to export partitions regardless of where the source data is physically stored, providing flexibility in storage configurations. + +## Storage policies and volumes + +### RQ.ClickHouse.ExportPartition.StoragePolicies +version: 1.0 + +[ClickHouse] SHALL support exporting partitions from tables using different storage policies, where storage policies are composed of volumes which are composed of disks, including: +* **JBOD Volumes**: Just a Bunch Of Disks volumes with multiple disks +* **External Volumes**: Volumes using external storage systems +* **Tiered Storage Policies**: Storage policies with multiple volumes for hot/cold data tiers +* **Custom Storage Policies**: Any storage policy configuration composed of volumes and disks +* Exporting all parts in a partition regardless of which volume or disk within the storage policy contains each part +* Maintaining data integrity when exporting from parts stored on any volume or disk in the storage policy + +Users may have partitions with parts distributed across different storage tiers or volumes, and the export should handle all parts regardless of their storage location. + +## Supported destination table engines + +### RQ.ClickHouse.ExportPartition.DestinationEngines +version: 1.0 + +[ClickHouse] SHALL support exporting to destination tables that: +* Support object storage engines including: + * `S3` - Amazon S3 and S3-compatible storage + * `StorageObjectStorage` - Generic object storage interface + * `HDFS` - Hadoop Distributed File System (with Hive partitioning) + * `Azure` - Microsoft Azure Blob Storage (with Hive partitioning) + * `GCS` - Google Cloud Storage (with Hive partitioning) + +Export partition is designed to move data from local or replicated storage to object storage systems for long-term storage, analytics, or data sharing purposes. + +## Schema compatibility + +### RQ.ClickHouse.ExportPartition.SchemaCompatibility +version: 1.0 + +[ClickHouse] SHALL require source and destination tables to have compatible schemas for successful export operations: +* Identical physical column schemas between source and destination +* The same partition key expression in both tables +* Compatible data types for all columns +* Matching column order and names + +Schema compatibility ensures that exported data can be correctly read from the destination table without data loss or corruption. 
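+As an illustration of a compatible pair, the sketch below shows an assumed source/destination layout: identical columns, the same `PARTITION BY` expression, and an S3-backed destination using Hive-style partitioning. The ZooKeeper path, bucket URL, credentials, and the exact S3 engine argument list (including how `partition_strategy = 'hive'` is supplied) are placeholders and assumptions, not normative syntax; the normative requirements are the matching column set and the identical partition key.
+
+```sql
+-- Assumed source table (ReplicatedMergeTree family).
+CREATE TABLE default.events
+(
+    region_id UInt32,
+    user_id   UInt64,
+    payload   String
+)
+ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/events', '{replica}')
+PARTITION BY region_id
+ORDER BY (region_id, user_id);
+
+-- Assumed destination table: same columns, same partition key,
+-- S3 object storage with Hive-style partitioning (engine arguments are placeholders).
+CREATE TABLE default.events_s3
+(
+    region_id UInt32,
+    user_id   UInt64,
+    payload   String
+)
+ENGINE = S3(
+    'https://s3.example.com/bucket/events/',
+    'ACCESS_KEY_ID',
+    'SECRET_ACCESS_KEY',
+    format = 'Parquet',
+    partition_strategy = 'hive'
+)
+PARTITION BY region_id;
+```
+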
+ +## Partition key types support + +### RQ.ClickHouse.ExportPartition.PartitionKeyTypes +version: 1.0 + +[ClickHouse] SHALL support export operations for tables with partition key types that are compatible with Hive partitioning, as shown in the following table: + +| Partition Key Type | Supported | Examples | Notes | +|-------------------------|-----------|--------------------------------------------------------------------------|--------------------------------| +| **Integer Types** | ✅ Yes | `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64` | All integer types supported | +| **Date/DateTime Types** | ✅ Yes | `Date`, `Date32`, `DateTime`, `DateTime64` | All date/time types supported | +| **String Types** | ✅ Yes | `String`, `FixedString` | All string types supported | +| **No Partition Key** | ✅ Yes | Tables without `PARTITION BY` clause | Unpartitioned tables supported | + +[ClickHouse] SHALL automatically extract partition values from source parts and use them to create proper Hive partitioning structure in destination storage, but only for partition key types that are compatible with Hive partitioning requirements. + +[ClickHouse] SHALL require destination tables to support Hive partitioning, which limits the supported partition key types to Integer, Date/DateTime, and String types. Complex expressions that result in unsupported types are not supported for export operations. + +Hive partitioning is a standard way to organize data in object storage systems, making exported data compatible with various analytics tools and systems. + +## Partition content support + +### RQ.ClickHouse.ExportPartition.PartitionContent +version: 1.0 + +[ClickHouse] SHALL support export operations for partitions containing all valid MergeTree part types and their contents, including: + +| Part Type | Supported | Description | Special Features | +|-------------------|-----------|--------------------------------------------------------------|--------------------------------| +| **Wide Parts** | ✅ Yes | Data of each column stored in separate files with marks | Standard format for most parts | +| **Compact Parts** | ✅ Yes | All column data stored in single file with single marks file | Optimized for small parts | + +[ClickHouse] SHALL export all parts within the specified partition, regardless of their type. The system SHALL automatically apply lightweight delete masks during export to ensure only non-deleted rows are exported, and SHALL maintain data integrity in the destination storage. + +Partitions may contain a mix of different part types, and the export must handle all of them correctly to ensure complete partition export. + +### RQ.ClickHouse.ExportPartition.SchemaChangeIsolation +version: 1.0 + +[ClickHouse] SHALL ensure exported partition data is isolated from subsequent schema changes by: +* Preserving exported data exactly as it was at the time of export +* Not being affected by schema changes (column drops, renames, type changes) that occur after export +* Maintaining data integrity in destination storage regardless of mutations applied to the source table after export +* Ensuring exported data reflects the source table state at the time of export, not the current state + +Once a partition is exported, the exported data should remain stable and not be affected by future changes to the source table schema. 
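+For reference, the partition IDs that can be passed to `EXPORT PARTITION ID` can be listed from the standard `system.parts` table; the database and table names below are the assumed example names used in the other sketches in this document.
+
+```sql
+-- List active partitions of the assumed source table together with
+-- their part counts and row counts.
+SELECT
+    partition_id,
+    partition,
+    count()   AS parts,
+    sum(rows) AS total_rows
+FROM system.parts
+WHERE database = 'default' AND table = 'events' AND active
+GROUP BY partition_id, partition
+ORDER BY partition_id;
+```
+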
+ +### RQ.ClickHouse.ExportPartition.LargePartitions +version: 1.0 + +[ClickHouse] SHALL support exporting large partitions by: +* Handling partitions with large numbers of parts (e.g., hundreds or thousands of parts) +* Processing partitions with large numbers of rows (e.g., billions of rows) +* Processing large data volumes efficiently during export +* Maintaining data integrity when exporting large partitions +* Completing export operations successfully regardless of partition size +* Allowing export operations to continue over extended periods of time for very large partitions + +Production systems often have partitions containing very large amounts of data, and the export must handle these efficiently without timeouts or memory issues. + +## Export operation failure handling + +### RQ.ClickHouse.ExportPartition.RetryMechanism +version: 1.0 + +[ClickHouse] SHALL automatically retry failed part exports within a partition up to a configurable maximum retry count. If all retry attempts are exhausted for a part, the entire partition export operation SHALL be marked as failed. + +Unlike single-part exports, partition exports involve multiple parts and may take significant time. Retry mechanisms ensure that temporary failures don't require restarting the entire export operation. + +### RQ.ClickHouse.ExportPartition.Settings.MaxRetries +version: 1.0 + +[ClickHouse] SHALL support the `export_merge_tree_partition_max_retries` setting that controls the maximum number of retries for exporting a merge tree part in an export partition task. The default value SHALL be `3`. + +This setting allows users to control how many times the system will retry exporting a part before marking it as failed. + +### RQ.ClickHouse.ExportPartition.ResumeAfterFailure +version: 1.0 + +[ClickHouse] SHALL allow export operations to resume after node failures or restarts. The system SHALL track which parts have been successfully exported and SHALL not re-export parts that were already successfully exported. + +### RQ.ClickHouse.ExportPartition.PartialProgress +version: 1.0 + +[ClickHouse] SHALL allow export operations to make partial progress, with successfully exported parts remaining in the destination even if other parts fail. Users SHALL be able to see which parts have been successfully exported and which parts have failed. + +### RQ.ClickHouse.ExportPartition.Cleanup +version: 1.0 + +[ClickHouse] SHALL automatically clean up failed or completed export operations after a configurable TTL period. + +### RQ.ClickHouse.ExportPartition.Settings.ManifestTTL +version: 1.0 + +[ClickHouse] SHALL support the `export_merge_tree_partition_manifest_ttl` setting that determines how long the export manifest will be retained. This setting prevents the same partition from being exported twice to the same destination within the TTL period. The default value SHALL be `180` seconds. + +This setting only affects completed export operations and does not delete in-progress tasks. It allows users to control how long export history is maintained to prevent duplicate exports. 
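+Taken together, these knobs could be combined in a single statement as sketched below. This assumes the retry and manifest-TTL settings are accepted as query-level settings in the `SETTINGS` clause; the values shown are arbitrary examples rather than recommended defaults, and the table names are the assumed examples used above.
+
+```sql
+-- Hypothetical: export with more retries per part and a longer manifest TTL.
+ALTER TABLE default.events
+EXPORT PARTITION ID '42'
+TO TABLE default.events_s3
+SETTINGS
+    allow_experimental_export_merge_tree_part = 1,
+    export_merge_tree_partition_max_retries = 5,
+    export_merge_tree_partition_manifest_ttl = 600;
+```
+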
+ +## Network resilience + +### RQ.ClickHouse.ExportPartition.NetworkResilience.PacketIssues +version: 1.0 + +[ClickHouse] SHALL handle network packet issues during export operations by: +* Tolerating packet delay without data corruption or loss +* Handling packet loss and retransmitting data as needed +* Detecting and handling packet corruption to ensure data integrity +* Managing packet duplication without data duplication in destination +* Handling packet reordering to maintain correct data sequence +* Operating correctly under packet rate limiting constraints +* Completing exports successfully despite network impairments + +Network issues are common in distributed systems, and export operations must be resilient to ensure data integrity. + +### RQ.ClickHouse.ExportPartition.NetworkResilience.DestinationInterruption +version: 1.0 + +[ClickHouse] SHALL handle destination storage interruptions during export operations by: +* Detecting when destination storage becomes unavailable during export +* Retrying failed part exports when destination storage becomes available again +* Logging failed exports in the `system.events` table with appropriate counters +* Not leaving partial or corrupted data in destination storage when exports fail due to destination unavailability +* Allowing exports to complete successfully once destination storage becomes available again +* Resuming export operations from the last successfully exported part + +Destination storage systems may experience temporary outages, and the export should automatically recover when service is restored. + +### RQ.ClickHouse.ExportPartition.NetworkResilience.NodeInterruption +version: 1.0 + +[ClickHouse] SHALL handle ClickHouse node interruptions during export operations by: +* Allowing export operations to resume after node restart without data loss or duplication +* Allowing other replicas to continue or complete export operations if a node fails +* Not leaving partial or corrupted data in destination storage when node restarts occur +* With safe shutdown, ensuring exports complete successfully before node shutdown when possible +* With unsafe shutdown, allowing export operations to resume from the last checkpoint after node restart +* Maintaining data integrity in destination storage regardless of node interruption type +* Ensuring that parts already exported are not re-exported after node restart + +Node failures are common in distributed systems, and export operations must be able to recover and continue without data loss or duplication. + +## Export operation restrictions + +### Preventing same table exports + +#### RQ.ClickHouse.ExportPartition.Restrictions.SameTable +version: 1.0 + +[ClickHouse] SHALL prevent exporting partitions to the same table as the source by: +* Validating that source and destination table identifiers are different +* Throwing a `BAD_ARGUMENTS` exception with message "Exporting to the same table is not allowed" when source and destination are identical +* Performing this validation before any export processing begins + +Exporting to the same table would be redundant and could cause data duplication or conflicts. 
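+A minimal sketch of the rejected case follows (table names are placeholders; the expected error is the `BAD_ARGUMENTS` exception described above):
+
+```sql
+-- Expected to fail with BAD_ARGUMENTS:
+-- source and destination refer to the same table.
+ALTER TABLE default.events
+EXPORT PARTITION ID '42'
+TO TABLE default.events
+SETTINGS allow_experimental_export_merge_tree_part = 1;
+```
+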
+ +### Destination table compatibility + +#### RQ.ClickHouse.ExportPartition.Restrictions.DestinationSupport +version: 1.0 + +[ClickHouse] SHALL validate destination table compatibility by: + +* Checking that the destination storage supports importing MergeTree parts +* Verifying that the destination uses Hive partitioning strategy (`partition_strategy = 'hive'`) +* Throwing a `NOT_IMPLEMENTED` exception with message "Destination storage {} does not support MergeTree parts or uses unsupported partitioning" when requirements are not met +* Performing this validation during the initial export setup phase + +The destination must support the format and partitioning strategy required for exported data. + +### Local table restriction + +#### RQ.ClickHouse.ExportPartition.Restrictions.LocalTable +version: 1.0 + +[ClickHouse] SHALL prevent exporting partitions to local MergeTree tables by: +* Rejecting export operations where the destination table uses a MergeTree engine +* Throwing a `NOT_IMPLEMENTED` exception (error code 48) with message "Destination storage MergeTree does not support MergeTree parts or uses unsupported partitioning" when attempting to export to a local table +* Performing this validation during the initial export setup phase + +Export partition is designed to move data to object storage, not to local MergeTree tables. + +### Partition key compatibility + +#### RQ.ClickHouse.ExportPartition.Restrictions.PartitionKey +version: 1.0 + +[ClickHouse] SHALL validate that source and destination tables have the same partition key expression by: +* Checking that the partition key expression matches between source and destination tables +* Throwing a `BAD_ARGUMENTS` exception (error code 36) with message "Tables have different partition key" when partition keys differ +* Performing this validation during the initial export setup phase + +Matching partition keys ensure that exported data is organized correctly in the destination storage. + +### Source partition availability + +#### RQ.ClickHouse.ExportPartition.Restrictions.SourcePartition +version: 1.0 + +[ClickHouse] SHALL validate source partition availability by: +* Checking that the specified partition ID exists in the source table +* Verifying that the partition contains at least one active part (not detached or missing) +* Throwing an exception with an appropriate error message when the partition is not found or is empty +* Performing this validation before any export processing begins + +The system must verify that the partition exists and contains data before attempting to export it. + +## Export operation concurrency + +### RQ.ClickHouse.ExportPartition.Concurrency +version: 1.0 + +[ClickHouse] SHALL support concurrent export operations by: +* Allowing multiple partition exports to run simultaneously without interference +* Supporting concurrent exports of different partitions to different destinations +* Preventing concurrent exports of the same partition to the same destination +* Allowing different replicas to export different parts of the same partition concurrently +* Maintaining separate progress tracking for each concurrent operation + +Multiple users may want to export different partitions simultaneously, and the system must coordinate these operations to prevent conflicts while maximizing parallelism. 
+ +## Export operation idempotency + +### RQ.ClickHouse.ExportPartition.Idempotency +version: 1.0 + +[ClickHouse] SHALL handle duplicate export operations by: +* Preventing duplicate data from being exported when the same partition is exported multiple times to the same destination +* Detecting when a partition export is already in progress or completed +* Detecting when an export operation attempts to export a partition that already exists in the destination +* Logging duplicate export attempts in the `system.events` table with appropriate counters +* Ensuring that destination data matches source data without duplication when the same partition is exported multiple times +* Allowing users to force re-export of a partition if needed (e.g., after TTL expiration or manual cleanup) + +Users may accidentally trigger the same export multiple times, and the system should prevent duplicate data while allowing legitimate re-exports when needed. + +### RQ.ClickHouse.ExportPartition.Settings.ForceExport +version: 1.0 + +[ClickHouse] SHALL support the `export_merge_tree_partition_force_export` setting that allows users to ignore existing partition export entries and force a new export operation. The default value SHALL be `false` (turned off). + +When set to `true`, this setting allows users to overwrite existing export entries and force re-export of a partition, even if a previous export operation exists for the same partition and destination. + +## Export operation logging + +### RQ.ClickHouse.ExportPartition.Logging +version: 1.0 + +[ClickHouse] SHALL provide detailed logging for export operations by: +* Logging all export operations (both successful and failed) with timestamps and details +* Recording the specific partition ID in the `system.part_log` table for all operations +* Logging export events in the `system.events` table, including: + * `PartsExports` - Number of successful part exports (within partitions) + * `PartsExportFailures` - Number of failed part exports + * `PartsExportDuplicated` - Number of part exports that failed because target already exists +* Writing operation information to the `system.part_log` table with `event_type` set to `EXPORT_PARTITION` +* Providing sufficient detail for monitoring and troubleshooting export operations +* Logging per-part export status within partition exports + +Detailed logging helps users monitor export progress, troubleshoot issues, and audit export operations. + +## Monitoring export operations + +### RQ.ClickHouse.ExportPartition.SystemTables.Exports +version: 1.0 + +[ClickHouse] SHALL provide a `system.replicated_partition_exports` table that allows users to monitor active partition export operations with at least the following columns: +* `source_table` - source table identifier +* `destination_table` - destination table identifier +* `partition_id` - the partition ID being exported +* `status` - current status of the export operation (e.g., PENDING, IN_PROGRESS, COMPLETED, FAILED) +* `parts_total` - total number of parts in the partition +* `parts_processed` - number of parts successfully exported +* `parts_failed` - number of parts that failed to export +* `create_time` - when the export operation was created +* `update_time` - last update time of the export operation + +The table SHALL track export operations before they complete and SHALL show completed or failed exports until they are cleaned up (based on TTL). 
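+A monitoring query over this table, together with the export counters described in the logging section above, might look like the following sketch; the table and column names are taken from this specification and are otherwise assumptions.
+
+```sql
+-- In-flight and recently finished partition exports.
+SELECT
+    source_table,
+    destination_table,
+    partition_id,
+    status,
+    parts_processed,
+    parts_failed,
+    parts_total,
+    update_time
+FROM system.replicated_partition_exports
+ORDER BY update_time DESC;
+
+-- Cumulative export counters from system.events.
+SELECT name, value
+FROM system.events
+WHERE name LIKE 'PartsExport%';
+```
+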
+ +Users need visibility into export operations to monitor progress, identify issues, and understand export status across the cluster. + +## Enabling export functionality + +### RQ.ClickHouse.ExportPartition.Settings.AllowExperimental +version: 1.0 + +[ClickHouse] SHALL support the `allow_experimental_export_merge_tree_part` setting that SHALL gate the experimental export partition functionality, which SHALL be set to `1` to enable `ALTER TABLE ... EXPORT PARTITION ID ...` commands. The default value SHALL be `0` (turned off). + +This setting allows administrators to control access to experimental functionality and ensures users are aware they are using a feature that may change. + +## Handling file conflicts during export + +### RQ.ClickHouse.ExportPartition.Settings.OverwriteFile +version: 1.0 + +[ClickHouse] SHALL support the `export_merge_tree_part_overwrite_file_if_exists` setting that controls whether to overwrite files if they already exist when exporting a partition. The default value SHALL be `0` (turned off), meaning exports will fail if files already exist in the destination. + +This setting allows users to control whether to overwrite existing data in the destination, providing safety by default while allowing overwrites when needed. + +## Export operation configuration + +### RQ.ClickHouse.ExportPartition.ParallelFormatting +version: 1.0 + +[ClickHouse] SHALL support parallel formatting for export operations by: +* Automatically enabling parallel formatting for large export operations to improve performance +* Using the `output_format_parallel_formatting` setting to control parallel formatting behavior +* Optimizing data processing based on export size and system resources +* Providing consistent formatting performance across different export scenarios +* Allowing parallel processing of multiple parts within a partition when possible + +Parallel formatting improves export performance, especially for large partitions with many parts. + +## Controlling export performance + +### RQ.ClickHouse.ExportPartition.ServerSettings.MaxBandwidth +version: 1.0 + +[ClickHouse] SHALL support the `max_exports_bandwidth_for_server` server setting to limit the maximum read speed of all exports on the server in bytes per second, with `0` meaning unlimited bandwidth. The default value SHALL be `0`. This is a server-level setting configured in the server configuration file. + +Administrators need to control export bandwidth to avoid impacting other operations on the server. + +### RQ.ClickHouse.ExportPartition.ServerSettings.BackgroundMovePoolSize +version: 1.0 + +[ClickHouse] SHALL support the `background_move_pool_size` server setting to control the maximum number of threads that will be used for executing export operations in the background. The default value SHALL be `8`. This is a server-level setting configured in the server configuration file. + +This setting allows administrators to balance export performance with other system operations. + +### RQ.ClickHouse.ExportPartition.Metrics.Export +version: 1.0 + +[ClickHouse] SHALL provide the `Export` current metric in `system.metrics` table that tracks the number of currently executing partition exports. + +This metric helps monitor system load from export operations. + +## Export operation security + +### RQ.ClickHouse.ExportPartition.Security.RBAC +version: 1.0 + +[ClickHouse] SHALL enforce role-based access control (RBAC) for export operations. 
Users must have the following privileges to perform export operations: +* **Source Table**: `SELECT` privilege on the source table to read data parts +* **Destination Table**: `INSERT` privilege on the destination table to write exported data +* **Database Access**: `SHOW` privilege on both source and destination databases +* **System Tables**: `SELECT` privilege on `system.tables` and `system.replicated_partition_exports` to validate table existence and monitor exports + +Export operations move potentially sensitive data, and proper access controls ensure only authorized users can export partitions. + +### RQ.ClickHouse.ExportPartition.Security.DataEncryption +version: 1.0 + +[ClickHouse] SHALL encrypt all data in transit to destination storage using TLS/SSL during export operations. + +Data encryption protects sensitive information from being intercepted or accessed during transmission to destination storage. + +### RQ.ClickHouse.ExportPartition.Security.Network +version: 1.0 + +[ClickHouse] SHALL use secure connections to destination storage during export operations. For S3-compatible storage, connections must use HTTPS. For other storage types, secure protocols appropriate to the storage system must be used. + +Secure network connections prevent unauthorized access and ensure data integrity during export operations. + +### RQ.ClickHouse.ExportPartition.Security.CredentialManagement +version: 1.0 + +[ClickHouse] SHALL use secure credential storage for export operations and SHALL avoid exposing credentials in logs or error messages. + +Proper credential management prevents unauthorized access to destination storage systems and protects sensitive authentication information. + + +[ClickHouse]: https://clickhouse.com + diff --git a/s3/requirements/export_partition.py b/s3/requirements/export_partition.py new file mode 100644 index 000000000..683e6dfb0 --- /dev/null +++ b/s3/requirements/export_partition.py @@ -0,0 +1,1736 @@ +# These requirements were auto generated +# from software requirements specification (SRS) +# document by TestFlows v2.0.250110.1002922. +# Do not edit by hand but re-generate instead +# using 'tfs requirements generate' command. +from testflows.core import Specification +from testflows.core import Requirement + +Heading = Specification.Heading + +RQ_ClickHouse_ExportPartition_S3 = Requirement( + name="RQ.ClickHouse.ExportPartition.S3", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting partitions (all parts within a partition) from ReplicatedMergeTree engine tables to S3 object storage. 
The export operation SHALL export all parts that belong to the specified partition ID, ensuring complete partition data is transferred to the destination.\n" + "\n" + ), + link=None, + level=2, + num="2.1", +) + +RQ_ClickHouse_ExportPartition_EmptyPartition = Requirement( + name="RQ.ClickHouse.ExportPartition.EmptyPartition", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting from empty partitions by:\n" + "* Completing export operations successfully when the specified partition contains no parts\n" + "* Resulting in an empty destination partition when exporting from an empty source partition\n" + "* Not creating any files in destination storage when there are no parts to export in the partition\n" + "* Handling empty partitions gracefully without errors\n" + "\n" + ), + link=None, + level=2, + num="2.2", +) + +RQ_ClickHouse_ExportPartition_SQLCommand = Requirement( + name="RQ.ClickHouse.ExportPartition.SQLCommand", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support the following SQL command syntax for exporting partitions from ReplicatedMergeTree tables to object storage tables:\n" + "\n" + "```sql\n" + "ALTER TABLE [database.]source_table_name \n" + "EXPORT PARTITION ID 'partition_id' \n" + "TO TABLE [database.]destination_table_name\n" + "SETTINGS allow_experimental_export_merge_tree_part = 1\n" + "```\n" + "\n" + "**Parameters:**\n" + "- `source_table_name`: Name of the source ReplicatedMergeTree table\n" + "- `partition_id`: The partition ID to export (string literal), which identifies all parts belonging to that partition\n" + "- `destination_table_name`: Name of the destination object storage table\n" + "- `allow_experimental_export_merge_tree_part`: Setting that must be set to `1` to enable this experimental feature\n" + "\n" + "This command allows users to export entire partitions in a single operation, which is more efficient than exporting individual parts and ensures all data for a partition is exported together.\n" + "\n" + ), + link=None, + level=2, + num="3.1", +) + +RQ_ClickHouse_ExportPartition_SourceEngines = Requirement( + name="RQ.ClickHouse.ExportPartition.SourceEngines", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting partitions from the following source table engines:\n" + "* `ReplicatedMergeTree` - Replicated MergeTree engine (primary use case)\n" + "* `ReplicatedSummingMergeTree` - Replicated MergeTree with automatic summation\n" + "* `ReplicatedAggregatingMergeTree` - Replicated MergeTree with pre-aggregated data\n" + "* `ReplicatedCollapsingMergeTree` - Replicated MergeTree with row versioning\n" + "* `ReplicatedVersionedCollapsingMergeTree` - Replicated CollapsingMergeTree with version tracking\n" + "* `ReplicatedGraphiteMergeTree` - Replicated MergeTree optimized for Graphite data\n" + "* All other ReplicatedMergeTree family engines\n" + "\n" + "Export partition functionality manages export operations across multiple replicas in a cluster, ensuring that parts are exported correctly and avoiding conflicts.\n" + "\n" + ), + link=None, + level=2, + num="4.1", +) + +RQ_ClickHouse_ExportPartition_ClustersNodes = Requirement( + name="RQ.ClickHouse.ExportPartition.ClustersNodes", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting partitions from multiple nodes in a 
ReplicatedMergeTree cluster to the same destination storage, ensuring that:\n" + "* Each replica in the cluster can independently export parts from the partition that it owns locally\n" + "* All parts within a partition are exported exactly once, even when distributed across multiple replicas\n" + "* Exported data from different replicas is correctly aggregated in the destination storage\n" + "* All nodes in the cluster can read the same exported partition data from the destination\n" + "* Export operations continue to make progress even if some replicas are temporarily unavailable\n" + "\n" + "In a replicated cluster, different parts of the same partition may exist on different replicas. The system must coordinate exports across all replicas to ensure complete partition export without duplication.\n" + "\n" + ), + link=None, + level=2, + num="5.1", +) + +RQ_ClickHouse_ExportPartition_SourcePartStorage = Requirement( + name="RQ.ClickHouse.ExportPartition.SourcePartStorage", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting partitions regardless of the underlying storage type where the source parts are stored, including:\n" + "* **Local Disks**: Parts stored on local filesystem\n" + "* **S3/Object Storage**: Parts stored on S3 or S3-compatible object storage\n" + "* **Encrypted Disks**: Parts stored on encrypted disks (disk-level encryption)\n" + "* **Cached Disks**: Parts stored with filesystem cache enabled\n" + "* **Remote Disks**: Parts stored on HDFS, Azure Blob Storage, or Google Cloud Storage\n" + "* **Tiered Storage**: Parts stored across multiple storage tiers (hot/cold)\n" + "* **Zero-Copy Replication Disks**: Parts stored with zero-copy replication enabled\n" + "\n" + "Users should be able to export partitions regardless of where the source data is physically stored, providing flexibility in storage configurations.\n" + "\n" + ), + link=None, + level=2, + num="6.1", +) + +RQ_ClickHouse_ExportPartition_StoragePolicies = Requirement( + name="RQ.ClickHouse.ExportPartition.StoragePolicies", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting partitions from tables using different storage policies, where storage policies are composed of volumes which are composed of disks, including:\n" + "* **JBOD Volumes**: Just a Bunch Of Disks volumes with multiple disks\n" + "* **External Volumes**: Volumes using external storage systems\n" + "* **Tiered Storage Policies**: Storage policies with multiple volumes for hot/cold data tiers\n" + "* **Custom Storage Policies**: Any storage policy configuration composed of volumes and disks\n" + "* Exporting all parts in a partition regardless of which volume or disk within the storage policy contains each part\n" + "* Maintaining data integrity when exporting from parts stored on any volume or disk in the storage policy\n" + "\n" + "Users may have partitions with parts distributed across different storage tiers or volumes, and the export should handle all parts regardless of their storage location.\n" + "\n" + ), + link=None, + level=2, + num="7.1", +) + +RQ_ClickHouse_ExportPartition_DestinationEngines = Requirement( + name="RQ.ClickHouse.ExportPartition.DestinationEngines", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting to destination tables that:\n" + "* Support object storage engines including:\n" + " * `S3` - 
Amazon S3 and S3-compatible storage\n" + " * `StorageObjectStorage` - Generic object storage interface\n" + " * `HDFS` - Hadoop Distributed File System (with Hive partitioning)\n" + " * `Azure` - Microsoft Azure Blob Storage (with Hive partitioning)\n" + " * `GCS` - Google Cloud Storage (with Hive partitioning)\n" + "\n" + "Export partition is designed to move data from local or replicated storage to object storage systems for long-term storage, analytics, or data sharing purposes.\n" + "\n" + ), + link=None, + level=2, + num="8.1", +) + +RQ_ClickHouse_ExportPartition_SchemaCompatibility = Requirement( + name="RQ.ClickHouse.ExportPartition.SchemaCompatibility", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL require source and destination tables to have compatible schemas for successful export operations:\n" + "* Identical physical column schemas between source and destination\n" + "* The same partition key expression in both tables\n" + "* Compatible data types for all columns\n" + "* Matching column order and names\n" + "\n" + "Schema compatibility ensures that exported data can be correctly read from the destination table without data loss or corruption.\n" + "\n" + ), + link=None, + level=2, + num="9.1", +) + +RQ_ClickHouse_ExportPartition_PartitionKeyTypes = Requirement( + name="RQ.ClickHouse.ExportPartition.PartitionKeyTypes", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support export operations for tables with partition key types that are compatible with Hive partitioning, as shown in the following table:\n" + "\n" + "| Partition Key Type | Supported | Examples | Notes |\n" + "|-------------------------|-----------|--------------------------------------------------------------------------|--------------------------------|\n" + "| **Integer Types** | ✅ Yes | `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64` | All integer types supported |\n" + "| **Date/DateTime Types** | ✅ Yes | `Date`, `Date32`, `DateTime`, `DateTime64` | All date/time types supported |\n" + "| **String Types** | ✅ Yes | `String`, `FixedString` | All string types supported |\n" + "| **No Partition Key** | ✅ Yes | Tables without `PARTITION BY` clause | Unpartitioned tables supported |\n" + "\n" + "[ClickHouse] SHALL automatically extract partition values from source parts and use them to create proper Hive partitioning structure in destination storage, but only for partition key types that are compatible with Hive partitioning requirements.\n" + "\n" + "[ClickHouse] SHALL require destination tables to support Hive partitioning, which limits the supported partition key types to Integer, Date/DateTime, and String types. 
Complex expressions that result in unsupported types are not supported for export operations.\n" + "\n" + "Hive partitioning is a standard way to organize data in object storage systems, making exported data compatible with various analytics tools and systems.\n" + "\n" + ), + link=None, + level=2, + num="10.1", +) + +RQ_ClickHouse_ExportPartition_PartitionContent = Requirement( + name="RQ.ClickHouse.ExportPartition.PartitionContent", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support export operations for partitions containing all valid MergeTree part types and their contents, including:\n" + "\n" + "| Part Type | Supported | Description | Special Features |\n" + "|-------------------|-----------|--------------------------------------------------------------|--------------------------------|\n" + "| **Wide Parts** | ✅ Yes | Data of each column stored in separate files with marks | Standard format for most parts |\n" + "| **Compact Parts** | ✅ Yes | All column data stored in single file with single marks file | Optimized for small parts |\n" + "\n" + "[ClickHouse] SHALL export all parts within the specified partition, regardless of their type. The system SHALL automatically apply lightweight delete masks during export to ensure only non-deleted rows are exported, and SHALL maintain data integrity in the destination storage.\n" + "\n" + "Partitions may contain a mix of different part types, and the export must handle all of them correctly to ensure complete partition export.\n" + "\n" + ), + link=None, + level=2, + num="11.1", +) + +RQ_ClickHouse_ExportPartition_SchemaChangeIsolation = Requirement( + name="RQ.ClickHouse.ExportPartition.SchemaChangeIsolation", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL ensure exported partition data is isolated from subsequent schema changes by:\n" + "* Preserving exported data exactly as it was at the time of export\n" + "* Not being affected by schema changes (column drops, renames, type changes) that occur after export\n" + "* Maintaining data integrity in destination storage regardless of mutations applied to the source table after export\n" + "* Ensuring exported data reflects the source table state at the time of export, not the current state\n" + "\n" + "Once a partition is exported, the exported data should remain stable and not be affected by future changes to the source table schema.\n" + "\n" + ), + link=None, + level=2, + num="11.2", +) + +RQ_ClickHouse_ExportPartition_LargePartitions = Requirement( + name="RQ.ClickHouse.ExportPartition.LargePartitions", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting large partitions by:\n" + "* Handling partitions with large numbers of parts (e.g., hundreds or thousands of parts)\n" + "* Processing partitions with large numbers of rows (e.g., billions of rows)\n" + "* Processing large data volumes efficiently during export\n" + "* Maintaining data integrity when exporting large partitions\n" + "* Completing export operations successfully regardless of partition size\n" + "* Allowing export operations to continue over extended periods of time for very large partitions\n" + "\n" + "Production systems often have partitions containing very large amounts of data, and the export must handle these efficiently without timeouts or memory issues.\n" + "\n" + ), + link=None, + level=2, + num="11.3", +) + 
+RQ_ClickHouse_ExportPartition_RetryMechanism = Requirement( + name="RQ.ClickHouse.ExportPartition.RetryMechanism", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL automatically retry failed part exports within a partition up to a configurable maximum retry count. If all retry attempts are exhausted for a part, the entire partition export operation SHALL be marked as failed.\n" + "\n" + "Unlike single-part exports, partition exports involve multiple parts and may take significant time. Retry mechanisms ensure that temporary failures don't require restarting the entire export operation.\n" + "\n" + ), + link=None, + level=2, + num="12.1", +) + +RQ_ClickHouse_ExportPartition_Settings_MaxRetries = Requirement( + name="RQ.ClickHouse.ExportPartition.Settings.MaxRetries", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support the `export_merge_tree_partition_max_retries` setting that controls the maximum number of retries for exporting a merge tree part in an export partition task. The default value SHALL be `3`.\n" + "\n" + "This setting allows users to control how many times the system will retry exporting a part before marking it as failed.\n" + "\n" + ), + link=None, + level=2, + num="12.2", +) + +RQ_ClickHouse_ExportPartition_ResumeAfterFailure = Requirement( + name="RQ.ClickHouse.ExportPartition.ResumeAfterFailure", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL allow export operations to resume after node failures or restarts. The system SHALL track which parts have been successfully exported and SHALL not re-export parts that were already successfully exported.\n" + "\n" + ), + link=None, + level=2, + num="12.3", +) + +RQ_ClickHouse_ExportPartition_PartialProgress = Requirement( + name="RQ.ClickHouse.ExportPartition.PartialProgress", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL allow export operations to make partial progress, with successfully exported parts remaining in the destination even if other parts fail. Users SHALL be able to see which parts have been successfully exported and which parts have failed.\n" + "\n" + ), + link=None, + level=2, + num="12.4", +) + +RQ_ClickHouse_ExportPartition_Cleanup = Requirement( + name="RQ.ClickHouse.ExportPartition.Cleanup", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL automatically clean up failed or completed export operations after a configurable TTL period.\n" + "\n" + ), + link=None, + level=2, + num="12.5", +) + +RQ_ClickHouse_ExportPartition_Settings_ManifestTTL = Requirement( + name="RQ.ClickHouse.ExportPartition.Settings.ManifestTTL", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support the `export_merge_tree_partition_manifest_ttl` setting that determines how long the export manifest will be retained. This setting prevents the same partition from being exported twice to the same destination within the TTL period. The default value SHALL be `180` seconds.\n" + "\n" + "This setting only affects completed export operations and does not delete in-progress tasks. 
It allows users to control how long export history is maintained to prevent duplicate exports.\n" + "\n" + ), + link=None, + level=2, + num="12.6", +) + +RQ_ClickHouse_ExportPartition_NetworkResilience_PacketIssues = Requirement( + name="RQ.ClickHouse.ExportPartition.NetworkResilience.PacketIssues", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL handle network packet issues during export operations by:\n" + "* Tolerating packet delay without data corruption or loss\n" + "* Handling packet loss and retransmitting data as needed\n" + "* Detecting and handling packet corruption to ensure data integrity\n" + "* Managing packet duplication without data duplication in destination\n" + "* Handling packet reordering to maintain correct data sequence\n" + "* Operating correctly under packet rate limiting constraints\n" + "* Completing exports successfully despite network impairments\n" + "\n" + "Network issues are common in distributed systems, and export operations must be resilient to ensure data integrity.\n" + "\n" + ), + link=None, + level=2, + num="13.1", +) + +RQ_ClickHouse_ExportPartition_NetworkResilience_DestinationInterruption = Requirement( + name="RQ.ClickHouse.ExportPartition.NetworkResilience.DestinationInterruption", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL handle destination storage interruptions during export operations by:\n" + "* Detecting when destination storage becomes unavailable during export\n" + "* Retrying failed part exports when destination storage becomes available again\n" + "* Logging failed exports in the `system.events` table with appropriate counters\n" + "* Not leaving partial or corrupted data in destination storage when exports fail due to destination unavailability\n" + "* Allowing exports to complete successfully once destination storage becomes available again\n" + "* Resuming export operations from the last successfully exported part\n" + "\n" + "Destination storage systems may experience temporary outages, and the export should automatically recover when service is restored.\n" + "\n" + ), + link=None, + level=2, + num="13.2", +) + +RQ_ClickHouse_ExportPartition_NetworkResilience_NodeInterruption = Requirement( + name="RQ.ClickHouse.ExportPartition.NetworkResilience.NodeInterruption", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL handle ClickHouse node interruptions during export operations by:\n" + "* Allowing export operations to resume after node restart without data loss or duplication\n" + "* Allowing other replicas to continue or complete export operations if a node fails\n" + "* Not leaving partial or corrupted data in destination storage when node restarts occur\n" + "* With safe shutdown, ensuring exports complete successfully before node shutdown when possible\n" + "* With unsafe shutdown, allowing export operations to resume from the last checkpoint after node restart\n" + "* Maintaining data integrity in destination storage regardless of node interruption type\n" + "* Ensuring that parts already exported are not re-exported after node restart\n" + "\n" + "Node failures are common in distributed systems, and export operations must be able to recover and continue without data loss or duplication.\n" + "\n" + ), + link=None, + level=2, + num="13.3", +) + +RQ_ClickHouse_ExportPartition_Restrictions_SameTable = Requirement( + 
name="RQ.ClickHouse.ExportPartition.Restrictions.SameTable", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL prevent exporting partitions to the same table as the source by:\n" + "* Validating that source and destination table identifiers are different\n" + '* Throwing a `BAD_ARGUMENTS` exception with message "Exporting to the same table is not allowed" when source and destination are identical\n' + "* Performing this validation before any export processing begins\n" + "\n" + "Exporting to the same table would be redundant and could cause data duplication or conflicts.\n" + "\n" + ), + link=None, + level=3, + num="14.1.1", +) + +RQ_ClickHouse_ExportPartition_Restrictions_DestinationSupport = Requirement( + name="RQ.ClickHouse.ExportPartition.Restrictions.DestinationSupport", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL validate destination table compatibility by:\n" + "\n" + "* Checking that the destination storage supports importing MergeTree parts\n" + "* Verifying that the destination uses Hive partitioning strategy (`partition_strategy = 'hive'`)\n" + '* Throwing a `NOT_IMPLEMENTED` exception with message "Destination storage {} does not support MergeTree parts or uses unsupported partitioning" when requirements are not met\n' + "* Performing this validation during the initial export setup phase\n" + "\n" + "The destination must support the format and partitioning strategy required for exported data.\n" + "\n" + ), + link=None, + level=3, + num="14.2.1", +) + +RQ_ClickHouse_ExportPartition_Restrictions_LocalTable = Requirement( + name="RQ.ClickHouse.ExportPartition.Restrictions.LocalTable", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL prevent exporting partitions to local MergeTree tables by:\n" + "* Rejecting export operations where the destination table uses a MergeTree engine\n" + '* Throwing a `NOT_IMPLEMENTED` exception (error code 48) with message "Destination storage MergeTree does not support MergeTree parts or uses unsupported partitioning" when attempting to export to a local table\n' + "* Performing this validation during the initial export setup phase\n" + "\n" + "Export partition is designed to move data to object storage, not to local MergeTree tables.\n" + "\n" + ), + link=None, + level=3, + num="14.3.1", +) + +RQ_ClickHouse_ExportPartition_Restrictions_PartitionKey = Requirement( + name="RQ.ClickHouse.ExportPartition.Restrictions.PartitionKey", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL validate that source and destination tables have the same partition key expression by:\n" + "* Checking that the partition key expression matches between source and destination tables\n" + '* Throwing a `BAD_ARGUMENTS` exception (error code 36) with message "Tables have different partition key" when partition keys differ\n' + "* Performing this validation during the initial export setup phase\n" + "\n" + "Matching partition keys ensure that exported data is organized correctly in the destination storage.\n" + "\n" + ), + link=None, + level=3, + num="14.4.1", +) + +RQ_ClickHouse_ExportPartition_Restrictions_SourcePartition = Requirement( + name="RQ.ClickHouse.ExportPartition.Restrictions.SourcePartition", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL validate source 
partition availability by:\n" + "* Checking that the specified partition ID exists in the source table\n" + "* Verifying that the partition contains at least one active part (not detached or missing)\n" + "* Throwing an exception with an appropriate error message when the partition is not found or is empty\n" + "* Performing this validation before any export processing begins\n" + "\n" + "The system must verify that the partition exists and contains data before attempting to export it.\n" + "\n" + ), + link=None, + level=3, + num="14.5.1", +) + +RQ_ClickHouse_ExportPartition_Concurrency = Requirement( + name="RQ.ClickHouse.ExportPartition.Concurrency", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support concurrent export operations by:\n" + "* Allowing multiple partition exports to run simultaneously without interference\n" + "* Supporting concurrent exports of different partitions to different destinations\n" + "* Preventing concurrent exports of the same partition to the same destination\n" + "* Allowing different replicas to export different parts of the same partition concurrently\n" + "* Maintaining separate progress tracking for each concurrent operation\n" + "\n" + "Multiple users may want to export different partitions simultaneously, and the system must coordinate these operations to prevent conflicts while maximizing parallelism.\n" + "\n" + ), + link=None, + level=2, + num="15.1", +) + +RQ_ClickHouse_ExportPartition_Idempotency = Requirement( + name="RQ.ClickHouse.ExportPartition.Idempotency", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL handle duplicate export operations by:\n" + "* Preventing duplicate data from being exported when the same partition is exported multiple times to the same destination\n" + "* Detecting when a partition export is already in progress or completed\n" + "* Detecting when an export operation attempts to export a partition that already exists in the destination\n" + "* Logging duplicate export attempts in the `system.events` table with appropriate counters\n" + "* Ensuring that destination data matches source data without duplication when the same partition is exported multiple times\n" + "* Allowing users to force re-export of a partition if needed (e.g., after TTL expiration or manual cleanup)\n" + "\n" + "Users may accidentally trigger the same export multiple times, and the system should prevent duplicate data while allowing legitimate re-exports when needed.\n" + "\n" + ), + link=None, + level=2, + num="16.1", +) + +RQ_ClickHouse_ExportPartition_Settings_ForceExport = Requirement( + name="RQ.ClickHouse.ExportPartition.Settings.ForceExport", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support the `export_merge_tree_partition_force_export` setting that allows users to ignore existing partition export entries and force a new export operation. 
The default value SHALL be `false` (turned off).\n" + "\n" + "When set to `true`, this setting allows users to overwrite existing export entries and force re-export of a partition, even if a previous export operation exists for the same partition and destination.\n" + "\n" + ), + link=None, + level=2, + num="16.2", +) + +RQ_ClickHouse_ExportPartition_Logging = Requirement( + name="RQ.ClickHouse.ExportPartition.Logging", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL provide detailed logging for export operations by:\n" + "* Logging all export operations (both successful and failed) with timestamps and details\n" + "* Recording the specific partition ID in the `system.part_log` table for all operations\n" + "* Logging export events in the `system.events` table, including:\n" + " * `PartsExports` - Number of successful part exports (within partitions)\n" + " * `PartsExportFailures` - Number of failed part exports\n" + " * `PartsExportDuplicated` - Number of part exports that failed because target already exists\n" + "* Writing operation information to the `system.part_log` table with `event_type` set to `EXPORT_PARTITION`\n" + "* Providing sufficient detail for monitoring and troubleshooting export operations\n" + "* Logging per-part export status within partition exports\n" + "\n" + "Detailed logging helps users monitor export progress, troubleshoot issues, and audit export operations.\n" + "\n" + ), + link=None, + level=2, + num="17.1", +) + +RQ_ClickHouse_ExportPartition_SystemTables_Exports = Requirement( + name="RQ.ClickHouse.ExportPartition.SystemTables.Exports", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL provide a `system.replicated_partition_exports` table that allows users to monitor active partition export operations with at least the following columns:\n" + "* `source_table` - source table identifier\n" + "* `destination_table` - destination table identifier\n" + "* `partition_id` - the partition ID being exported\n" + "* `status` - current status of the export operation (e.g., PENDING, IN_PROGRESS, COMPLETED, FAILED)\n" + "* `parts_total` - total number of parts in the partition\n" + "* `parts_processed` - number of parts successfully exported\n" + "* `parts_failed` - number of parts that failed to export\n" + "* `create_time` - when the export operation was created\n" + "* `update_time` - last update time of the export operation\n" + "\n" + "The table SHALL track export operations before they complete and SHALL show completed or failed exports until they are cleaned up (based on TTL).\n" + "\n" + "Users need visibility into export operations to monitor progress, identify issues, and understand export status across the cluster.\n" + "\n" + ), + link=None, + level=2, + num="18.1", +) + +RQ_ClickHouse_ExportPartition_Settings_AllowExperimental = Requirement( + name="RQ.ClickHouse.ExportPartition.Settings.AllowExperimental", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support the `allow_experimental_export_merge_tree_part` setting that SHALL gate the experimental export partition functionality, which SHALL be set to `1` to enable `ALTER TABLE ... EXPORT PARTITION ID ...` commands. 
The default value SHALL be `0` (turned off).\n" + "\n" + "This setting allows administrators to control access to experimental functionality and ensures users are aware they are using a feature that may change.\n" + "\n" + ), + link=None, + level=2, + num="19.1", +) + +RQ_ClickHouse_ExportPartition_Settings_OverwriteFile = Requirement( + name="RQ.ClickHouse.ExportPartition.Settings.OverwriteFile", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support the `export_merge_tree_part_overwrite_file_if_exists` setting that controls whether to overwrite files if they already exist when exporting a partition. The default value SHALL be `0` (turned off), meaning exports will fail if files already exist in the destination.\n" + "\n" + "This setting allows users to control whether to overwrite existing data in the destination, providing safety by default while allowing overwrites when needed.\n" + "\n" + ), + link=None, + level=2, + num="20.1", +) + +RQ_ClickHouse_ExportPartition_ParallelFormatting = Requirement( + name="RQ.ClickHouse.ExportPartition.ParallelFormatting", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support parallel formatting for export operations by:\n" + "* Automatically enabling parallel formatting for large export operations to improve performance\n" + "* Using the `output_format_parallel_formatting` setting to control parallel formatting behavior\n" + "* Optimizing data processing based on export size and system resources\n" + "* Providing consistent formatting performance across different export scenarios\n" + "* Allowing parallel processing of multiple parts within a partition when possible\n" + "\n" + "Parallel formatting improves export performance, especially for large partitions with many parts.\n" + "\n" + ), + link=None, + level=2, + num="21.1", +) + +RQ_ClickHouse_ExportPartition_ServerSettings_MaxBandwidth = Requirement( + name="RQ.ClickHouse.ExportPartition.ServerSettings.MaxBandwidth", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support the `max_exports_bandwidth_for_server` server setting to limit the maximum read speed of all exports on the server in bytes per second, with `0` meaning unlimited bandwidth. The default value SHALL be `0`. This is a server-level setting configured in the server configuration file.\n" + "\n" + "Administrators need to control export bandwidth to avoid impacting other operations on the server.\n" + "\n" + ), + link=None, + level=2, + num="22.1", +) + +RQ_ClickHouse_ExportPartition_ServerSettings_BackgroundMovePoolSize = Requirement( + name="RQ.ClickHouse.ExportPartition.ServerSettings.BackgroundMovePoolSize", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support the `background_move_pool_size` server setting to control the maximum number of threads that will be used for executing export operations in the background. The default value SHALL be `8`. 
This is a server-level setting configured in the server configuration file.\n" + "\n" + "This setting allows administrators to balance export performance with other system operations.\n" + "\n" + ), + link=None, + level=2, + num="22.2", +) + +RQ_ClickHouse_ExportPartition_Metrics_Export = Requirement( + name="RQ.ClickHouse.ExportPartition.Metrics.Export", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL provide the `Export` current metric in `system.metrics` table that tracks the number of currently executing partition exports.\n" + "\n" + "This metric helps monitor system load from export operations.\n" + "\n" + ), + link=None, + level=2, + num="22.3", +) + +RQ_ClickHouse_ExportPartition_Security_RBAC = Requirement( + name="RQ.ClickHouse.ExportPartition.Security.RBAC", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL enforce role-based access control (RBAC) for export operations. Users must have the following privileges to perform export operations:\n" + "* **Source Table**: `SELECT` privilege on the source table to read data parts\n" + "* **Destination Table**: `INSERT` privilege on the destination table to write exported data\n" + "* **Database Access**: `SHOW` privilege on both source and destination databases\n" + "* **System Tables**: `SELECT` privilege on `system.tables` and `system.replicated_partition_exports` to validate table existence and monitor exports\n" + "\n" + "Export operations move potentially sensitive data, and proper access controls ensure only authorized users can export partitions.\n" + "\n" + ), + link=None, + level=2, + num="23.1", +) + +RQ_ClickHouse_ExportPartition_Security_DataEncryption = Requirement( + name="RQ.ClickHouse.ExportPartition.Security.DataEncryption", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL encrypt all data in transit to destination storage using TLS/SSL during export operations.\n" + "\n" + "Data encryption protects sensitive information from being intercepted or accessed during transmission to destination storage.\n" + "\n" + ), + link=None, + level=2, + num="23.2", +) + +RQ_ClickHouse_ExportPartition_Security_Network = Requirement( + name="RQ.ClickHouse.ExportPartition.Security.Network", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL use secure connections to destination storage during export operations. For S3-compatible storage, connections must use HTTPS. 
For other storage types, secure protocols appropriate to the storage system must be used.\n" + "\n" + "Secure network connections prevent unauthorized access and ensure data integrity during export operations.\n" + "\n" + ), + link=None, + level=2, + num="23.3", +) + +RQ_ClickHouse_ExportPartition_Security_CredentialManagement = Requirement( + name="RQ.ClickHouse.ExportPartition.Security.CredentialManagement", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL use secure credential storage for export operations and SHALL avoid exposing credentials in logs or error messages.\n" + "\n" + "Proper credential management prevents unauthorized access to destination storage systems and protects sensitive authentication information.\n" + "\n" + "\n" + "[ClickHouse]: https://clickhouse.com\n" + "\n" + ), + link=None, + level=2, + num="23.4", +) + +SRS_016_ClickHouse_Export_Partition_to_S3 = Specification( + name="SRS-016 ClickHouse Export Partition to S3", + description=None, + author=None, + date=None, + status=None, + approved_by=None, + approved_date=None, + approved_version=None, + version=None, + group=None, + type=None, + link=None, + uid=None, + parent=None, + children=None, + headings=( + Heading(name="Introduction", level=1, num="1"), + Heading(name="Exporting Partitions to S3", level=1, num="2"), + Heading(name="RQ.ClickHouse.ExportPartition.S3", level=2, num="2.1"), + Heading( + name="RQ.ClickHouse.ExportPartition.EmptyPartition", level=2, num="2.2" + ), + Heading(name="SQL command support", level=1, num="3"), + Heading(name="RQ.ClickHouse.ExportPartition.SQLCommand", level=2, num="3.1"), + Heading(name="Supported source table engines", level=1, num="4"), + Heading(name="RQ.ClickHouse.ExportPartition.SourceEngines", level=2, num="4.1"), + Heading(name="Cluster and node support", level=1, num="5"), + Heading(name="RQ.ClickHouse.ExportPartition.ClustersNodes", level=2, num="5.1"), + Heading(name="Supported source part storage types", level=1, num="6"), + Heading( + name="RQ.ClickHouse.ExportPartition.SourcePartStorage", level=2, num="6.1" + ), + Heading(name="Storage policies and volumes", level=1, num="7"), + Heading( + name="RQ.ClickHouse.ExportPartition.StoragePolicies", level=2, num="7.1" + ), + Heading(name="Supported destination table engines", level=1, num="8"), + Heading( + name="RQ.ClickHouse.ExportPartition.DestinationEngines", level=2, num="8.1" + ), + Heading(name="Schema compatibility", level=1, num="9"), + Heading( + name="RQ.ClickHouse.ExportPartition.SchemaCompatibility", level=2, num="9.1" + ), + Heading(name="Partition key types support", level=1, num="10"), + Heading( + name="RQ.ClickHouse.ExportPartition.PartitionKeyTypes", level=2, num="10.1" + ), + Heading(name="Partition content support", level=1, num="11"), + Heading( + name="RQ.ClickHouse.ExportPartition.PartitionContent", level=2, num="11.1" + ), + Heading( + name="RQ.ClickHouse.ExportPartition.SchemaChangeIsolation", + level=2, + num="11.2", + ), + Heading( + name="RQ.ClickHouse.ExportPartition.LargePartitions", level=2, num="11.3" + ), + Heading(name="Export operation failure handling", level=1, num="12"), + Heading( + name="RQ.ClickHouse.ExportPartition.RetryMechanism", level=2, num="12.1" + ), + Heading( + name="RQ.ClickHouse.ExportPartition.Settings.MaxRetries", + level=2, + num="12.2", + ), + Heading( + name="RQ.ClickHouse.ExportPartition.ResumeAfterFailure", level=2, num="12.3" + ), + Heading( + name="RQ.ClickHouse.ExportPartition.PartialProgress", 
level=2, num="12.4" + ), + Heading(name="RQ.ClickHouse.ExportPartition.Cleanup", level=2, num="12.5"), + Heading( + name="RQ.ClickHouse.ExportPartition.Settings.ManifestTTL", + level=2, + num="12.6", + ), + Heading(name="Network resilience", level=1, num="13"), + Heading( + name="RQ.ClickHouse.ExportPartition.NetworkResilience.PacketIssues", + level=2, + num="13.1", + ), + Heading( + name="RQ.ClickHouse.ExportPartition.NetworkResilience.DestinationInterruption", + level=2, + num="13.2", + ), + Heading( + name="RQ.ClickHouse.ExportPartition.NetworkResilience.NodeInterruption", + level=2, + num="13.3", + ), + Heading(name="Export operation restrictions", level=1, num="14"), + Heading(name="Preventing same table exports", level=2, num="14.1"), + Heading( + name="RQ.ClickHouse.ExportPartition.Restrictions.SameTable", + level=3, + num="14.1.1", + ), + Heading(name="Destination table compatibility", level=2, num="14.2"), + Heading( + name="RQ.ClickHouse.ExportPartition.Restrictions.DestinationSupport", + level=3, + num="14.2.1", + ), + Heading(name="Local table restriction", level=2, num="14.3"), + Heading( + name="RQ.ClickHouse.ExportPartition.Restrictions.LocalTable", + level=3, + num="14.3.1", + ), + Heading(name="Partition key compatibility", level=2, num="14.4"), + Heading( + name="RQ.ClickHouse.ExportPartition.Restrictions.PartitionKey", + level=3, + num="14.4.1", + ), + Heading(name="Source partition availability", level=2, num="14.5"), + Heading( + name="RQ.ClickHouse.ExportPartition.Restrictions.SourcePartition", + level=3, + num="14.5.1", + ), + Heading(name="Export operation concurrency", level=1, num="15"), + Heading(name="RQ.ClickHouse.ExportPartition.Concurrency", level=2, num="15.1"), + Heading(name="Export operation idempotency", level=1, num="16"), + Heading(name="RQ.ClickHouse.ExportPartition.Idempotency", level=2, num="16.1"), + Heading( + name="RQ.ClickHouse.ExportPartition.Settings.ForceExport", + level=2, + num="16.2", + ), + Heading(name="Export operation logging", level=1, num="17"), + Heading(name="RQ.ClickHouse.ExportPartition.Logging", level=2, num="17.1"), + Heading(name="Monitoring export operations", level=1, num="18"), + Heading( + name="RQ.ClickHouse.ExportPartition.SystemTables.Exports", + level=2, + num="18.1", + ), + Heading(name="Enabling export functionality", level=1, num="19"), + Heading( + name="RQ.ClickHouse.ExportPartition.Settings.AllowExperimental", + level=2, + num="19.1", + ), + Heading(name="Handling file conflicts during export", level=1, num="20"), + Heading( + name="RQ.ClickHouse.ExportPartition.Settings.OverwriteFile", + level=2, + num="20.1", + ), + Heading(name="Export operation configuration", level=1, num="21"), + Heading( + name="RQ.ClickHouse.ExportPartition.ParallelFormatting", level=2, num="21.1" + ), + Heading(name="Controlling export performance", level=1, num="22"), + Heading( + name="RQ.ClickHouse.ExportPartition.ServerSettings.MaxBandwidth", + level=2, + num="22.1", + ), + Heading( + name="RQ.ClickHouse.ExportPartition.ServerSettings.BackgroundMovePoolSize", + level=2, + num="22.2", + ), + Heading( + name="RQ.ClickHouse.ExportPartition.Metrics.Export", level=2, num="22.3" + ), + Heading(name="Export operation security", level=1, num="23"), + Heading( + name="RQ.ClickHouse.ExportPartition.Security.RBAC", level=2, num="23.1" + ), + Heading( + name="RQ.ClickHouse.ExportPartition.Security.DataEncryption", + level=2, + num="23.2", + ), + Heading( + name="RQ.ClickHouse.ExportPartition.Security.Network", level=2, num="23.3" + ), + Heading( 
+ name="RQ.ClickHouse.ExportPartition.Security.CredentialManagement", + level=2, + num="23.4", + ), + ), + requirements=( + RQ_ClickHouse_ExportPartition_S3, + RQ_ClickHouse_ExportPartition_EmptyPartition, + RQ_ClickHouse_ExportPartition_SQLCommand, + RQ_ClickHouse_ExportPartition_SourceEngines, + RQ_ClickHouse_ExportPartition_ClustersNodes, + RQ_ClickHouse_ExportPartition_SourcePartStorage, + RQ_ClickHouse_ExportPartition_StoragePolicies, + RQ_ClickHouse_ExportPartition_DestinationEngines, + RQ_ClickHouse_ExportPartition_SchemaCompatibility, + RQ_ClickHouse_ExportPartition_PartitionKeyTypes, + RQ_ClickHouse_ExportPartition_PartitionContent, + RQ_ClickHouse_ExportPartition_SchemaChangeIsolation, + RQ_ClickHouse_ExportPartition_LargePartitions, + RQ_ClickHouse_ExportPartition_RetryMechanism, + RQ_ClickHouse_ExportPartition_Settings_MaxRetries, + RQ_ClickHouse_ExportPartition_ResumeAfterFailure, + RQ_ClickHouse_ExportPartition_PartialProgress, + RQ_ClickHouse_ExportPartition_Cleanup, + RQ_ClickHouse_ExportPartition_Settings_ManifestTTL, + RQ_ClickHouse_ExportPartition_NetworkResilience_PacketIssues, + RQ_ClickHouse_ExportPartition_NetworkResilience_DestinationInterruption, + RQ_ClickHouse_ExportPartition_NetworkResilience_NodeInterruption, + RQ_ClickHouse_ExportPartition_Restrictions_SameTable, + RQ_ClickHouse_ExportPartition_Restrictions_DestinationSupport, + RQ_ClickHouse_ExportPartition_Restrictions_LocalTable, + RQ_ClickHouse_ExportPartition_Restrictions_PartitionKey, + RQ_ClickHouse_ExportPartition_Restrictions_SourcePartition, + RQ_ClickHouse_ExportPartition_Concurrency, + RQ_ClickHouse_ExportPartition_Idempotency, + RQ_ClickHouse_ExportPartition_Settings_ForceExport, + RQ_ClickHouse_ExportPartition_Logging, + RQ_ClickHouse_ExportPartition_SystemTables_Exports, + RQ_ClickHouse_ExportPartition_Settings_AllowExperimental, + RQ_ClickHouse_ExportPartition_Settings_OverwriteFile, + RQ_ClickHouse_ExportPartition_ParallelFormatting, + RQ_ClickHouse_ExportPartition_ServerSettings_MaxBandwidth, + RQ_ClickHouse_ExportPartition_ServerSettings_BackgroundMovePoolSize, + RQ_ClickHouse_ExportPartition_Metrics_Export, + RQ_ClickHouse_ExportPartition_Security_RBAC, + RQ_ClickHouse_ExportPartition_Security_DataEncryption, + RQ_ClickHouse_ExportPartition_Security_Network, + RQ_ClickHouse_ExportPartition_Security_CredentialManagement, + ), + content=r""" +# SRS-016 ClickHouse Export Partition to S3 +# Software Requirements Specification + +## Table of Contents + +* 1 [Introduction](#introduction) +* 2 [Exporting Partitions to S3](#exporting-partitions-to-s3) + * 2.1 [RQ.ClickHouse.ExportPartition.S3](#rqclickhouseexportpartitions3) + * 2.2 [RQ.ClickHouse.ExportPartition.EmptyPartition](#rqclickhouseexportpartitionemptypartition) +* 3 [SQL command support](#sql-command-support) + * 3.1 [RQ.ClickHouse.ExportPartition.SQLCommand](#rqclickhouseexportpartitionsqlcommand) +* 4 [Supported source table engines](#supported-source-table-engines) + * 4.1 [RQ.ClickHouse.ExportPartition.SourceEngines](#rqclickhouseexportpartitionsourceengines) +* 5 [Cluster and node support](#cluster-and-node-support) + * 5.1 [RQ.ClickHouse.ExportPartition.ClustersNodes](#rqclickhouseexportpartitionclustersnodes) +* 6 [Supported source part storage types](#supported-source-part-storage-types) + * 6.1 [RQ.ClickHouse.ExportPartition.SourcePartStorage](#rqclickhouseexportpartitionsourcepartstorage) +* 7 [Storage policies and volumes](#storage-policies-and-volumes) + * 7.1 
[RQ.ClickHouse.ExportPartition.StoragePolicies](#rqclickhouseexportpartitionstoragepolicies) +* 8 [Supported destination table engines](#supported-destination-table-engines) + * 8.1 [RQ.ClickHouse.ExportPartition.DestinationEngines](#rqclickhouseexportpartitiondestinationengines) +* 9 [Schema compatibility](#schema-compatibility) + * 9.1 [RQ.ClickHouse.ExportPartition.SchemaCompatibility](#rqclickhouseexportpartitionschemacompatibility) +* 10 [Partition key types support](#partition-key-types-support) + * 10.1 [RQ.ClickHouse.ExportPartition.PartitionKeyTypes](#rqclickhouseexportpartitionpartitionkeytypes) +* 11 [Partition content support](#partition-content-support) + * 11.1 [RQ.ClickHouse.ExportPartition.PartitionContent](#rqclickhouseexportpartitionpartitioncontent) + * 11.2 [RQ.ClickHouse.ExportPartition.SchemaChangeIsolation](#rqclickhouseexportpartitionschemachangeisolation) + * 11.3 [RQ.ClickHouse.ExportPartition.LargePartitions](#rqclickhouseexportpartitionlargepartitions) +* 12 [Export operation failure handling](#export-operation-failure-handling) + * 12.1 [RQ.ClickHouse.ExportPartition.RetryMechanism](#rqclickhouseexportpartitionretrymechanism) + * 12.2 [RQ.ClickHouse.ExportPartition.Settings.MaxRetries](#rqclickhouseexportpartitionsettingsmaxretries) + * 12.3 [RQ.ClickHouse.ExportPartition.ResumeAfterFailure](#rqclickhouseexportpartitionresumeafterfailure) + * 12.4 [RQ.ClickHouse.ExportPartition.PartialProgress](#rqclickhouseexportpartitionpartialprogress) + * 12.5 [RQ.ClickHouse.ExportPartition.Cleanup](#rqclickhouseexportpartitioncleanup) + * 12.6 [RQ.ClickHouse.ExportPartition.Settings.ManifestTTL](#rqclickhouseexportpartitionsettingsmanifestttl) +* 13 [Network resilience](#network-resilience) + * 13.1 [RQ.ClickHouse.ExportPartition.NetworkResilience.PacketIssues](#rqclickhouseexportpartitionnetworkresiliencepacketissues) + * 13.2 [RQ.ClickHouse.ExportPartition.NetworkResilience.DestinationInterruption](#rqclickhouseexportpartitionnetworkresiliencedestinationinterruption) + * 13.3 [RQ.ClickHouse.ExportPartition.NetworkResilience.NodeInterruption](#rqclickhouseexportpartitionnetworkresiliencenodeinterruption) +* 14 [Export operation restrictions](#export-operation-restrictions) + * 14.1 [Preventing same table exports](#preventing-same-table-exports) + * 14.1.1 [RQ.ClickHouse.ExportPartition.Restrictions.SameTable](#rqclickhouseexportpartitionrestrictionssametable) + * 14.2 [Destination table compatibility](#destination-table-compatibility) + * 14.2.1 [RQ.ClickHouse.ExportPartition.Restrictions.DestinationSupport](#rqclickhouseexportpartitionrestrictionsdestinationsupport) + * 14.3 [Local table restriction](#local-table-restriction) + * 14.3.1 [RQ.ClickHouse.ExportPartition.Restrictions.LocalTable](#rqclickhouseexportpartitionrestrictionslocaltable) + * 14.4 [Partition key compatibility](#partition-key-compatibility) + * 14.4.1 [RQ.ClickHouse.ExportPartition.Restrictions.PartitionKey](#rqclickhouseexportpartitionrestrictionspartitionkey) + * 14.5 [Source partition availability](#source-partition-availability) + * 14.5.1 [RQ.ClickHouse.ExportPartition.Restrictions.SourcePartition](#rqclickhouseexportpartitionrestrictionssourcepartition) +* 15 [Export operation concurrency](#export-operation-concurrency) + * 15.1 [RQ.ClickHouse.ExportPartition.Concurrency](#rqclickhouseexportpartitionconcurrency) +* 16 [Export operation idempotency](#export-operation-idempotency) + * 16.1 [RQ.ClickHouse.ExportPartition.Idempotency](#rqclickhouseexportpartitionidempotency) + * 16.2 
[RQ.ClickHouse.ExportPartition.Settings.ForceExport](#rqclickhouseexportpartitionsettingsforceexport) +* 17 [Export operation logging](#export-operation-logging) + * 17.1 [RQ.ClickHouse.ExportPartition.Logging](#rqclickhouseexportpartitionlogging) +* 18 [Monitoring export operations](#monitoring-export-operations) + * 18.1 [RQ.ClickHouse.ExportPartition.SystemTables.Exports](#rqclickhouseexportpartitionsystemtablesexports) +* 19 [Enabling export functionality](#enabling-export-functionality) + * 19.1 [RQ.ClickHouse.ExportPartition.Settings.AllowExperimental](#rqclickhouseexportpartitionsettingsallowexperimental) +* 20 [Handling file conflicts during export](#handling-file-conflicts-during-export) + * 20.1 [RQ.ClickHouse.ExportPartition.Settings.OverwriteFile](#rqclickhouseexportpartitionsettingsoverwritefile) +* 21 [Export operation configuration](#export-operation-configuration) + * 21.1 [RQ.ClickHouse.ExportPartition.ParallelFormatting](#rqclickhouseexportpartitionparallelformatting) +* 22 [Controlling export performance](#controlling-export-performance) + * 22.1 [RQ.ClickHouse.ExportPartition.ServerSettings.MaxBandwidth](#rqclickhouseexportpartitionserversettingsmaxbandwidth) + * 22.2 [RQ.ClickHouse.ExportPartition.ServerSettings.BackgroundMovePoolSize](#rqclickhouseexportpartitionserversettingsbackgroundmovepoolsize) + * 22.3 [RQ.ClickHouse.ExportPartition.Metrics.Export](#rqclickhouseexportpartitionmetricsexport) +* 23 [Export operation security](#export-operation-security) + * 23.1 [RQ.ClickHouse.ExportPartition.Security.RBAC](#rqclickhouseexportpartitionsecurityrbac) + * 23.2 [RQ.ClickHouse.ExportPartition.Security.DataEncryption](#rqclickhouseexportpartitionsecuritydataencryption) + * 23.3 [RQ.ClickHouse.ExportPartition.Security.Network](#rqclickhouseexportpartitionsecuritynetwork) + * 23.4 [RQ.ClickHouse.ExportPartition.Security.CredentialManagement](#rqclickhouseexportpartitionsecuritycredentialmanagement) + +## Introduction + +This specification defines requirements for exporting partitions (all parts within a partition) from ReplicatedMergeTree tables to S3-compatible object storage. This feature enables users to export entire partitions containing multiple data parts across cluster nodes. + +## Exporting Partitions to S3 + +### RQ.ClickHouse.ExportPartition.S3 +version: 1.0 + +[ClickHouse] SHALL support exporting partitions (all parts within a partition) from ReplicatedMergeTree engine tables to S3 object storage. The export operation SHALL export all parts that belong to the specified partition ID, ensuring complete partition data is transferred to the destination. 
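+
+The partition IDs available for export can be inspected in `system.parts`. A minimal sketch, assuming a hypothetical source table `events_local`:
+
+```sql
+-- List active partitions of the source table together with their part counts.
+SELECT partition_id, count() AS parts
+FROM system.parts
+WHERE table = 'events_local' AND active
+GROUP BY partition_id
+ORDER BY partition_id;
+```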
+ +### RQ.ClickHouse.ExportPartition.EmptyPartition +version: 1.0 + +[ClickHouse] SHALL support exporting from empty partitions by: +* Completing export operations successfully when the specified partition contains no parts +* Resulting in an empty destination partition when exporting from an empty source partition +* Not creating any files in destination storage when there are no parts to export in the partition +* Handling empty partitions gracefully without errors + +## SQL command support + +### RQ.ClickHouse.ExportPartition.SQLCommand +version: 1.0 + +[ClickHouse] SHALL support the following SQL command syntax for exporting partitions from ReplicatedMergeTree tables to object storage tables: + +```sql +ALTER TABLE [database.]source_table_name +EXPORT PARTITION ID 'partition_id' +TO TABLE [database.]destination_table_name +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + +**Parameters:** +- `source_table_name`: Name of the source ReplicatedMergeTree table +- `partition_id`: The partition ID to export (string literal), which identifies all parts belonging to that partition +- `destination_table_name`: Name of the destination object storage table +- `allow_experimental_export_merge_tree_part`: Setting that must be set to `1` to enable this experimental feature + +This command allows users to export entire partitions in a single operation, which is more efficient than exporting individual parts and ensures all data for a partition is exported together. + +## Supported source table engines + +### RQ.ClickHouse.ExportPartition.SourceEngines +version: 1.0 + +[ClickHouse] SHALL support exporting partitions from the following source table engines: +* `ReplicatedMergeTree` - Replicated MergeTree engine (primary use case) +* `ReplicatedSummingMergeTree` - Replicated MergeTree with automatic summation +* `ReplicatedAggregatingMergeTree` - Replicated MergeTree with pre-aggregated data +* `ReplicatedCollapsingMergeTree` - Replicated MergeTree with row versioning +* `ReplicatedVersionedCollapsingMergeTree` - Replicated CollapsingMergeTree with version tracking +* `ReplicatedGraphiteMergeTree` - Replicated MergeTree optimized for Graphite data +* All other ReplicatedMergeTree family engines + +Export partition functionality manages export operations across multiple replicas in a cluster, ensuring that parts are exported correctly and avoiding conflicts. + +## Cluster and node support + +### RQ.ClickHouse.ExportPartition.ClustersNodes +version: 1.0 + +[ClickHouse] SHALL support exporting partitions from multiple nodes in a ReplicatedMergeTree cluster to the same destination storage, ensuring that: +* Each replica in the cluster can independently export parts from the partition that it owns locally +* All parts within a partition are exported exactly once, even when distributed across multiple replicas +* Exported data from different replicas is correctly aggregated in the destination storage +* All nodes in the cluster can read the same exported partition data from the destination +* Export operations continue to make progress even if some replicas are temporarily unavailable + +In a replicated cluster, different parts of the same partition may exist on different replicas. The system must coordinate exports across all replicas to ensure complete partition export without duplication. 
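+
+Progress of such a coordinated export could be observed from any node. A hedged sketch, assuming a hypothetical cluster name `my_cluster` and the `system.replicated_partition_exports` table described in the monitoring section below:
+
+```sql
+-- Check per-replica progress of a partition export across the cluster.
+SELECT
+    hostName() AS replica,
+    partition_id,
+    status,
+    parts_processed,
+    parts_total
+FROM clusterAllReplicas('my_cluster', system.replicated_partition_exports)
+WHERE partition_id = '202501';
+```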
+ +## Supported source part storage types + +### RQ.ClickHouse.ExportPartition.SourcePartStorage +version: 1.0 + +[ClickHouse] SHALL support exporting partitions regardless of the underlying storage type where the source parts are stored, including: +* **Local Disks**: Parts stored on local filesystem +* **S3/Object Storage**: Parts stored on S3 or S3-compatible object storage +* **Encrypted Disks**: Parts stored on encrypted disks (disk-level encryption) +* **Cached Disks**: Parts stored with filesystem cache enabled +* **Remote Disks**: Parts stored on HDFS, Azure Blob Storage, or Google Cloud Storage +* **Tiered Storage**: Parts stored across multiple storage tiers (hot/cold) +* **Zero-Copy Replication Disks**: Parts stored with zero-copy replication enabled + +Users should be able to export partitions regardless of where the source data is physically stored, providing flexibility in storage configurations. + +## Storage policies and volumes + +### RQ.ClickHouse.ExportPartition.StoragePolicies +version: 1.0 + +[ClickHouse] SHALL support exporting partitions from tables using different storage policies, where storage policies are composed of volumes which are composed of disks, including: +* **JBOD Volumes**: Just a Bunch Of Disks volumes with multiple disks +* **External Volumes**: Volumes using external storage systems +* **Tiered Storage Policies**: Storage policies with multiple volumes for hot/cold data tiers +* **Custom Storage Policies**: Any storage policy configuration composed of volumes and disks +* Exporting all parts in a partition regardless of which volume or disk within the storage policy contains each part +* Maintaining data integrity when exporting from parts stored on any volume or disk in the storage policy + +Users may have partitions with parts distributed across different storage tiers or volumes, and the export should handle all parts regardless of their storage location. + +## Supported destination table engines + +### RQ.ClickHouse.ExportPartition.DestinationEngines +version: 1.0 + +[ClickHouse] SHALL support exporting to destination tables that: +* Support object storage engines including: + * `S3` - Amazon S3 and S3-compatible storage + * `StorageObjectStorage` - Generic object storage interface + * `HDFS` - Hadoop Distributed File System (with Hive partitioning) + * `Azure` - Microsoft Azure Blob Storage (with Hive partitioning) + * `GCS` - Google Cloud Storage (with Hive partitioning) + +Export partition is designed to move data from local or replicated storage to object storage systems for long-term storage, analytics, or data sharing purposes. + +## Schema compatibility + +### RQ.ClickHouse.ExportPartition.SchemaCompatibility +version: 1.0 + +[ClickHouse] SHALL require source and destination tables to have compatible schemas for successful export operations: +* Identical physical column schemas between source and destination +* The same partition key expression in both tables +* Compatible data types for all columns +* Matching column order and names + +Schema compatibility ensures that exported data can be correctly read from the destination table without data loss or corruption. 
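+
+A minimal sketch of a compatible source/destination pair, assuming hypothetical table names, URL, and credentials, and assuming the destination `S3` table accepts named `format` and `partition_strategy = 'hive'` arguments as referenced in the destination restrictions below:
+
+```sql
+-- Source: ReplicatedMergeTree table partitioned by an integer key.
+CREATE TABLE events_local
+(
+    p UInt32,
+    id UInt64,
+    value String
+)
+ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/events_local', '{replica}')
+PARTITION BY p
+ORDER BY id;
+
+-- Destination: S3-backed table with identical columns and the same partition key.
+CREATE TABLE events_s3
+(
+    p UInt32,
+    id UInt64,
+    value String
+)
+ENGINE = S3('https://bucket.s3.amazonaws.com/events/', 'access_key', 'secret_key',
+            format = 'Parquet', partition_strategy = 'hive')
+PARTITION BY p;
+```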
+ +## Partition key types support + +### RQ.ClickHouse.ExportPartition.PartitionKeyTypes +version: 1.0 + +[ClickHouse] SHALL support export operations for tables with partition key types that are compatible with Hive partitioning, as shown in the following table: + +| Partition Key Type | Supported | Examples | Notes | +|-------------------------|-----------|--------------------------------------------------------------------------|--------------------------------| +| **Integer Types** | ✅ Yes | `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64` | All integer types supported | +| **Date/DateTime Types** | ✅ Yes | `Date`, `Date32`, `DateTime`, `DateTime64` | All date/time types supported | +| **String Types** | ✅ Yes | `String`, `FixedString` | All string types supported | +| **No Partition Key** | ✅ Yes | Tables without `PARTITION BY` clause | Unpartitioned tables supported | + +[ClickHouse] SHALL automatically extract partition values from source parts and use them to create proper Hive partitioning structure in destination storage, but only for partition key types that are compatible with Hive partitioning requirements. + +[ClickHouse] SHALL require destination tables to support Hive partitioning, which limits the supported partition key types to Integer, Date/DateTime, and String types. Complex expressions that result in unsupported types are not supported for export operations. + +Hive partitioning is a standard way to organize data in object storage systems, making exported data compatible with various analytics tools and systems. + +## Partition content support + +### RQ.ClickHouse.ExportPartition.PartitionContent +version: 1.0 + +[ClickHouse] SHALL support export operations for partitions containing all valid MergeTree part types and their contents, including: + +| Part Type | Supported | Description | Special Features | +|-------------------|-----------|--------------------------------------------------------------|--------------------------------| +| **Wide Parts** | ✅ Yes | Data of each column stored in separate files with marks | Standard format for most parts | +| **Compact Parts** | ✅ Yes | All column data stored in single file with single marks file | Optimized for small parts | + +[ClickHouse] SHALL export all parts within the specified partition, regardless of their type. The system SHALL automatically apply lightweight delete masks during export to ensure only non-deleted rows are exported, and SHALL maintain data integrity in the destination storage. + +Partitions may contain a mix of different part types, and the export must handle all of them correctly to ensure complete partition export. + +### RQ.ClickHouse.ExportPartition.SchemaChangeIsolation +version: 1.0 + +[ClickHouse] SHALL ensure exported partition data is isolated from subsequent schema changes by: +* Preserving exported data exactly as it was at the time of export +* Not being affected by schema changes (column drops, renames, type changes) that occur after export +* Maintaining data integrity in destination storage regardless of mutations applied to the source table after export +* Ensuring exported data reflects the source table state at the time of export, not the current state + +Once a partition is exported, the exported data should remain stable and not be affected by future changes to the source table schema. 
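+
+For example, a sketch of the intended isolation (hypothetical names; the exact outcome is defined by the requirement above):
+
+```sql
+-- Export a partition, then change the source schema afterwards.
+ALTER TABLE events_local
+EXPORT PARTITION ID '202501'
+TO TABLE events_s3
+SETTINGS allow_experimental_export_merge_tree_part = 1;
+
+ALTER TABLE events_local DROP COLUMN value;
+
+-- The exported data in the destination still reflects the schema and rows
+-- as they were at export time.
+SELECT count() FROM events_s3 WHERE p = 202501;
+```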
+ +### RQ.ClickHouse.ExportPartition.LargePartitions +version: 1.0 + +[ClickHouse] SHALL support exporting large partitions by: +* Handling partitions with large numbers of parts (e.g., hundreds or thousands of parts) +* Processing partitions with large numbers of rows (e.g., billions of rows) +* Processing large data volumes efficiently during export +* Maintaining data integrity when exporting large partitions +* Completing export operations successfully regardless of partition size +* Allowing export operations to continue over extended periods of time for very large partitions + +Production systems often have partitions containing very large amounts of data, and the export must handle these efficiently without timeouts or memory issues. + +## Export operation failure handling + +### RQ.ClickHouse.ExportPartition.RetryMechanism +version: 1.0 + +[ClickHouse] SHALL automatically retry failed part exports within a partition up to a configurable maximum retry count. If all retry attempts are exhausted for a part, the entire partition export operation SHALL be marked as failed. + +Unlike single-part exports, partition exports involve multiple parts and may take significant time. Retry mechanisms ensure that temporary failures don't require restarting the entire export operation. + +### RQ.ClickHouse.ExportPartition.Settings.MaxRetries +version: 1.0 + +[ClickHouse] SHALL support the `export_merge_tree_partition_max_retries` setting that controls the maximum number of retries for exporting a merge tree part in an export partition task. The default value SHALL be `3`. + +This setting allows users to control how many times the system will retry exporting a part before marking it as failed. + +### RQ.ClickHouse.ExportPartition.ResumeAfterFailure +version: 1.0 + +[ClickHouse] SHALL allow export operations to resume after node failures or restarts. The system SHALL track which parts have been successfully exported and SHALL not re-export parts that were already successfully exported. + +### RQ.ClickHouse.ExportPartition.PartialProgress +version: 1.0 + +[ClickHouse] SHALL allow export operations to make partial progress, with successfully exported parts remaining in the destination even if other parts fail. Users SHALL be able to see which parts have been successfully exported and which parts have failed. + +### RQ.ClickHouse.ExportPartition.Cleanup +version: 1.0 + +[ClickHouse] SHALL automatically clean up failed or completed export operations after a configurable TTL period. + +### RQ.ClickHouse.ExportPartition.Settings.ManifestTTL +version: 1.0 + +[ClickHouse] SHALL support the `export_merge_tree_partition_manifest_ttl` setting that determines how long the export manifest will be retained. This setting prevents the same partition from being exported twice to the same destination within the TTL period. The default value SHALL be `180` seconds. + +This setting only affects completed export operations and does not delete in-progress tasks. It allows users to control how long export history is maintained to prevent duplicate exports. 
+ +## Network resilience + +### RQ.ClickHouse.ExportPartition.NetworkResilience.PacketIssues +version: 1.0 + +[ClickHouse] SHALL handle network packet issues during export operations by: +* Tolerating packet delay without data corruption or loss +* Handling packet loss and retransmitting data as needed +* Detecting and handling packet corruption to ensure data integrity +* Managing packet duplication without data duplication in destination +* Handling packet reordering to maintain correct data sequence +* Operating correctly under packet rate limiting constraints +* Completing exports successfully despite network impairments + +Network issues are common in distributed systems, and export operations must be resilient to ensure data integrity. + +### RQ.ClickHouse.ExportPartition.NetworkResilience.DestinationInterruption +version: 1.0 + +[ClickHouse] SHALL handle destination storage interruptions during export operations by: +* Detecting when destination storage becomes unavailable during export +* Retrying failed part exports when destination storage becomes available again +* Logging failed exports in the `system.events` table with appropriate counters +* Not leaving partial or corrupted data in destination storage when exports fail due to destination unavailability +* Allowing exports to complete successfully once destination storage becomes available again +* Resuming export operations from the last successfully exported part + +Destination storage systems may experience temporary outages, and the export should automatically recover when service is restored. + +### RQ.ClickHouse.ExportPartition.NetworkResilience.NodeInterruption +version: 1.0 + +[ClickHouse] SHALL handle ClickHouse node interruptions during export operations by: +* Allowing export operations to resume after node restart without data loss or duplication +* Allowing other replicas to continue or complete export operations if a node fails +* Not leaving partial or corrupted data in destination storage when node restarts occur +* With safe shutdown, ensuring exports complete successfully before node shutdown when possible +* With unsafe shutdown, allowing export operations to resume from the last checkpoint after node restart +* Maintaining data integrity in destination storage regardless of node interruption type +* Ensuring that parts already exported are not re-exported after node restart + +Node failures are common in distributed systems, and export operations must be able to recover and continue without data loss or duplication. + +## Export operation restrictions + +### Preventing same table exports + +#### RQ.ClickHouse.ExportPartition.Restrictions.SameTable +version: 1.0 + +[ClickHouse] SHALL prevent exporting partitions to the same table as the source by: +* Validating that source and destination table identifiers are different +* Throwing a `BAD_ARGUMENTS` exception with message "Exporting to the same table is not allowed" when source and destination are identical +* Performing this validation before any export processing begins + +Exporting to the same table would be redundant and could cause data duplication or conflicts. 
+ +### Destination table compatibility + +#### RQ.ClickHouse.ExportPartition.Restrictions.DestinationSupport +version: 1.0 + +[ClickHouse] SHALL validate destination table compatibility by: + +* Checking that the destination storage supports importing MergeTree parts +* Verifying that the destination uses Hive partitioning strategy (`partition_strategy = 'hive'`) +* Throwing a `NOT_IMPLEMENTED` exception with message "Destination storage {} does not support MergeTree parts or uses unsupported partitioning" when requirements are not met +* Performing this validation during the initial export setup phase + +The destination must support the format and partitioning strategy required for exported data. + +### Local table restriction + +#### RQ.ClickHouse.ExportPartition.Restrictions.LocalTable +version: 1.0 + +[ClickHouse] SHALL prevent exporting partitions to local MergeTree tables by: +* Rejecting export operations where the destination table uses a MergeTree engine +* Throwing a `NOT_IMPLEMENTED` exception (error code 48) with message "Destination storage MergeTree does not support MergeTree parts or uses unsupported partitioning" when attempting to export to a local table +* Performing this validation during the initial export setup phase + +Export partition is designed to move data to object storage, not to local MergeTree tables. + +### Partition key compatibility + +#### RQ.ClickHouse.ExportPartition.Restrictions.PartitionKey +version: 1.0 + +[ClickHouse] SHALL validate that source and destination tables have the same partition key expression by: +* Checking that the partition key expression matches between source and destination tables +* Throwing a `BAD_ARGUMENTS` exception (error code 36) with message "Tables have different partition key" when partition keys differ +* Performing this validation during the initial export setup phase + +Matching partition keys ensure that exported data is organized correctly in the destination storage. + +### Source partition availability + +#### RQ.ClickHouse.ExportPartition.Restrictions.SourcePartition +version: 1.0 + +[ClickHouse] SHALL validate source partition availability by: +* Checking that the specified partition ID exists in the source table +* Verifying that the partition contains at least one active part (not detached or missing) +* Throwing an exception with an appropriate error message when the partition is not found or is empty +* Performing this validation before any export processing begins + +The system must verify that the partition exists and contains data before attempting to export it. + +## Export operation concurrency + +### RQ.ClickHouse.ExportPartition.Concurrency +version: 1.0 + +[ClickHouse] SHALL support concurrent export operations by: +* Allowing multiple partition exports to run simultaneously without interference +* Supporting concurrent exports of different partitions to different destinations +* Preventing concurrent exports of the same partition to the same destination +* Allowing different replicas to export different parts of the same partition concurrently +* Maintaining separate progress tracking for each concurrent operation + +Multiple users may want to export different partitions simultaneously, and the system must coordinate these operations to prevent conflicts while maximizing parallelism. 
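+
+For example, two different partitions can be exported at the same time from separate sessions to different destinations, as in the following sketch (table names and partition IDs are illustrative):
+
+```sql
+-- Session 1
+ALTER TABLE source_table
+EXPORT PARTITION ID '2020'
+TO TABLE destination_table
+SETTINGS allow_experimental_export_merge_tree_part = 1;
+
+-- Session 2, running concurrently
+ALTER TABLE source_table
+EXPORT PARTITION ID '2021'
+TO TABLE other_destination_table
+SETTINGS allow_experimental_export_merge_tree_part = 1;
+```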
+ +## Export operation idempotency + +### RQ.ClickHouse.ExportPartition.Idempotency +version: 1.0 + +[ClickHouse] SHALL handle duplicate export operations by: +* Preventing duplicate data from being exported when the same partition is exported multiple times to the same destination +* Detecting when a partition export is already in progress or completed +* Detecting when an export operation attempts to export a partition that already exists in the destination +* Logging duplicate export attempts in the `system.events` table with appropriate counters +* Ensuring that destination data matches source data without duplication when the same partition is exported multiple times +* Allowing users to force re-export of a partition if needed (e.g., after TTL expiration or manual cleanup) + +Users may accidentally trigger the same export multiple times, and the system should prevent duplicate data while allowing legitimate re-exports when needed. + +### RQ.ClickHouse.ExportPartition.Settings.ForceExport +version: 1.0 + +[ClickHouse] SHALL support the `export_merge_tree_partition_force_export` setting that allows users to ignore existing partition export entries and force a new export operation. The default value SHALL be `false` (turned off). + +When set to `true`, this setting allows users to overwrite existing export entries and force re-export of a partition, even if a previous export operation exists for the same partition and destination. + +## Export operation logging + +### RQ.ClickHouse.ExportPartition.Logging +version: 1.0 + +[ClickHouse] SHALL provide detailed logging for export operations by: +* Logging all export operations (both successful and failed) with timestamps and details +* Recording the specific partition ID in the `system.part_log` table for all operations +* Logging export events in the `system.events` table, including: + * `PartsExports` - Number of successful part exports (within partitions) + * `PartsExportFailures` - Number of failed part exports + * `PartsExportDuplicated` - Number of part exports that failed because target already exists +* Writing operation information to the `system.part_log` table with `event_type` set to `EXPORT_PARTITION` +* Providing sufficient detail for monitoring and troubleshooting export operations +* Logging per-part export status within partition exports + +Detailed logging helps users monitor export progress, troubleshoot issues, and audit export operations. + +## Monitoring export operations + +### RQ.ClickHouse.ExportPartition.SystemTables.Exports +version: 1.0 + +[ClickHouse] SHALL provide a `system.replicated_partition_exports` table that allows users to monitor active partition export operations with at least the following columns: +* `source_table` - source table identifier +* `destination_table` - destination table identifier +* `partition_id` - the partition ID being exported +* `status` - current status of the export operation (e.g., PENDING, IN_PROGRESS, COMPLETED, FAILED) +* `parts_total` - total number of parts in the partition +* `parts_processed` - number of parts successfully exported +* `parts_failed` - number of parts that failed to export +* `create_time` - when the export operation was created +* `update_time` - last update time of the export operation + +The table SHALL track export operations before they complete and SHALL show completed or failed exports until they are cleaned up (based on TTL). 
+ +Users need visibility into export operations to monitor progress, identify issues, and understand export status across the cluster. + +## Enabling export functionality + +### RQ.ClickHouse.ExportPartition.Settings.AllowExperimental +version: 1.0 + +[ClickHouse] SHALL support the `allow_experimental_export_merge_tree_part` setting that SHALL gate the experimental export partition functionality, which SHALL be set to `1` to enable `ALTER TABLE ... EXPORT PARTITION ID ...` commands. The default value SHALL be `0` (turned off). + +This setting allows administrators to control access to experimental functionality and ensures users are aware they are using a feature that may change. + +## Handling file conflicts during export + +### RQ.ClickHouse.ExportPartition.Settings.OverwriteFile +version: 1.0 + +[ClickHouse] SHALL support the `export_merge_tree_part_overwrite_file_if_exists` setting that controls whether to overwrite files if they already exist when exporting a partition. The default value SHALL be `0` (turned off), meaning exports will fail if files already exist in the destination. + +This setting allows users to control whether to overwrite existing data in the destination, providing safety by default while allowing overwrites when needed. + +## Export operation configuration + +### RQ.ClickHouse.ExportPartition.ParallelFormatting +version: 1.0 + +[ClickHouse] SHALL support parallel formatting for export operations by: +* Automatically enabling parallel formatting for large export operations to improve performance +* Using the `output_format_parallel_formatting` setting to control parallel formatting behavior +* Optimizing data processing based on export size and system resources +* Providing consistent formatting performance across different export scenarios +* Allowing parallel processing of multiple parts within a partition when possible + +Parallel formatting improves export performance, especially for large partitions with many parts. + +## Controlling export performance + +### RQ.ClickHouse.ExportPartition.ServerSettings.MaxBandwidth +version: 1.0 + +[ClickHouse] SHALL support the `max_exports_bandwidth_for_server` server setting to limit the maximum read speed of all exports on the server in bytes per second, with `0` meaning unlimited bandwidth. The default value SHALL be `0`. This is a server-level setting configured in the server configuration file. + +Administrators need to control export bandwidth to avoid impacting other operations on the server. + +### RQ.ClickHouse.ExportPartition.ServerSettings.BackgroundMovePoolSize +version: 1.0 + +[ClickHouse] SHALL support the `background_move_pool_size` server setting to control the maximum number of threads that will be used for executing export operations in the background. The default value SHALL be `8`. This is a server-level setting configured in the server configuration file. + +This setting allows administrators to balance export performance with other system operations. + +### RQ.ClickHouse.ExportPartition.Metrics.Export +version: 1.0 + +[ClickHouse] SHALL provide the `Export` current metric in `system.metrics` table that tracks the number of currently executing partition exports. + +This metric helps monitor system load from export operations. + +## Export operation security + +### RQ.ClickHouse.ExportPartition.Security.RBAC +version: 1.0 + +[ClickHouse] SHALL enforce role-based access control (RBAC) for export operations. 
Users must have the following privileges to perform export operations: +* **Source Table**: `SELECT` privilege on the source table to read data parts +* **Destination Table**: `INSERT` privilege on the destination table to write exported data +* **Database Access**: `SHOW` privilege on both source and destination databases +* **System Tables**: `SELECT` privilege on `system.tables` and `system.replicated_partition_exports` to validate table existence and monitor exports + +Export operations move potentially sensitive data, and proper access controls ensure only authorized users can export partitions. + +### RQ.ClickHouse.ExportPartition.Security.DataEncryption +version: 1.0 + +[ClickHouse] SHALL encrypt all data in transit to destination storage using TLS/SSL during export operations. + +Data encryption protects sensitive information from being intercepted or accessed during transmission to destination storage. + +### RQ.ClickHouse.ExportPartition.Security.Network +version: 1.0 + +[ClickHouse] SHALL use secure connections to destination storage during export operations. For S3-compatible storage, connections must use HTTPS. For other storage types, secure protocols appropriate to the storage system must be used. + +Secure network connections prevent unauthorized access and ensure data integrity during export operations. + +### RQ.ClickHouse.ExportPartition.Security.CredentialManagement +version: 1.0 + +[ClickHouse] SHALL use secure credential storage for export operations and SHALL avoid exposing credentials in logs or error messages. + +Proper credential management prevents unauthorized access to destination storage systems and protects sensitive authentication information. + + +[ClickHouse]: https://clickhouse.com +""", +) From 105fb4f82cb21b93dafbfeebea095499b6e9eaef Mon Sep 17 00:00:00 2001 From: selfeer Date: Mon, 10 Nov 2025 16:53:45 +0400 Subject: [PATCH 92/99] update requirements --- s3/requirements/export_partition.md | 305 +++++++++-- s3/requirements/export_partition.py | 766 ++++++++++++++++++++++------ 2 files changed, 875 insertions(+), 196 deletions(-) diff --git a/s3/requirements/export_partition.md b/s3/requirements/export_partition.md index a671fcc73..0c2c2f64b 100644 --- a/s3/requirements/export_partition.md +++ b/s3/requirements/export_partition.md @@ -9,70 +9,78 @@ * 2.2 [RQ.ClickHouse.ExportPartition.EmptyPartition](#rqclickhouseexportpartitionemptypartition) * 3 [SQL command support](#sql-command-support) * 3.1 [RQ.ClickHouse.ExportPartition.SQLCommand](#rqclickhouseexportpartitionsqlcommand) + * 3.2 [RQ.ClickHouse.ExportPartition.IntoOutfile](#rqclickhouseexportpartitionintooutfile) + * 3.3 [RQ.ClickHouse.ExportPartition.Format](#rqclickhouseexportpartitionformat) + * 3.4 [RQ.ClickHouse.ExportPartition.SettingsClause](#rqclickhouseexportpartitionsettingsclause) * 4 [Supported source table engines](#supported-source-table-engines) * 4.1 [RQ.ClickHouse.ExportPartition.SourceEngines](#rqclickhouseexportpartitionsourceengines) * 5 [Cluster and node support](#cluster-and-node-support) * 5.1 [RQ.ClickHouse.ExportPartition.ClustersNodes](#rqclickhouseexportpartitionclustersnodes) + * 5.2 [RQ.ClickHouse.ExportPartition.Shards](#rqclickhouseexportpartitionshards) + * 5.3 [RQ.ClickHouse.ExportPartition.Versions](#rqclickhouseexportpartitionversions) * 6 [Supported source part storage types](#supported-source-part-storage-types) * 6.1 [RQ.ClickHouse.ExportPartition.SourcePartStorage](#rqclickhouseexportpartitionsourcepartstorage) * 7 [Storage policies and 
volumes](#storage-policies-and-volumes) * 7.1 [RQ.ClickHouse.ExportPartition.StoragePolicies](#rqclickhouseexportpartitionstoragepolicies) * 8 [Supported destination table engines](#supported-destination-table-engines) * 8.1 [RQ.ClickHouse.ExportPartition.DestinationEngines](#rqclickhouseexportpartitiondestinationengines) -* 9 [Schema compatibility](#schema-compatibility) - * 9.1 [RQ.ClickHouse.ExportPartition.SchemaCompatibility](#rqclickhouseexportpartitionschemacompatibility) -* 10 [Partition key types support](#partition-key-types-support) - * 10.1 [RQ.ClickHouse.ExportPartition.PartitionKeyTypes](#rqclickhouseexportpartitionpartitionkeytypes) -* 11 [Partition content support](#partition-content-support) - * 11.1 [RQ.ClickHouse.ExportPartition.PartitionContent](#rqclickhouseexportpartitionpartitioncontent) - * 11.2 [RQ.ClickHouse.ExportPartition.SchemaChangeIsolation](#rqclickhouseexportpartitionschemachangeisolation) - * 11.3 [RQ.ClickHouse.ExportPartition.LargePartitions](#rqclickhouseexportpartitionlargepartitions) -* 12 [Export operation failure handling](#export-operation-failure-handling) - * 12.1 [RQ.ClickHouse.ExportPartition.RetryMechanism](#rqclickhouseexportpartitionretrymechanism) - * 12.2 [RQ.ClickHouse.ExportPartition.Settings.MaxRetries](#rqclickhouseexportpartitionsettingsmaxretries) - * 12.3 [RQ.ClickHouse.ExportPartition.ResumeAfterFailure](#rqclickhouseexportpartitionresumeafterfailure) - * 12.4 [RQ.ClickHouse.ExportPartition.PartialProgress](#rqclickhouseexportpartitionpartialprogress) - * 12.5 [RQ.ClickHouse.ExportPartition.Cleanup](#rqclickhouseexportpartitioncleanup) - * 12.6 [RQ.ClickHouse.ExportPartition.Settings.ManifestTTL](#rqclickhouseexportpartitionsettingsmanifestttl) -* 13 [Network resilience](#network-resilience) - * 13.1 [RQ.ClickHouse.ExportPartition.NetworkResilience.PacketIssues](#rqclickhouseexportpartitionnetworkresiliencepacketissues) - * 13.2 [RQ.ClickHouse.ExportPartition.NetworkResilience.DestinationInterruption](#rqclickhouseexportpartitionnetworkresiliencedestinationinterruption) - * 13.3 [RQ.ClickHouse.ExportPartition.NetworkResilience.NodeInterruption](#rqclickhouseexportpartitionnetworkresiliencenodeinterruption) -* 14 [Export operation restrictions](#export-operation-restrictions) - * 14.1 [Preventing same table exports](#preventing-same-table-exports) - * 14.1.1 [RQ.ClickHouse.ExportPartition.Restrictions.SameTable](#rqclickhouseexportpartitionrestrictionssametable) - * 14.2 [Destination table compatibility](#destination-table-compatibility) - * 14.2.1 [RQ.ClickHouse.ExportPartition.Restrictions.DestinationSupport](#rqclickhouseexportpartitionrestrictionsdestinationsupport) - * 14.3 [Local table restriction](#local-table-restriction) - * 14.3.1 [RQ.ClickHouse.ExportPartition.Restrictions.LocalTable](#rqclickhouseexportpartitionrestrictionslocaltable) - * 14.4 [Partition key compatibility](#partition-key-compatibility) - * 14.4.1 [RQ.ClickHouse.ExportPartition.Restrictions.PartitionKey](#rqclickhouseexportpartitionrestrictionspartitionkey) - * 14.5 [Source partition availability](#source-partition-availability) - * 14.5.1 [RQ.ClickHouse.ExportPartition.Restrictions.SourcePartition](#rqclickhouseexportpartitionrestrictionssourcepartition) -* 15 [Export operation concurrency](#export-operation-concurrency) - * 15.1 [RQ.ClickHouse.ExportPartition.Concurrency](#rqclickhouseexportpartitionconcurrency) -* 16 [Export operation idempotency](#export-operation-idempotency) - * 16.1 
[RQ.ClickHouse.ExportPartition.Idempotency](#rqclickhouseexportpartitionidempotency) - * 16.2 [RQ.ClickHouse.ExportPartition.Settings.ForceExport](#rqclickhouseexportpartitionsettingsforceexport) -* 17 [Export operation logging](#export-operation-logging) - * 17.1 [RQ.ClickHouse.ExportPartition.Logging](#rqclickhouseexportpartitionlogging) -* 18 [Monitoring export operations](#monitoring-export-operations) - * 18.1 [RQ.ClickHouse.ExportPartition.SystemTables.Exports](#rqclickhouseexportpartitionsystemtablesexports) -* 19 [Enabling export functionality](#enabling-export-functionality) - * 19.1 [RQ.ClickHouse.ExportPartition.Settings.AllowExperimental](#rqclickhouseexportpartitionsettingsallowexperimental) -* 20 [Handling file conflicts during export](#handling-file-conflicts-during-export) - * 20.1 [RQ.ClickHouse.ExportPartition.Settings.OverwriteFile](#rqclickhouseexportpartitionsettingsoverwritefile) -* 21 [Export operation configuration](#export-operation-configuration) - * 21.1 [RQ.ClickHouse.ExportPartition.ParallelFormatting](#rqclickhouseexportpartitionparallelformatting) -* 22 [Controlling export performance](#controlling-export-performance) - * 22.1 [RQ.ClickHouse.ExportPartition.ServerSettings.MaxBandwidth](#rqclickhouseexportpartitionserversettingsmaxbandwidth) - * 22.2 [RQ.ClickHouse.ExportPartition.ServerSettings.BackgroundMovePoolSize](#rqclickhouseexportpartitionserversettingsbackgroundmovepoolsize) - * 22.3 [RQ.ClickHouse.ExportPartition.Metrics.Export](#rqclickhouseexportpartitionmetricsexport) -* 23 [Export operation security](#export-operation-security) - * 23.1 [RQ.ClickHouse.ExportPartition.Security.RBAC](#rqclickhouseexportpartitionsecurityrbac) - * 23.2 [RQ.ClickHouse.ExportPartition.Security.DataEncryption](#rqclickhouseexportpartitionsecuritydataencryption) - * 23.3 [RQ.ClickHouse.ExportPartition.Security.Network](#rqclickhouseexportpartitionsecuritynetwork) - * 23.4 [RQ.ClickHouse.ExportPartition.Security.CredentialManagement](#rqclickhouseexportpartitionsecuritycredentialmanagement) +* 9 [Temporary tables](#temporary-tables) + * 9.1 [RQ.ClickHouse.ExportPartition.TemporaryTable](#rqclickhouseexportpartitiontemporarytable) +* 10 [Schema compatibility](#schema-compatibility) + * 10.1 [RQ.ClickHouse.ExportPartition.SchemaCompatibility](#rqclickhouseexportpartitionschemacompatibility) +* 11 [Partition key types support](#partition-key-types-support) + * 11.1 [RQ.ClickHouse.ExportPartition.PartitionKeyTypes](#rqclickhouseexportpartitionpartitionkeytypes) +* 12 [Partition content support](#partition-content-support) + * 12.1 [RQ.ClickHouse.ExportPartition.PartitionContent](#rqclickhouseexportpartitionpartitioncontent) + * 12.2 [RQ.ClickHouse.ExportPartition.SchemaChangeIsolation](#rqclickhouseexportpartitionschemachangeisolation) + * 12.3 [RQ.ClickHouse.ExportPartition.LargePartitions](#rqclickhouseexportpartitionlargepartitions) + * 12.4 [RQ.ClickHouse.ExportPartition.Corrupted](#rqclickhouseexportpartitioncorrupted) +* 13 [Export operation failure handling](#export-operation-failure-handling) + * 13.1 [RQ.ClickHouse.ExportPartition.RetryMechanism](#rqclickhouseexportpartitionretrymechanism) + * 13.2 [RQ.ClickHouse.ExportPartition.Settings.MaxRetries](#rqclickhouseexportpartitionsettingsmaxretries) + * 13.3 [RQ.ClickHouse.ExportPartition.ResumeAfterFailure](#rqclickhouseexportpartitionresumeafterfailure) + * 13.4 [RQ.ClickHouse.ExportPartition.PartialProgress](#rqclickhouseexportpartitionpartialprogress) + * 13.5 
[RQ.ClickHouse.ExportPartition.Cleanup](#rqclickhouseexportpartitioncleanup) + * 13.6 [RQ.ClickHouse.ExportPartition.Settings.ManifestTTL](#rqclickhouseexportpartitionsettingsmanifestttl) +* 14 [Network resilience](#network-resilience) + * 14.1 [RQ.ClickHouse.ExportPartition.NetworkResilience.PacketIssues](#rqclickhouseexportpartitionnetworkresiliencepacketissues) + * 14.2 [RQ.ClickHouse.ExportPartition.NetworkResilience.DestinationInterruption](#rqclickhouseexportpartitionnetworkresiliencedestinationinterruption) + * 14.3 [RQ.ClickHouse.ExportPartition.NetworkResilience.NodeInterruption](#rqclickhouseexportpartitionnetworkresiliencenodeinterruption) +* 15 [Export operation restrictions](#export-operation-restrictions) + * 15.1 [Preventing same table exports](#preventing-same-table-exports) + * 15.1.1 [RQ.ClickHouse.ExportPartition.Restrictions.SameTable](#rqclickhouseexportpartitionrestrictionssametable) + * 15.2 [Destination table compatibility](#destination-table-compatibility) + * 15.2.1 [RQ.ClickHouse.ExportPartition.Restrictions.DestinationSupport](#rqclickhouseexportpartitionrestrictionsdestinationsupport) + * 15.3 [Local table restriction](#local-table-restriction) + * 15.3.1 [RQ.ClickHouse.ExportPartition.Restrictions.LocalTable](#rqclickhouseexportpartitionrestrictionslocaltable) + * 15.4 [Partition key compatibility](#partition-key-compatibility) + * 15.4.1 [RQ.ClickHouse.ExportPartition.Restrictions.PartitionKey](#rqclickhouseexportpartitionrestrictionspartitionkey) + * 15.5 [Source partition availability](#source-partition-availability) + * 15.5.1 [RQ.ClickHouse.ExportPartition.Restrictions.SourcePartition](#rqclickhouseexportpartitionrestrictionssourcepartition) +* 16 [Export operation concurrency](#export-operation-concurrency) + * 16.1 [RQ.ClickHouse.ExportPartition.Concurrency](#rqclickhouseexportpartitionconcurrency) +* 17 [Export operation idempotency](#export-operation-idempotency) + * 17.1 [RQ.ClickHouse.ExportPartition.Idempotency](#rqclickhouseexportpartitionidempotency) + * 17.2 [RQ.ClickHouse.ExportPartition.Settings.ForceExport](#rqclickhouseexportpartitionsettingsforceexport) +* 18 [Export operation logging](#export-operation-logging) + * 18.1 [RQ.ClickHouse.ExportPartition.Logging](#rqclickhouseexportpartitionlogging) +* 19 [Monitoring export operations](#monitoring-export-operations) + * 19.1 [RQ.ClickHouse.ExportPartition.SystemTables.Exports](#rqclickhouseexportpartitionsystemtablesexports) +* 20 [Enabling export functionality](#enabling-export-functionality) + * 20.1 [RQ.ClickHouse.ExportPartition.Settings.AllowExperimental](#rqclickhouseexportpartitionsettingsallowexperimental) +* 21 [Handling file conflicts during export](#handling-file-conflicts-during-export) + * 21.1 [RQ.ClickHouse.ExportPartition.Settings.OverwriteFile](#rqclickhouseexportpartitionsettingsoverwritefile) +* 22 [Export operation configuration](#export-operation-configuration) + * 22.1 [RQ.ClickHouse.ExportPartition.ParallelFormatting](#rqclickhouseexportpartitionparallelformatting) +* 23 [Controlling export performance](#controlling-export-performance) + * 23.1 [RQ.ClickHouse.ExportPartition.ServerSettings.MaxBandwidth](#rqclickhouseexportpartitionserversettingsmaxbandwidth) + * 23.2 [RQ.ClickHouse.ExportPartition.ServerSettings.BackgroundMovePoolSize](#rqclickhouseexportpartitionserversettingsbackgroundmovepoolsize) + * 23.3 [RQ.ClickHouse.ExportPartition.Metrics.Export](#rqclickhouseexportpartitionmetricsexport) +* 24 [Export operation security](#export-operation-security) + * 24.1 
[RQ.ClickHouse.ExportPartition.Security.RBAC](#rqclickhouseexportpartitionsecurityrbac) + * 24.2 [RQ.ClickHouse.ExportPartition.Security.DataEncryption](#rqclickhouseexportpartitionsecuritydataencryption) + * 24.3 [RQ.ClickHouse.ExportPartition.Security.Network](#rqclickhouseexportpartitionsecuritynetwork) + * 24.4 [RQ.ClickHouse.ExportPartition.Security.CredentialManagement](#rqclickhouseexportpartitionsecuritycredentialmanagement) ## Introduction @@ -116,6 +124,51 @@ SETTINGS allow_experimental_export_merge_tree_part = 1 This command allows users to export entire partitions in a single operation, which is more efficient than exporting individual parts and ensures all data for a partition is exported together. +### RQ.ClickHouse.ExportPartition.IntoOutfile +version: 1.0 + +[ClickHouse] SHALL support the usage of the `INTO OUTFILE` clause with `EXPORT PARTITION` and SHALL not output any errors. + +For example, + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +INTO OUTFILE '/path/to/file' +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + +### RQ.ClickHouse.ExportPartition.Format +version: 1.0 + +[ClickHouse] SHALL support the usage of the `FORMAT` clause with `EXPORT PARTITION` and SHALL not output any errors. + +For example, + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +FORMAT JSON +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + +### RQ.ClickHouse.ExportPartition.SettingsClause +version: 1.0 + +[ClickHouse] SHALL support the usage of the `SETTINGS` clause with `EXPORT PARTITION` and SHALL not output any errors. + +For example, + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1, + export_merge_tree_partition_max_retries = 5 +``` + ## Supported source table engines ### RQ.ClickHouse.ExportPartition.SourceEngines @@ -146,6 +199,18 @@ version: 1.0 In a replicated cluster, different parts of the same partition may exist on different replicas. The system must coordinate exports across all replicas to ensure complete partition export without duplication. +### RQ.ClickHouse.ExportPartition.Shards +version: 1.0 + +[ClickHouse] SHALL support exporting partitions from source tables that are on different shards than the destination table. + +### RQ.ClickHouse.ExportPartition.Versions +version: 1.0 + +[ClickHouse] SHALL support exporting partitions from source tables that are stored on servers with different ClickHouse versions than the destination server. + +Users can export partitions from tables on servers with older ClickHouse versions to tables on servers with newer versions, enabling data migration and version upgrades. + ## Supported source part storage types ### RQ.ClickHouse.ExportPartition.SourcePartStorage @@ -192,6 +257,28 @@ version: 1.0 Export partition is designed to move data from local or replicated storage to object storage systems for long-term storage, analytics, or data sharing purposes. +## Temporary tables + +### RQ.ClickHouse.ExportPartition.TemporaryTable +version: 1.0 + +[ClickHouse] SHALL support exporting partitions from temporary ReplicatedMergeTree tables to destination object storage tables. 
+ +For example, + +```sql +CREATE TEMPORARY TABLE temp_table (p UInt64, k String, d UInt64) +ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/temp_table', '{replica}') +PARTITION BY p ORDER BY k; + +INSERT INTO temp_table VALUES (2020, 'key1', 100), (2020, 'key2', 200); + +ALTER TABLE temp_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + ## Schema compatibility ### RQ.ClickHouse.ExportPartition.SchemaCompatibility @@ -265,6 +352,13 @@ version: 1.0 Production systems often have partitions containing very large amounts of data, and the export must handle these efficiently without timeouts or memory issues. +### RQ.ClickHouse.ExportPartition.Corrupted +version: 1.0 + +[ClickHouse] SHALL output an error and prevent export operations from proceeding when trying to export a partition that contains corrupted parts in the source table. + +The system SHALL detect corruption in partitions containing compact parts, wide parts, or mixed part types. + ## Export operation failure handling ### RQ.ClickHouse.ExportPartition.RetryMechanism @@ -281,6 +375,16 @@ version: 1.0 This setting allows users to control how many times the system will retry exporting a part before marking it as failed. +For example, + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1, + export_merge_tree_partition_max_retries = 5 +``` + ### RQ.ClickHouse.ExportPartition.ResumeAfterFailure version: 1.0 @@ -291,6 +395,15 @@ version: 1.0 [ClickHouse] SHALL allow export operations to make partial progress, with successfully exported parts remaining in the destination even if other parts fail. Users SHALL be able to see which parts have been successfully exported and which parts have failed. +For example, users can query the export status to see partial progress: + +```sql +SELECT source_table, destination_table, partition_id, status, + parts_total, parts_processed, parts_failed +FROM system.replicated_partition_exports +WHERE partition_id = '2020' +``` + ### RQ.ClickHouse.ExportPartition.Cleanup version: 1.0 @@ -303,6 +416,16 @@ version: 1.0 This setting only affects completed export operations and does not delete in-progress tasks. It allows users to control how long export history is maintained to prevent duplicate exports. +For example, + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1, + export_merge_tree_partition_manifest_ttl = 360 +``` + ## Network resilience ### RQ.ClickHouse.ExportPartition.NetworkResilience.PacketIssues @@ -360,6 +483,15 @@ version: 1.0 Exporting to the same table would be redundant and could cause data duplication or conflicts. +For example, the following command SHALL output an error: + +```sql +ALTER TABLE my_table +EXPORT PARTITION ID '2020' +TO TABLE my_table +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + ### Destination table compatibility #### RQ.ClickHouse.ExportPartition.Restrictions.DestinationSupport @@ -386,6 +518,15 @@ version: 1.0 Export partition is designed to move data to object storage, not to local MergeTree tables. 
+For example, if `local_table` is a MergeTree table, the following command SHALL output an error: + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE local_table +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + ### Partition key compatibility #### RQ.ClickHouse.ExportPartition.Restrictions.PartitionKey @@ -398,6 +539,15 @@ version: 1.0 Matching partition keys ensure that exported data is organized correctly in the destination storage. +For example, if `source_table` is partitioned by `toYYYYMM(date)` and `destination_table` is partitioned by `toYYYYMMDD(date)`, the following command SHALL output an error: + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + ### Source partition availability #### RQ.ClickHouse.ExportPartition.Restrictions.SourcePartition @@ -411,6 +561,15 @@ version: 1.0 The system must verify that the partition exists and contains data before attempting to export it. +For example, if partition ID '2025' does not exist in `source_table`, the following command SHALL output an error: + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2025' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + ## Export operation concurrency ### RQ.ClickHouse.ExportPartition.Concurrency @@ -447,6 +606,16 @@ version: 1.0 When set to `true`, this setting allows users to overwrite existing export entries and force re-export of a partition, even if a previous export operation exists for the same partition and destination. +For example, + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1, + export_merge_tree_partition_force_export = 1 +``` + ## Export operation logging ### RQ.ClickHouse.ExportPartition.Logging @@ -465,6 +634,15 @@ version: 1.0 Detailed logging helps users monitor export progress, troubleshoot issues, and audit export operations. +For example, users can query export logs: + +```sql +SELECT event_time, event_type, table, partition, rows, bytes_read, bytes_written +FROM system.part_log +WHERE event_type = 'EXPORT_PARTITION' +ORDER BY event_time DESC +``` + ## Monitoring export operations ### RQ.ClickHouse.ExportPartition.SystemTables.Exports @@ -485,6 +663,15 @@ The table SHALL track export operations before they complete and SHALL show comp Users need visibility into export operations to monitor progress, identify issues, and understand export status across the cluster. +For example, + +```sql +SELECT source_table, destination_table, partition_id, status, + parts_total, parts_processed, parts_failed, create_time, update_time +FROM system.replicated_partition_exports +WHERE status = 'IN_PROGRESS' +``` + ## Enabling export functionality ### RQ.ClickHouse.ExportPartition.Settings.AllowExperimental @@ -503,6 +690,16 @@ version: 1.0 This setting allows users to control whether to overwrite existing data in the destination, providing safety by default while allowing overwrites when needed. 
+For example, + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1, + export_merge_tree_part_overwrite_file_if_exists = 1 +``` + ## Export operation configuration ### RQ.ClickHouse.ExportPartition.ParallelFormatting diff --git a/s3/requirements/export_partition.py b/s3/requirements/export_partition.py index 683e6dfb0..8e218fbdb 100644 --- a/s3/requirements/export_partition.py +++ b/s3/requirements/export_partition.py @@ -75,6 +75,84 @@ num="3.1", ) +RQ_ClickHouse_ExportPartition_IntoOutfile = Requirement( + name="RQ.ClickHouse.ExportPartition.IntoOutfile", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support the usage of the `INTO OUTFILE` clause with `EXPORT PARTITION` and SHALL not output any errors.\n" + "\n" + "For example,\n" + "\n" + "```sql\n" + "ALTER TABLE source_table \n" + "EXPORT PARTITION ID '2020' \n" + "TO TABLE destination_table\n" + "INTO OUTFILE '/path/to/file'\n" + "SETTINGS allow_experimental_export_merge_tree_part = 1\n" + "```\n" + "\n" + ), + link=None, + level=2, + num="3.2", +) + +RQ_ClickHouse_ExportPartition_Format = Requirement( + name="RQ.ClickHouse.ExportPartition.Format", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support the usage of the `FORMAT` clause with `EXPORT PARTITION` and SHALL not output any errors.\n" + "\n" + "For example,\n" + "\n" + "```sql\n" + "ALTER TABLE source_table \n" + "EXPORT PARTITION ID '2020' \n" + "TO TABLE destination_table\n" + "FORMAT JSON\n" + "SETTINGS allow_experimental_export_merge_tree_part = 1\n" + "```\n" + "\n" + ), + link=None, + level=2, + num="3.3", +) + +RQ_ClickHouse_ExportPartition_SettingsClause = Requirement( + name="RQ.ClickHouse.ExportPartition.SettingsClause", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support the usage of the `SETTINGS` clause with `EXPORT PARTITION` and SHALL not output any errors.\n" + "\n" + "For example,\n" + "\n" + "```sql\n" + "ALTER TABLE source_table \n" + "EXPORT PARTITION ID '2020' \n" + "TO TABLE destination_table\n" + "SETTINGS allow_experimental_export_merge_tree_part = 1, \n" + " export_merge_tree_partition_max_retries = 5\n" + "```\n" + "\n" + ), + link=None, + level=2, + num="3.4", +) + RQ_ClickHouse_ExportPartition_SourceEngines = Requirement( name="RQ.ClickHouse.ExportPartition.SourceEngines", version="1.0", @@ -123,6 +201,40 @@ num="5.1", ) +RQ_ClickHouse_ExportPartition_Shards = Requirement( + name="RQ.ClickHouse.ExportPartition.Shards", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting partitions from source tables that are on different shards than the destination table.\n" + "\n" + ), + link=None, + level=2, + num="5.2", +) + +RQ_ClickHouse_ExportPartition_Versions = Requirement( + name="RQ.ClickHouse.ExportPartition.Versions", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting partitions from source tables that are stored on servers with different ClickHouse versions than the destination server.\n" + "\n" + "Users can export partitions from tables on servers with older ClickHouse versions to tables on servers with newer versions, enabling data migration and version upgrades.\n" + "\n" + ), + link=None, + level=2, + 
num="5.3", +) + RQ_ClickHouse_ExportPartition_SourcePartStorage = Requirement( name="RQ.ClickHouse.ExportPartition.SourcePartStorage", version="1.0", @@ -196,6 +308,37 @@ num="8.1", ) +RQ_ClickHouse_ExportPartition_TemporaryTable = Requirement( + name="RQ.ClickHouse.ExportPartition.TemporaryTable", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL support exporting partitions from temporary ReplicatedMergeTree tables to destination object storage tables.\n" + "\n" + "For example,\n" + "\n" + "```sql\n" + "CREATE TEMPORARY TABLE temp_table (p UInt64, k String, d UInt64) \n" + "ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/temp_table', '{replica}') \n" + "PARTITION BY p ORDER BY k;\n" + "\n" + "INSERT INTO temp_table VALUES (2020, 'key1', 100), (2020, 'key2', 200);\n" + "\n" + "ALTER TABLE temp_table \n" + "EXPORT PARTITION ID '2020' \n" + "TO TABLE destination_table\n" + "SETTINGS allow_experimental_export_merge_tree_part = 1\n" + "```\n" + "\n" + ), + link=None, + level=2, + num="9.1", +) + RQ_ClickHouse_ExportPartition_SchemaCompatibility = Requirement( name="RQ.ClickHouse.ExportPartition.SchemaCompatibility", version="1.0", @@ -215,7 +358,7 @@ ), link=None, level=2, - num="9.1", + num="10.1", ) RQ_ClickHouse_ExportPartition_PartitionKeyTypes = Requirement( @@ -244,7 +387,7 @@ ), link=None, level=2, - num="10.1", + num="11.1", ) RQ_ClickHouse_ExportPartition_PartitionContent = Requirement( @@ -269,7 +412,7 @@ ), link=None, level=2, - num="11.1", + num="12.1", ) RQ_ClickHouse_ExportPartition_SchemaChangeIsolation = Requirement( @@ -291,7 +434,7 @@ ), link=None, level=2, - num="11.2", + num="12.2", ) RQ_ClickHouse_ExportPartition_LargePartitions = Requirement( @@ -315,7 +458,25 @@ ), link=None, level=2, - num="11.3", + num="12.3", +) + +RQ_ClickHouse_ExportPartition_Corrupted = Requirement( + name="RQ.ClickHouse.ExportPartition.Corrupted", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL output an error and prevent export operations from proceeding when trying to export a partition that contains corrupted parts in the source table.\n" + "\n" + "The system SHALL detect corruption in partitions containing compact parts, wide parts, or mixed part types.\n" + "\n" + ), + link=None, + level=2, + num="12.4", ) RQ_ClickHouse_ExportPartition_RetryMechanism = Requirement( @@ -333,7 +494,7 @@ ), link=None, level=2, - num="12.1", + num="13.1", ) RQ_ClickHouse_ExportPartition_Settings_MaxRetries = Requirement( @@ -348,10 +509,20 @@ "\n" "This setting allows users to control how many times the system will retry exporting a part before marking it as failed.\n" "\n" + "For example,\n" + "\n" + "```sql\n" + "ALTER TABLE source_table \n" + "EXPORT PARTITION ID '2020' \n" + "TO TABLE destination_table\n" + "SETTINGS allow_experimental_export_merge_tree_part = 1,\n" + " export_merge_tree_partition_max_retries = 5\n" + "```\n" + "\n" ), link=None, level=2, - num="12.2", + num="13.2", ) RQ_ClickHouse_ExportPartition_ResumeAfterFailure = Requirement( @@ -367,7 +538,7 @@ ), link=None, level=2, - num="12.3", + num="13.3", ) RQ_ClickHouse_ExportPartition_PartialProgress = Requirement( @@ -380,10 +551,19 @@ description=( "[ClickHouse] SHALL allow export operations to make partial progress, with successfully exported parts remaining in the destination even if other parts fail. 
Users SHALL be able to see which parts have been successfully exported and which parts have failed.\n" "\n" + "For example, users can query the export status to see partial progress:\n" + "\n" + "```sql\n" + "SELECT source_table, destination_table, partition_id, status,\n" + " parts_total, parts_processed, parts_failed\n" + "FROM system.replicated_partition_exports\n" + "WHERE partition_id = '2020'\n" + "```\n" + "\n" ), link=None, level=2, - num="12.4", + num="13.4", ) RQ_ClickHouse_ExportPartition_Cleanup = Requirement( @@ -399,7 +579,7 @@ ), link=None, level=2, - num="12.5", + num="13.5", ) RQ_ClickHouse_ExportPartition_Settings_ManifestTTL = Requirement( @@ -414,10 +594,20 @@ "\n" "This setting only affects completed export operations and does not delete in-progress tasks. It allows users to control how long export history is maintained to prevent duplicate exports.\n" "\n" + "For example,\n" + "\n" + "```sql\n" + "ALTER TABLE source_table \n" + "EXPORT PARTITION ID '2020' \n" + "TO TABLE destination_table\n" + "SETTINGS allow_experimental_export_merge_tree_part = 1,\n" + " export_merge_tree_partition_manifest_ttl = 360\n" + "```\n" + "\n" ), link=None, level=2, - num="12.6", + num="13.6", ) RQ_ClickHouse_ExportPartition_NetworkResilience_PacketIssues = Requirement( @@ -442,7 +632,7 @@ ), link=None, level=2, - num="13.1", + num="14.1", ) RQ_ClickHouse_ExportPartition_NetworkResilience_DestinationInterruption = Requirement( @@ -466,7 +656,7 @@ ), link=None, level=2, - num="13.2", + num="14.2", ) RQ_ClickHouse_ExportPartition_NetworkResilience_NodeInterruption = Requirement( @@ -491,7 +681,7 @@ ), link=None, level=2, - num="13.3", + num="14.3", ) RQ_ClickHouse_ExportPartition_Restrictions_SameTable = Requirement( @@ -509,10 +699,19 @@ "\n" "Exporting to the same table would be redundant and could cause data duplication or conflicts.\n" "\n" + "For example, the following command SHALL output an error:\n" + "\n" + "```sql\n" + "ALTER TABLE my_table \n" + "EXPORT PARTITION ID '2020' \n" + "TO TABLE my_table\n" + "SETTINGS allow_experimental_export_merge_tree_part = 1\n" + "```\n" + "\n" ), link=None, level=3, - num="14.1.1", + num="15.1.1", ) RQ_ClickHouse_ExportPartition_Restrictions_DestinationSupport = Requirement( @@ -535,7 +734,7 @@ ), link=None, level=3, - num="14.2.1", + num="15.2.1", ) RQ_ClickHouse_ExportPartition_Restrictions_LocalTable = Requirement( @@ -553,10 +752,19 @@ "\n" "Export partition is designed to move data to object storage, not to local MergeTree tables.\n" "\n" + "For example, if `local_table` is a MergeTree table, the following command SHALL output an error:\n" + "\n" + "```sql\n" + "ALTER TABLE source_table \n" + "EXPORT PARTITION ID '2020' \n" + "TO TABLE local_table\n" + "SETTINGS allow_experimental_export_merge_tree_part = 1\n" + "```\n" + "\n" ), link=None, level=3, - num="14.3.1", + num="15.3.1", ) RQ_ClickHouse_ExportPartition_Restrictions_PartitionKey = Requirement( @@ -574,10 +782,19 @@ "\n" "Matching partition keys ensure that exported data is organized correctly in the destination storage.\n" "\n" + "For example, if `source_table` is partitioned by `toYYYYMM(date)` and `destination_table` is partitioned by `toYYYYMMDD(date)`, the following command SHALL output an error:\n" + "\n" + "```sql\n" + "ALTER TABLE source_table \n" + "EXPORT PARTITION ID '2020' \n" + "TO TABLE destination_table\n" + "SETTINGS allow_experimental_export_merge_tree_part = 1\n" + "```\n" + "\n" ), link=None, level=3, - num="14.4.1", + num="15.4.1", ) 
RQ_ClickHouse_ExportPartition_Restrictions_SourcePartition = Requirement( @@ -596,10 +813,19 @@ "\n" "The system must verify that the partition exists and contains data before attempting to export it.\n" "\n" + "For example, if partition ID '2025' does not exist in `source_table`, the following command SHALL output an error:\n" + "\n" + "```sql\n" + "ALTER TABLE source_table \n" + "EXPORT PARTITION ID '2025' \n" + "TO TABLE destination_table\n" + "SETTINGS allow_experimental_export_merge_tree_part = 1\n" + "```\n" + "\n" ), link=None, level=3, - num="14.5.1", + num="15.5.1", ) RQ_ClickHouse_ExportPartition_Concurrency = Requirement( @@ -622,7 +848,7 @@ ), link=None, level=2, - num="15.1", + num="16.1", ) RQ_ClickHouse_ExportPartition_Idempotency = Requirement( @@ -646,7 +872,7 @@ ), link=None, level=2, - num="16.1", + num="17.1", ) RQ_ClickHouse_ExportPartition_Settings_ForceExport = Requirement( @@ -661,10 +887,20 @@ "\n" "When set to `true`, this setting allows users to overwrite existing export entries and force re-export of a partition, even if a previous export operation exists for the same partition and destination.\n" "\n" + "For example,\n" + "\n" + "```sql\n" + "ALTER TABLE source_table \n" + "EXPORT PARTITION ID '2020' \n" + "TO TABLE destination_table\n" + "SETTINGS allow_experimental_export_merge_tree_part = 1,\n" + " export_merge_tree_partition_force_export = 1\n" + "```\n" + "\n" ), link=None, level=2, - num="16.2", + num="17.2", ) RQ_ClickHouse_ExportPartition_Logging = Requirement( @@ -688,10 +924,19 @@ "\n" "Detailed logging helps users monitor export progress, troubleshoot issues, and audit export operations.\n" "\n" + "For example, users can query export logs:\n" + "\n" + "```sql\n" + "SELECT event_time, event_type, table, partition, rows, bytes_read, bytes_written\n" + "FROM system.part_log\n" + "WHERE event_type = 'EXPORT_PARTITION'\n" + "ORDER BY event_time DESC\n" + "```\n" + "\n" ), link=None, level=2, - num="17.1", + num="18.1", ) RQ_ClickHouse_ExportPartition_SystemTables_Exports = Requirement( @@ -717,10 +962,19 @@ "\n" "Users need visibility into export operations to monitor progress, identify issues, and understand export status across the cluster.\n" "\n" + "For example,\n" + "\n" + "```sql\n" + "SELECT source_table, destination_table, partition_id, status, \n" + " parts_total, parts_processed, parts_failed, create_time, update_time\n" + "FROM system.replicated_partition_exports\n" + "WHERE status = 'IN_PROGRESS'\n" + "```\n" + "\n" ), link=None, level=2, - num="18.1", + num="19.1", ) RQ_ClickHouse_ExportPartition_Settings_AllowExperimental = Requirement( @@ -738,7 +992,7 @@ ), link=None, level=2, - num="19.1", + num="20.1", ) RQ_ClickHouse_ExportPartition_Settings_OverwriteFile = Requirement( @@ -753,10 +1007,20 @@ "\n" "This setting allows users to control whether to overwrite existing data in the destination, providing safety by default while allowing overwrites when needed.\n" "\n" + "For example,\n" + "\n" + "```sql\n" + "ALTER TABLE source_table \n" + "EXPORT PARTITION ID '2020' \n" + "TO TABLE destination_table\n" + "SETTINGS allow_experimental_export_merge_tree_part = 1,\n" + " export_merge_tree_part_overwrite_file_if_exists = 1\n" + "```\n" + "\n" ), link=None, level=2, - num="20.1", + num="21.1", ) RQ_ClickHouse_ExportPartition_ParallelFormatting = Requirement( @@ -779,7 +1043,7 @@ ), link=None, level=2, - num="21.1", + num="22.1", ) RQ_ClickHouse_ExportPartition_ServerSettings_MaxBandwidth = Requirement( @@ -797,7 +1061,7 @@ ), link=None, level=2, - 
num="22.1", + num="23.1", ) RQ_ClickHouse_ExportPartition_ServerSettings_BackgroundMovePoolSize = Requirement( @@ -815,7 +1079,7 @@ ), link=None, level=2, - num="22.2", + num="23.2", ) RQ_ClickHouse_ExportPartition_Metrics_Export = Requirement( @@ -833,7 +1097,7 @@ ), link=None, level=2, - num="22.3", + num="23.3", ) RQ_ClickHouse_ExportPartition_Security_RBAC = Requirement( @@ -855,7 +1119,7 @@ ), link=None, level=2, - num="23.1", + num="24.1", ) RQ_ClickHouse_ExportPartition_Security_DataEncryption = Requirement( @@ -873,7 +1137,7 @@ ), link=None, level=2, - num="23.2", + num="24.2", ) RQ_ClickHouse_ExportPartition_Security_Network = Requirement( @@ -891,7 +1155,7 @@ ), link=None, level=2, - num="23.3", + num="24.3", ) RQ_ClickHouse_ExportPartition_Security_CredentialManagement = Requirement( @@ -912,7 +1176,7 @@ ), link=None, level=2, - num="23.4", + num="24.4", ) SRS_016_ClickHouse_Export_Partition_to_S3 = Specification( @@ -940,10 +1204,17 @@ ), Heading(name="SQL command support", level=1, num="3"), Heading(name="RQ.ClickHouse.ExportPartition.SQLCommand", level=2, num="3.1"), + Heading(name="RQ.ClickHouse.ExportPartition.IntoOutfile", level=2, num="3.2"), + Heading(name="RQ.ClickHouse.ExportPartition.Format", level=2, num="3.3"), + Heading( + name="RQ.ClickHouse.ExportPartition.SettingsClause", level=2, num="3.4" + ), Heading(name="Supported source table engines", level=1, num="4"), Heading(name="RQ.ClickHouse.ExportPartition.SourceEngines", level=2, num="4.1"), Heading(name="Cluster and node support", level=1, num="5"), Heading(name="RQ.ClickHouse.ExportPartition.ClustersNodes", level=2, num="5.1"), + Heading(name="RQ.ClickHouse.ExportPartition.Shards", level=2, num="5.2"), + Heading(name="RQ.ClickHouse.ExportPartition.Versions", level=2, num="5.3"), Heading(name="Supported source part storage types", level=1, num="6"), Heading( name="RQ.ClickHouse.ExportPartition.SourcePartStorage", level=2, num="6.1" @@ -956,173 +1227,187 @@ Heading( name="RQ.ClickHouse.ExportPartition.DestinationEngines", level=2, num="8.1" ), - Heading(name="Schema compatibility", level=1, num="9"), + Heading(name="Temporary tables", level=1, num="9"), Heading( - name="RQ.ClickHouse.ExportPartition.SchemaCompatibility", level=2, num="9.1" + name="RQ.ClickHouse.ExportPartition.TemporaryTable", level=2, num="9.1" + ), + Heading(name="Schema compatibility", level=1, num="10"), + Heading( + name="RQ.ClickHouse.ExportPartition.SchemaCompatibility", + level=2, + num="10.1", ), - Heading(name="Partition key types support", level=1, num="10"), + Heading(name="Partition key types support", level=1, num="11"), Heading( - name="RQ.ClickHouse.ExportPartition.PartitionKeyTypes", level=2, num="10.1" + name="RQ.ClickHouse.ExportPartition.PartitionKeyTypes", level=2, num="11.1" ), - Heading(name="Partition content support", level=1, num="11"), + Heading(name="Partition content support", level=1, num="12"), Heading( - name="RQ.ClickHouse.ExportPartition.PartitionContent", level=2, num="11.1" + name="RQ.ClickHouse.ExportPartition.PartitionContent", level=2, num="12.1" ), Heading( name="RQ.ClickHouse.ExportPartition.SchemaChangeIsolation", level=2, - num="11.2", + num="12.2", ), Heading( - name="RQ.ClickHouse.ExportPartition.LargePartitions", level=2, num="11.3" + name="RQ.ClickHouse.ExportPartition.LargePartitions", level=2, num="12.3" ), - Heading(name="Export operation failure handling", level=1, num="12"), + Heading(name="RQ.ClickHouse.ExportPartition.Corrupted", level=2, num="12.4"), + Heading(name="Export operation failure 
handling", level=1, num="13"), Heading( - name="RQ.ClickHouse.ExportPartition.RetryMechanism", level=2, num="12.1" + name="RQ.ClickHouse.ExportPartition.RetryMechanism", level=2, num="13.1" ), Heading( name="RQ.ClickHouse.ExportPartition.Settings.MaxRetries", level=2, - num="12.2", + num="13.2", ), Heading( - name="RQ.ClickHouse.ExportPartition.ResumeAfterFailure", level=2, num="12.3" + name="RQ.ClickHouse.ExportPartition.ResumeAfterFailure", level=2, num="13.3" ), Heading( - name="RQ.ClickHouse.ExportPartition.PartialProgress", level=2, num="12.4" + name="RQ.ClickHouse.ExportPartition.PartialProgress", level=2, num="13.4" ), - Heading(name="RQ.ClickHouse.ExportPartition.Cleanup", level=2, num="12.5"), + Heading(name="RQ.ClickHouse.ExportPartition.Cleanup", level=2, num="13.5"), Heading( name="RQ.ClickHouse.ExportPartition.Settings.ManifestTTL", level=2, - num="12.6", + num="13.6", ), - Heading(name="Network resilience", level=1, num="13"), + Heading(name="Network resilience", level=1, num="14"), Heading( name="RQ.ClickHouse.ExportPartition.NetworkResilience.PacketIssues", level=2, - num="13.1", + num="14.1", ), Heading( name="RQ.ClickHouse.ExportPartition.NetworkResilience.DestinationInterruption", level=2, - num="13.2", + num="14.2", ), Heading( name="RQ.ClickHouse.ExportPartition.NetworkResilience.NodeInterruption", level=2, - num="13.3", + num="14.3", ), - Heading(name="Export operation restrictions", level=1, num="14"), - Heading(name="Preventing same table exports", level=2, num="14.1"), + Heading(name="Export operation restrictions", level=1, num="15"), + Heading(name="Preventing same table exports", level=2, num="15.1"), Heading( name="RQ.ClickHouse.ExportPartition.Restrictions.SameTable", level=3, - num="14.1.1", + num="15.1.1", ), - Heading(name="Destination table compatibility", level=2, num="14.2"), + Heading(name="Destination table compatibility", level=2, num="15.2"), Heading( name="RQ.ClickHouse.ExportPartition.Restrictions.DestinationSupport", level=3, - num="14.2.1", + num="15.2.1", ), - Heading(name="Local table restriction", level=2, num="14.3"), + Heading(name="Local table restriction", level=2, num="15.3"), Heading( name="RQ.ClickHouse.ExportPartition.Restrictions.LocalTable", level=3, - num="14.3.1", + num="15.3.1", ), - Heading(name="Partition key compatibility", level=2, num="14.4"), + Heading(name="Partition key compatibility", level=2, num="15.4"), Heading( name="RQ.ClickHouse.ExportPartition.Restrictions.PartitionKey", level=3, - num="14.4.1", + num="15.4.1", ), - Heading(name="Source partition availability", level=2, num="14.5"), + Heading(name="Source partition availability", level=2, num="15.5"), Heading( name="RQ.ClickHouse.ExportPartition.Restrictions.SourcePartition", level=3, - num="14.5.1", + num="15.5.1", ), - Heading(name="Export operation concurrency", level=1, num="15"), - Heading(name="RQ.ClickHouse.ExportPartition.Concurrency", level=2, num="15.1"), - Heading(name="Export operation idempotency", level=1, num="16"), - Heading(name="RQ.ClickHouse.ExportPartition.Idempotency", level=2, num="16.1"), + Heading(name="Export operation concurrency", level=1, num="16"), + Heading(name="RQ.ClickHouse.ExportPartition.Concurrency", level=2, num="16.1"), + Heading(name="Export operation idempotency", level=1, num="17"), + Heading(name="RQ.ClickHouse.ExportPartition.Idempotency", level=2, num="17.1"), Heading( name="RQ.ClickHouse.ExportPartition.Settings.ForceExport", level=2, - num="16.2", + num="17.2", ), - Heading(name="Export operation logging", level=1, 
num="17"), - Heading(name="RQ.ClickHouse.ExportPartition.Logging", level=2, num="17.1"), - Heading(name="Monitoring export operations", level=1, num="18"), + Heading(name="Export operation logging", level=1, num="18"), + Heading(name="RQ.ClickHouse.ExportPartition.Logging", level=2, num="18.1"), + Heading(name="Monitoring export operations", level=1, num="19"), Heading( name="RQ.ClickHouse.ExportPartition.SystemTables.Exports", level=2, - num="18.1", + num="19.1", ), - Heading(name="Enabling export functionality", level=1, num="19"), + Heading(name="Enabling export functionality", level=1, num="20"), Heading( name="RQ.ClickHouse.ExportPartition.Settings.AllowExperimental", level=2, - num="19.1", + num="20.1", ), - Heading(name="Handling file conflicts during export", level=1, num="20"), + Heading(name="Handling file conflicts during export", level=1, num="21"), Heading( name="RQ.ClickHouse.ExportPartition.Settings.OverwriteFile", level=2, - num="20.1", + num="21.1", ), - Heading(name="Export operation configuration", level=1, num="21"), + Heading(name="Export operation configuration", level=1, num="22"), Heading( - name="RQ.ClickHouse.ExportPartition.ParallelFormatting", level=2, num="21.1" + name="RQ.ClickHouse.ExportPartition.ParallelFormatting", level=2, num="22.1" ), - Heading(name="Controlling export performance", level=1, num="22"), + Heading(name="Controlling export performance", level=1, num="23"), Heading( name="RQ.ClickHouse.ExportPartition.ServerSettings.MaxBandwidth", level=2, - num="22.1", + num="23.1", ), Heading( name="RQ.ClickHouse.ExportPartition.ServerSettings.BackgroundMovePoolSize", level=2, - num="22.2", + num="23.2", ), Heading( - name="RQ.ClickHouse.ExportPartition.Metrics.Export", level=2, num="22.3" + name="RQ.ClickHouse.ExportPartition.Metrics.Export", level=2, num="23.3" ), - Heading(name="Export operation security", level=1, num="23"), + Heading(name="Export operation security", level=1, num="24"), Heading( - name="RQ.ClickHouse.ExportPartition.Security.RBAC", level=2, num="23.1" + name="RQ.ClickHouse.ExportPartition.Security.RBAC", level=2, num="24.1" ), Heading( name="RQ.ClickHouse.ExportPartition.Security.DataEncryption", level=2, - num="23.2", + num="24.2", ), Heading( - name="RQ.ClickHouse.ExportPartition.Security.Network", level=2, num="23.3" + name="RQ.ClickHouse.ExportPartition.Security.Network", level=2, num="24.3" ), Heading( name="RQ.ClickHouse.ExportPartition.Security.CredentialManagement", level=2, - num="23.4", + num="24.4", ), ), requirements=( RQ_ClickHouse_ExportPartition_S3, RQ_ClickHouse_ExportPartition_EmptyPartition, RQ_ClickHouse_ExportPartition_SQLCommand, + RQ_ClickHouse_ExportPartition_IntoOutfile, + RQ_ClickHouse_ExportPartition_Format, + RQ_ClickHouse_ExportPartition_SettingsClause, RQ_ClickHouse_ExportPartition_SourceEngines, RQ_ClickHouse_ExportPartition_ClustersNodes, + RQ_ClickHouse_ExportPartition_Shards, + RQ_ClickHouse_ExportPartition_Versions, RQ_ClickHouse_ExportPartition_SourcePartStorage, RQ_ClickHouse_ExportPartition_StoragePolicies, RQ_ClickHouse_ExportPartition_DestinationEngines, + RQ_ClickHouse_ExportPartition_TemporaryTable, RQ_ClickHouse_ExportPartition_SchemaCompatibility, RQ_ClickHouse_ExportPartition_PartitionKeyTypes, RQ_ClickHouse_ExportPartition_PartitionContent, RQ_ClickHouse_ExportPartition_SchemaChangeIsolation, RQ_ClickHouse_ExportPartition_LargePartitions, + RQ_ClickHouse_ExportPartition_Corrupted, RQ_ClickHouse_ExportPartition_RetryMechanism, RQ_ClickHouse_ExportPartition_Settings_MaxRetries, 
RQ_ClickHouse_ExportPartition_ResumeAfterFailure, @@ -1165,70 +1450,78 @@ * 2.2 [RQ.ClickHouse.ExportPartition.EmptyPartition](#rqclickhouseexportpartitionemptypartition) * 3 [SQL command support](#sql-command-support) * 3.1 [RQ.ClickHouse.ExportPartition.SQLCommand](#rqclickhouseexportpartitionsqlcommand) + * 3.2 [RQ.ClickHouse.ExportPartition.IntoOutfile](#rqclickhouseexportpartitionintooutfile) + * 3.3 [RQ.ClickHouse.ExportPartition.Format](#rqclickhouseexportpartitionformat) + * 3.4 [RQ.ClickHouse.ExportPartition.SettingsClause](#rqclickhouseexportpartitionsettingsclause) * 4 [Supported source table engines](#supported-source-table-engines) * 4.1 [RQ.ClickHouse.ExportPartition.SourceEngines](#rqclickhouseexportpartitionsourceengines) * 5 [Cluster and node support](#cluster-and-node-support) * 5.1 [RQ.ClickHouse.ExportPartition.ClustersNodes](#rqclickhouseexportpartitionclustersnodes) + * 5.2 [RQ.ClickHouse.ExportPartition.Shards](#rqclickhouseexportpartitionshards) + * 5.3 [RQ.ClickHouse.ExportPartition.Versions](#rqclickhouseexportpartitionversions) * 6 [Supported source part storage types](#supported-source-part-storage-types) * 6.1 [RQ.ClickHouse.ExportPartition.SourcePartStorage](#rqclickhouseexportpartitionsourcepartstorage) * 7 [Storage policies and volumes](#storage-policies-and-volumes) * 7.1 [RQ.ClickHouse.ExportPartition.StoragePolicies](#rqclickhouseexportpartitionstoragepolicies) * 8 [Supported destination table engines](#supported-destination-table-engines) * 8.1 [RQ.ClickHouse.ExportPartition.DestinationEngines](#rqclickhouseexportpartitiondestinationengines) -* 9 [Schema compatibility](#schema-compatibility) - * 9.1 [RQ.ClickHouse.ExportPartition.SchemaCompatibility](#rqclickhouseexportpartitionschemacompatibility) -* 10 [Partition key types support](#partition-key-types-support) - * 10.1 [RQ.ClickHouse.ExportPartition.PartitionKeyTypes](#rqclickhouseexportpartitionpartitionkeytypes) -* 11 [Partition content support](#partition-content-support) - * 11.1 [RQ.ClickHouse.ExportPartition.PartitionContent](#rqclickhouseexportpartitionpartitioncontent) - * 11.2 [RQ.ClickHouse.ExportPartition.SchemaChangeIsolation](#rqclickhouseexportpartitionschemachangeisolation) - * 11.3 [RQ.ClickHouse.ExportPartition.LargePartitions](#rqclickhouseexportpartitionlargepartitions) -* 12 [Export operation failure handling](#export-operation-failure-handling) - * 12.1 [RQ.ClickHouse.ExportPartition.RetryMechanism](#rqclickhouseexportpartitionretrymechanism) - * 12.2 [RQ.ClickHouse.ExportPartition.Settings.MaxRetries](#rqclickhouseexportpartitionsettingsmaxretries) - * 12.3 [RQ.ClickHouse.ExportPartition.ResumeAfterFailure](#rqclickhouseexportpartitionresumeafterfailure) - * 12.4 [RQ.ClickHouse.ExportPartition.PartialProgress](#rqclickhouseexportpartitionpartialprogress) - * 12.5 [RQ.ClickHouse.ExportPartition.Cleanup](#rqclickhouseexportpartitioncleanup) - * 12.6 [RQ.ClickHouse.ExportPartition.Settings.ManifestTTL](#rqclickhouseexportpartitionsettingsmanifestttl) -* 13 [Network resilience](#network-resilience) - * 13.1 [RQ.ClickHouse.ExportPartition.NetworkResilience.PacketIssues](#rqclickhouseexportpartitionnetworkresiliencepacketissues) - * 13.2 [RQ.ClickHouse.ExportPartition.NetworkResilience.DestinationInterruption](#rqclickhouseexportpartitionnetworkresiliencedestinationinterruption) - * 13.3 [RQ.ClickHouse.ExportPartition.NetworkResilience.NodeInterruption](#rqclickhouseexportpartitionnetworkresiliencenodeinterruption) -* 14 [Export operation restrictions](#export-operation-restrictions) - 
* 14.1 [Preventing same table exports](#preventing-same-table-exports) - * 14.1.1 [RQ.ClickHouse.ExportPartition.Restrictions.SameTable](#rqclickhouseexportpartitionrestrictionssametable) - * 14.2 [Destination table compatibility](#destination-table-compatibility) - * 14.2.1 [RQ.ClickHouse.ExportPartition.Restrictions.DestinationSupport](#rqclickhouseexportpartitionrestrictionsdestinationsupport) - * 14.3 [Local table restriction](#local-table-restriction) - * 14.3.1 [RQ.ClickHouse.ExportPartition.Restrictions.LocalTable](#rqclickhouseexportpartitionrestrictionslocaltable) - * 14.4 [Partition key compatibility](#partition-key-compatibility) - * 14.4.1 [RQ.ClickHouse.ExportPartition.Restrictions.PartitionKey](#rqclickhouseexportpartitionrestrictionspartitionkey) - * 14.5 [Source partition availability](#source-partition-availability) - * 14.5.1 [RQ.ClickHouse.ExportPartition.Restrictions.SourcePartition](#rqclickhouseexportpartitionrestrictionssourcepartition) -* 15 [Export operation concurrency](#export-operation-concurrency) - * 15.1 [RQ.ClickHouse.ExportPartition.Concurrency](#rqclickhouseexportpartitionconcurrency) -* 16 [Export operation idempotency](#export-operation-idempotency) - * 16.1 [RQ.ClickHouse.ExportPartition.Idempotency](#rqclickhouseexportpartitionidempotency) - * 16.2 [RQ.ClickHouse.ExportPartition.Settings.ForceExport](#rqclickhouseexportpartitionsettingsforceexport) -* 17 [Export operation logging](#export-operation-logging) - * 17.1 [RQ.ClickHouse.ExportPartition.Logging](#rqclickhouseexportpartitionlogging) -* 18 [Monitoring export operations](#monitoring-export-operations) - * 18.1 [RQ.ClickHouse.ExportPartition.SystemTables.Exports](#rqclickhouseexportpartitionsystemtablesexports) -* 19 [Enabling export functionality](#enabling-export-functionality) - * 19.1 [RQ.ClickHouse.ExportPartition.Settings.AllowExperimental](#rqclickhouseexportpartitionsettingsallowexperimental) -* 20 [Handling file conflicts during export](#handling-file-conflicts-during-export) - * 20.1 [RQ.ClickHouse.ExportPartition.Settings.OverwriteFile](#rqclickhouseexportpartitionsettingsoverwritefile) -* 21 [Export operation configuration](#export-operation-configuration) - * 21.1 [RQ.ClickHouse.ExportPartition.ParallelFormatting](#rqclickhouseexportpartitionparallelformatting) -* 22 [Controlling export performance](#controlling-export-performance) - * 22.1 [RQ.ClickHouse.ExportPartition.ServerSettings.MaxBandwidth](#rqclickhouseexportpartitionserversettingsmaxbandwidth) - * 22.2 [RQ.ClickHouse.ExportPartition.ServerSettings.BackgroundMovePoolSize](#rqclickhouseexportpartitionserversettingsbackgroundmovepoolsize) - * 22.3 [RQ.ClickHouse.ExportPartition.Metrics.Export](#rqclickhouseexportpartitionmetricsexport) -* 23 [Export operation security](#export-operation-security) - * 23.1 [RQ.ClickHouse.ExportPartition.Security.RBAC](#rqclickhouseexportpartitionsecurityrbac) - * 23.2 [RQ.ClickHouse.ExportPartition.Security.DataEncryption](#rqclickhouseexportpartitionsecuritydataencryption) - * 23.3 [RQ.ClickHouse.ExportPartition.Security.Network](#rqclickhouseexportpartitionsecuritynetwork) - * 23.4 [RQ.ClickHouse.ExportPartition.Security.CredentialManagement](#rqclickhouseexportpartitionsecuritycredentialmanagement) +* 9 [Temporary tables](#temporary-tables) + * 9.1 [RQ.ClickHouse.ExportPartition.TemporaryTable](#rqclickhouseexportpartitiontemporarytable) +* 10 [Schema compatibility](#schema-compatibility) + * 10.1 
[RQ.ClickHouse.ExportPartition.SchemaCompatibility](#rqclickhouseexportpartitionschemacompatibility) +* 11 [Partition key types support](#partition-key-types-support) + * 11.1 [RQ.ClickHouse.ExportPartition.PartitionKeyTypes](#rqclickhouseexportpartitionpartitionkeytypes) +* 12 [Partition content support](#partition-content-support) + * 12.1 [RQ.ClickHouse.ExportPartition.PartitionContent](#rqclickhouseexportpartitionpartitioncontent) + * 12.2 [RQ.ClickHouse.ExportPartition.SchemaChangeIsolation](#rqclickhouseexportpartitionschemachangeisolation) + * 12.3 [RQ.ClickHouse.ExportPartition.LargePartitions](#rqclickhouseexportpartitionlargepartitions) + * 12.4 [RQ.ClickHouse.ExportPartition.Corrupted](#rqclickhouseexportpartitioncorrupted) +* 13 [Export operation failure handling](#export-operation-failure-handling) + * 13.1 [RQ.ClickHouse.ExportPartition.RetryMechanism](#rqclickhouseexportpartitionretrymechanism) + * 13.2 [RQ.ClickHouse.ExportPartition.Settings.MaxRetries](#rqclickhouseexportpartitionsettingsmaxretries) + * 13.3 [RQ.ClickHouse.ExportPartition.ResumeAfterFailure](#rqclickhouseexportpartitionresumeafterfailure) + * 13.4 [RQ.ClickHouse.ExportPartition.PartialProgress](#rqclickhouseexportpartitionpartialprogress) + * 13.5 [RQ.ClickHouse.ExportPartition.Cleanup](#rqclickhouseexportpartitioncleanup) + * 13.6 [RQ.ClickHouse.ExportPartition.Settings.ManifestTTL](#rqclickhouseexportpartitionsettingsmanifestttl) +* 14 [Network resilience](#network-resilience) + * 14.1 [RQ.ClickHouse.ExportPartition.NetworkResilience.PacketIssues](#rqclickhouseexportpartitionnetworkresiliencepacketissues) + * 14.2 [RQ.ClickHouse.ExportPartition.NetworkResilience.DestinationInterruption](#rqclickhouseexportpartitionnetworkresiliencedestinationinterruption) + * 14.3 [RQ.ClickHouse.ExportPartition.NetworkResilience.NodeInterruption](#rqclickhouseexportpartitionnetworkresiliencenodeinterruption) +* 15 [Export operation restrictions](#export-operation-restrictions) + * 15.1 [Preventing same table exports](#preventing-same-table-exports) + * 15.1.1 [RQ.ClickHouse.ExportPartition.Restrictions.SameTable](#rqclickhouseexportpartitionrestrictionssametable) + * 15.2 [Destination table compatibility](#destination-table-compatibility) + * 15.2.1 [RQ.ClickHouse.ExportPartition.Restrictions.DestinationSupport](#rqclickhouseexportpartitionrestrictionsdestinationsupport) + * 15.3 [Local table restriction](#local-table-restriction) + * 15.3.1 [RQ.ClickHouse.ExportPartition.Restrictions.LocalTable](#rqclickhouseexportpartitionrestrictionslocaltable) + * 15.4 [Partition key compatibility](#partition-key-compatibility) + * 15.4.1 [RQ.ClickHouse.ExportPartition.Restrictions.PartitionKey](#rqclickhouseexportpartitionrestrictionspartitionkey) + * 15.5 [Source partition availability](#source-partition-availability) + * 15.5.1 [RQ.ClickHouse.ExportPartition.Restrictions.SourcePartition](#rqclickhouseexportpartitionrestrictionssourcepartition) +* 16 [Export operation concurrency](#export-operation-concurrency) + * 16.1 [RQ.ClickHouse.ExportPartition.Concurrency](#rqclickhouseexportpartitionconcurrency) +* 17 [Export operation idempotency](#export-operation-idempotency) + * 17.1 [RQ.ClickHouse.ExportPartition.Idempotency](#rqclickhouseexportpartitionidempotency) + * 17.2 [RQ.ClickHouse.ExportPartition.Settings.ForceExport](#rqclickhouseexportpartitionsettingsforceexport) +* 18 [Export operation logging](#export-operation-logging) + * 18.1 [RQ.ClickHouse.ExportPartition.Logging](#rqclickhouseexportpartitionlogging) +* 19 [Monitoring 
export operations](#monitoring-export-operations) + * 19.1 [RQ.ClickHouse.ExportPartition.SystemTables.Exports](#rqclickhouseexportpartitionsystemtablesexports) +* 20 [Enabling export functionality](#enabling-export-functionality) + * 20.1 [RQ.ClickHouse.ExportPartition.Settings.AllowExperimental](#rqclickhouseexportpartitionsettingsallowexperimental) +* 21 [Handling file conflicts during export](#handling-file-conflicts-during-export) + * 21.1 [RQ.ClickHouse.ExportPartition.Settings.OverwriteFile](#rqclickhouseexportpartitionsettingsoverwritefile) +* 22 [Export operation configuration](#export-operation-configuration) + * 22.1 [RQ.ClickHouse.ExportPartition.ParallelFormatting](#rqclickhouseexportpartitionparallelformatting) +* 23 [Controlling export performance](#controlling-export-performance) + * 23.1 [RQ.ClickHouse.ExportPartition.ServerSettings.MaxBandwidth](#rqclickhouseexportpartitionserversettingsmaxbandwidth) + * 23.2 [RQ.ClickHouse.ExportPartition.ServerSettings.BackgroundMovePoolSize](#rqclickhouseexportpartitionserversettingsbackgroundmovepoolsize) + * 23.3 [RQ.ClickHouse.ExportPartition.Metrics.Export](#rqclickhouseexportpartitionmetricsexport) +* 24 [Export operation security](#export-operation-security) + * 24.1 [RQ.ClickHouse.ExportPartition.Security.RBAC](#rqclickhouseexportpartitionsecurityrbac) + * 24.2 [RQ.ClickHouse.ExportPartition.Security.DataEncryption](#rqclickhouseexportpartitionsecuritydataencryption) + * 24.3 [RQ.ClickHouse.ExportPartition.Security.Network](#rqclickhouseexportpartitionsecuritynetwork) + * 24.4 [RQ.ClickHouse.ExportPartition.Security.CredentialManagement](#rqclickhouseexportpartitionsecuritycredentialmanagement) ## Introduction @@ -1272,6 +1565,51 @@ This command allows users to export entire partitions in a single operation, which is more efficient than exporting individual parts and ensures all data for a partition is exported together. +### RQ.ClickHouse.ExportPartition.IntoOutfile +version: 1.0 + +[ClickHouse] SHALL support the usage of the `INTO OUTFILE` clause with `EXPORT PARTITION` and SHALL not output any errors. + +For example, + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +INTO OUTFILE '/path/to/file' +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + +### RQ.ClickHouse.ExportPartition.Format +version: 1.0 + +[ClickHouse] SHALL support the usage of the `FORMAT` clause with `EXPORT PARTITION` and SHALL not output any errors. + +For example, + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +FORMAT JSON +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + +### RQ.ClickHouse.ExportPartition.SettingsClause +version: 1.0 + +[ClickHouse] SHALL support the usage of the `SETTINGS` clause with `EXPORT PARTITION` and SHALL not output any errors. + +For example, + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1, + export_merge_tree_partition_max_retries = 5 +``` + ## Supported source table engines ### RQ.ClickHouse.ExportPartition.SourceEngines @@ -1302,6 +1640,18 @@ In a replicated cluster, different parts of the same partition may exist on different replicas. The system must coordinate exports across all replicas to ensure complete partition export without duplication. 
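+
+For example, assuming `replicated_source` is a ReplicatedMergeTree table and `destination_table` is an S3-backed destination (illustrative names, following the same syntax as the other examples in this document), running the export on one replica is expected to cover the whole partition without duplicating parts stored on other replicas:
+
+```sql
+ALTER TABLE replicated_source
+EXPORT PARTITION ID '2020'
+TO TABLE destination_table
+SETTINGS allow_experimental_export_merge_tree_part = 1
+```
+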
+### RQ.ClickHouse.ExportPartition.Shards +version: 1.0 + +[ClickHouse] SHALL support exporting partitions from source tables that are on different shards than the destination table. + +### RQ.ClickHouse.ExportPartition.Versions +version: 1.0 + +[ClickHouse] SHALL support exporting partitions from source tables that are stored on servers with different ClickHouse versions than the destination server. + +Users can export partitions from tables on servers with older ClickHouse versions to tables on servers with newer versions, enabling data migration and version upgrades. + ## Supported source part storage types ### RQ.ClickHouse.ExportPartition.SourcePartStorage @@ -1348,6 +1698,28 @@ Export partition is designed to move data from local or replicated storage to object storage systems for long-term storage, analytics, or data sharing purposes. +## Temporary tables + +### RQ.ClickHouse.ExportPartition.TemporaryTable +version: 1.0 + +[ClickHouse] SHALL support exporting partitions from temporary ReplicatedMergeTree tables to destination object storage tables. + +For example, + +```sql +CREATE TEMPORARY TABLE temp_table (p UInt64, k String, d UInt64) +ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/temp_table', '{replica}') +PARTITION BY p ORDER BY k; + +INSERT INTO temp_table VALUES (2020, 'key1', 100), (2020, 'key2', 200); + +ALTER TABLE temp_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + ## Schema compatibility ### RQ.ClickHouse.ExportPartition.SchemaCompatibility @@ -1421,6 +1793,13 @@ Production systems often have partitions containing very large amounts of data, and the export must handle these efficiently without timeouts or memory issues. +### RQ.ClickHouse.ExportPartition.Corrupted +version: 1.0 + +[ClickHouse] SHALL output an error and prevent export operations from proceeding when trying to export a partition that contains corrupted parts in the source table. + +The system SHALL detect corruption in partitions containing compact parts, wide parts, or mixed part types. + ## Export operation failure handling ### RQ.ClickHouse.ExportPartition.RetryMechanism @@ -1437,6 +1816,16 @@ This setting allows users to control how many times the system will retry exporting a part before marking it as failed. +For example, + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1, + export_merge_tree_partition_max_retries = 5 +``` + ### RQ.ClickHouse.ExportPartition.ResumeAfterFailure version: 1.0 @@ -1447,6 +1836,15 @@ [ClickHouse] SHALL allow export operations to make partial progress, with successfully exported parts remaining in the destination even if other parts fail. Users SHALL be able to see which parts have been successfully exported and which parts have failed. +For example, users can query the export status to see partial progress: + +```sql +SELECT source_table, destination_table, partition_id, status, + parts_total, parts_processed, parts_failed +FROM system.replicated_partition_exports +WHERE partition_id = '2020' +``` + ### RQ.ClickHouse.ExportPartition.Cleanup version: 1.0 @@ -1459,6 +1857,16 @@ This setting only affects completed export operations and does not delete in-progress tasks. It allows users to control how long export history is maintained to prevent duplicate exports. 
+For example, + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1, + export_merge_tree_partition_manifest_ttl = 360 +``` + ## Network resilience ### RQ.ClickHouse.ExportPartition.NetworkResilience.PacketIssues @@ -1516,6 +1924,15 @@ Exporting to the same table would be redundant and could cause data duplication or conflicts. +For example, the following command SHALL output an error: + +```sql +ALTER TABLE my_table +EXPORT PARTITION ID '2020' +TO TABLE my_table +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + ### Destination table compatibility #### RQ.ClickHouse.ExportPartition.Restrictions.DestinationSupport @@ -1542,6 +1959,15 @@ Export partition is designed to move data to object storage, not to local MergeTree tables. +For example, if `local_table` is a MergeTree table, the following command SHALL output an error: + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE local_table +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + ### Partition key compatibility #### RQ.ClickHouse.ExportPartition.Restrictions.PartitionKey @@ -1554,6 +1980,15 @@ Matching partition keys ensure that exported data is organized correctly in the destination storage. +For example, if `source_table` is partitioned by `toYYYYMM(date)` and `destination_table` is partitioned by `toYYYYMMDD(date)`, the following command SHALL output an error: + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + ### Source partition availability #### RQ.ClickHouse.ExportPartition.Restrictions.SourcePartition @@ -1567,6 +2002,15 @@ The system must verify that the partition exists and contains data before attempting to export it. +For example, if partition ID '2025' does not exist in `source_table`, the following command SHALL output an error: + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2025' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1 +``` + ## Export operation concurrency ### RQ.ClickHouse.ExportPartition.Concurrency @@ -1603,6 +2047,16 @@ When set to `true`, this setting allows users to overwrite existing export entries and force re-export of a partition, even if a previous export operation exists for the same partition and destination. +For example, + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1, + export_merge_tree_partition_force_export = 1 +``` + ## Export operation logging ### RQ.ClickHouse.ExportPartition.Logging @@ -1621,6 +2075,15 @@ Detailed logging helps users monitor export progress, troubleshoot issues, and audit export operations. +For example, users can query export logs: + +```sql +SELECT event_time, event_type, table, partition, rows, bytes_read, bytes_written +FROM system.part_log +WHERE event_type = 'EXPORT_PARTITION' +ORDER BY event_time DESC +``` + ## Monitoring export operations ### RQ.ClickHouse.ExportPartition.SystemTables.Exports @@ -1641,6 +2104,15 @@ Users need visibility into export operations to monitor progress, identify issues, and understand export status across the cluster. 
+For example, + +```sql +SELECT source_table, destination_table, partition_id, status, + parts_total, parts_processed, parts_failed, create_time, update_time +FROM system.replicated_partition_exports +WHERE status = 'IN_PROGRESS' +``` + ## Enabling export functionality ### RQ.ClickHouse.ExportPartition.Settings.AllowExperimental @@ -1659,6 +2131,16 @@ This setting allows users to control whether to overwrite existing data in the destination, providing safety by default while allowing overwrites when needed. +For example, + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1, + export_merge_tree_part_overwrite_file_if_exists = 1 +``` + ## Export operation configuration ### RQ.ClickHouse.ExportPartition.ParallelFormatting From 23759abdda2d2a108d2f16b5c13019563ac3fb92 Mon Sep 17 00:00:00 2001 From: selfeer Date: Mon, 10 Nov 2025 16:58:35 +0400 Subject: [PATCH 93/99] upd --- s3/requirements/export_partition.md | 14 +++++++++ s3/requirements/export_partition.py | 44 +++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/s3/requirements/export_partition.md b/s3/requirements/export_partition.md index 0c2c2f64b..2ea943237 100644 --- a/s3/requirements/export_partition.md +++ b/s3/requirements/export_partition.md @@ -68,6 +68,7 @@ * 19.1 [RQ.ClickHouse.ExportPartition.SystemTables.Exports](#rqclickhouseexportpartitionsystemtablesexports) * 20 [Enabling export functionality](#enabling-export-functionality) * 20.1 [RQ.ClickHouse.ExportPartition.Settings.AllowExperimental](#rqclickhouseexportpartitionsettingsallowexperimental) + * 20.2 [RQ.ClickHouse.ExportPartition.Settings.AllowExperimental.Disabled](#rqclickhouseexportpartitionsettingsallowexperimentaldisabled) * 21 [Handling file conflicts during export](#handling-file-conflicts-during-export) * 21.1 [RQ.ClickHouse.ExportPartition.Settings.OverwriteFile](#rqclickhouseexportpartitionsettingsoverwritefile) * 22 [Export operation configuration](#export-operation-configuration) @@ -681,6 +682,19 @@ version: 1.0 This setting allows administrators to control access to experimental functionality and ensures users are aware they are using a feature that may change. +### RQ.ClickHouse.ExportPartition.Settings.AllowExperimental.Disabled +version: 1.0 + +[ClickHouse] SHALL prevent export partition operations when `allow_experimental_export_merge_tree_part` is set to `0` (turned off). When the setting is `0`, attempting to execute `ALTER TABLE ... EXPORT PARTITION ID ...` commands SHALL result in an error indicating that the experimental feature is not enabled. 
+ +For example, the following command SHALL output an error when the setting is `0`: + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +``` + ## Handling file conflicts during export ### RQ.ClickHouse.ExportPartition.Settings.OverwriteFile diff --git a/s3/requirements/export_partition.py b/s3/requirements/export_partition.py index 8e218fbdb..1e37b51b1 100644 --- a/s3/requirements/export_partition.py +++ b/s3/requirements/export_partition.py @@ -995,6 +995,30 @@ num="20.1", ) +RQ_ClickHouse_ExportPartition_Settings_AllowExperimental_Disabled = Requirement( + name="RQ.ClickHouse.ExportPartition.Settings.AllowExperimental.Disabled", + version="1.0", + priority=None, + group=None, + type=None, + uid=None, + description=( + "[ClickHouse] SHALL prevent export partition operations when `allow_experimental_export_merge_tree_part` is set to `0` (turned off). When the setting is `0`, attempting to execute `ALTER TABLE ... EXPORT PARTITION ID ...` commands SHALL result in an error indicating that the experimental feature is not enabled.\n" + "\n" + "For example, the following command SHALL output an error when the setting is `0`:\n" + "\n" + "```sql\n" + "ALTER TABLE source_table \n" + "EXPORT PARTITION ID '2020' \n" + "TO TABLE destination_table\n" + "```\n" + "\n" + ), + link=None, + level=2, + num="20.2", +) + RQ_ClickHouse_ExportPartition_Settings_OverwriteFile = Requirement( name="RQ.ClickHouse.ExportPartition.Settings.OverwriteFile", version="1.0", @@ -1345,6 +1369,11 @@ level=2, num="20.1", ), + Heading( + name="RQ.ClickHouse.ExportPartition.Settings.AllowExperimental.Disabled", + level=2, + num="20.2", + ), Heading(name="Handling file conflicts during export", level=1, num="21"), Heading( name="RQ.ClickHouse.ExportPartition.Settings.OverwriteFile", @@ -1428,6 +1457,7 @@ RQ_ClickHouse_ExportPartition_Logging, RQ_ClickHouse_ExportPartition_SystemTables_Exports, RQ_ClickHouse_ExportPartition_Settings_AllowExperimental, + RQ_ClickHouse_ExportPartition_Settings_AllowExperimental_Disabled, RQ_ClickHouse_ExportPartition_Settings_OverwriteFile, RQ_ClickHouse_ExportPartition_ParallelFormatting, RQ_ClickHouse_ExportPartition_ServerSettings_MaxBandwidth, @@ -1509,6 +1539,7 @@ * 19.1 [RQ.ClickHouse.ExportPartition.SystemTables.Exports](#rqclickhouseexportpartitionsystemtablesexports) * 20 [Enabling export functionality](#enabling-export-functionality) * 20.1 [RQ.ClickHouse.ExportPartition.Settings.AllowExperimental](#rqclickhouseexportpartitionsettingsallowexperimental) + * 20.2 [RQ.ClickHouse.ExportPartition.Settings.AllowExperimental.Disabled](#rqclickhouseexportpartitionsettingsallowexperimentaldisabled) * 21 [Handling file conflicts during export](#handling-file-conflicts-during-export) * 21.1 [RQ.ClickHouse.ExportPartition.Settings.OverwriteFile](#rqclickhouseexportpartitionsettingsoverwritefile) * 22 [Export operation configuration](#export-operation-configuration) @@ -2122,6 +2153,19 @@ This setting allows administrators to control access to experimental functionality and ensures users are aware they are using a feature that may change. +### RQ.ClickHouse.ExportPartition.Settings.AllowExperimental.Disabled +version: 1.0 + +[ClickHouse] SHALL prevent export partition operations when `allow_experimental_export_merge_tree_part` is set to `0` (turned off). When the setting is `0`, attempting to execute `ALTER TABLE ... EXPORT PARTITION ID ...` commands SHALL result in an error indicating that the experimental feature is not enabled. 
+ +For example, the following command SHALL output an error when the setting is `0`: + +```sql +ALTER TABLE source_table +EXPORT PARTITION ID '2020' +TO TABLE destination_table +``` + ## Handling file conflicts during export ### RQ.ClickHouse.ExportPartition.Settings.OverwriteFile From a921386e33c31019c47aed5747b8e5baed1256b0 Mon Sep 17 00:00:00 2001 From: selfeer Date: Mon, 10 Nov 2025 17:37:23 +0400 Subject: [PATCH 94/99] small update for steps --- s3/tests/export_partition/steps.py | 242 ++++++++++++++++++++++++----- 1 file changed, 204 insertions(+), 38 deletions(-) diff --git a/s3/tests/export_partition/steps.py b/s3/tests/export_partition/steps.py index 2e3abb8b2..f4c7c697c 100644 --- a/s3/tests/export_partition/steps.py +++ b/s3/tests/export_partition/steps.py @@ -1,10 +1,67 @@ +import json + from testflows.core import * from testflows.asserts import error from helpers.common import getuid from helpers.create import * from helpers.queries import * -from s3.tests.common import temporary_bucket_path -import json +from s3.tests.common import temporary_bucket_path, s3_storage + + +@TestStep(Given) +def minio_storage_configuration(self, restart=True): + """Create storage configuration with jbod disks, MinIO S3 disk, and tiered storage policy.""" + with Given( + "I configure storage with jbod disks, MinIO S3 disk, and tiered storage" + ): + disks = { + "jbod1": {"path": "/jbod1/"}, + "jbod2": {"path": "/jbod2/"}, + "jbod3": {"path": "/jbod3/"}, + "jbod4": {"path": "/jbod4/"}, + "external": {"path": "/external/"}, + "external2": {"path": "/external2/"}, + "minio": { + "type": "s3", + "endpoint": "http://minio1:9001/root/data/", + "access_key_id": "minio_user", + "secret_access_key": "minio123", + }, + "s3_cache": { + "type": "cache", + "disk": "minio", + "path": "minio_cache/", + "max_size": "22548578304", + "cache_on_write_operations": "1", + }, + } + + policies = { + "jbod1": {"volumes": {"main": {"disk": "jbod1"}}}, + "jbod2": {"volumes": {"main": {"disk": "jbod2"}}}, + "jbod3": {"volumes": {"main": {"disk": "jbod3"}}}, + "jbod4": {"volumes": {"main": {"disk": "jbod4"}}}, + "external": {"volumes": {"main": {"disk": "external"}}}, + "external2": {"volumes": {"main": {"disk": "external2"}}}, + "tiered_storage": { + "volumes": { + "hot": [ + {"disk": "jbod1"}, + {"disk": "jbod2"}, + {"max_data_part_size_bytes": "2048"}, + ], + "cold": [ + {"disk": "external"}, + {"disk": "external2"}, + ], + }, + "move_factor": "0.7", + }, + "s3_cache": {"volumes": {"external": {"disk": "s3_cache"}}}, + "minio_external_nocache": {"volumes": {"external": {"disk": "minio"}}}, + } + + s3_storage(disks=disks, policies=policies, restart=restart) def default_columns(simple=True, partition_key_type="UInt8"): @@ -94,6 +151,67 @@ def create_s3_table( return table_name +@TestStep(When) +def kill_minio(self, cluster=None, container_name="s3_env-minio1-1", signal="KILL"): + """Forcefully kill MinIO container to simulate network crash.""" + + if cluster is None: + cluster = self.context.cluster + + retry(cluster.command, 5)( + None, + f"docker kill --signal={signal} {container_name}", + timeout=60, + exitcode=0, + steps=False, + ) + + if signal == "TERM": + with And("Waiting for MinIO container to stop"): + for attempt in retries(timeout=30, delay=1): + with attempt: + result = cluster.command( + None, + f"docker ps --filter name={container_name} --format '{{{{.Names}}}}'", + timeout=10, + steps=False, + no_checks=True, + ) + if container_name not in result.output: + break + fail("MinIO container still running") + + 
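+
+# Usage sketch (illustrative only, not wired into any scenario here): kill_minio
+# above and start_minio below are intended to be paired inside a test to
+# simulate a MinIO outage while an export is in flight, e.g.:
+#
+#   with When("MinIO goes down mid-export"):
+#       kill_minio(signal="KILL")  # "KILL" simulates a crash; "TERM" stops gracefully and waits
+#   with And("MinIO comes back up"):
+#       start_minio()
+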
+@TestStep(When) +def start_minio(self, cluster=None, container_name="s3_env-minio1-1"): + """Start MinIO container and wait for it to be ready.""" + + if cluster is None: + cluster = self.context.cluster + + with By("Starting MinIO container"): + retry(cluster.command, 5)( + None, + f"docker start {container_name}", + timeout=60, + exitcode=0, + steps=True, + ) + + with And("Waiting for MinIO to be ready"): + for attempt in retries(timeout=30, delay=1): + with attempt: + result = cluster.command( + None, + f"docker exec {container_name} curl -f http://localhost:9001/minio/health/live", + timeout=10, + steps=False, + no_checks=True, + ) + if result.exitcode != 0: + fail("MinIO health check failed") + + @TestStep(When) def get_parts(self, table_name, node): """Get all parts for a table on a given node.""" @@ -103,57 +221,43 @@ def get_parts(self, table_name, node): exitcode=0, steps=True, ).output - return [row.strip() for row in output.splitlines()] + return sorted([row.strip() for row in output.splitlines()]) @TestStep(When) -def export_parts( +def export_partitions( self, source_table, destination_table, node, parts=None, exitcode=0, - explicit_set=1, + settings=None, + inline_settings=True, ): - """Export parts from a source table to a destination table on the same node. If parts are not provided, all parts will be exported.""" + """Export partitions from a source table to a destination table on the same node. If partitions are not provided, all partitions will be exported.""" if parts is None: parts = get_parts(table_name=source_table, node=node) + if inline_settings is True: + inline_settings = self.context.default_settings + no_checks = exitcode != 0 output = [] for part in parts: - if explicit_set == 1: - output.append( - node.query( - f"SET allow_experimental_export_merge_tree_part = 1; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", - exitcode=exitcode, - no_checks=no_checks, - steps=True, - ) - ) - elif explicit_set == 0: - output.append( - node.query( - f"ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", - settings=[("allow_experimental_export_merge_tree_part", 1)], - exitcode=exitcode, - no_checks=no_checks, - steps=True, - ) - ) - elif explicit_set == -1: - output.append( - node.query( - f"SET allow_experimental_export_merge_tree_part = 0; ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", - exitcode=exitcode, - no_checks=no_checks, - steps=True, - ) + output.append( + node.query( + f"ALTER TABLE {source_table} EXPORT PART '{part}' TO TABLE {destination_table}", + exitcode=exitcode, + no_checks=no_checks, + steps=True, + settings=settings, + inline_settings=inline_settings, ) + ) return output @@ -173,16 +277,40 @@ def get_export_events(self, node): event = json.loads(line) events[event["name"]] = int(event["value"]) + if "PartsExportFailures" not in events: + events["PartsExportFailurget_export_eventses"] = 0 + if "PartsExports" not in events: + events["PartsExports"] = 0 + if "PartsExportDuplicated" not in events: + events["PartsExportDuplicated"] = 0 + return events @TestStep(When) -def drop_column(self, node, table_name, column_name): - """Drop a column from a table.""" +def get_part_log(self, node): + """Get the part log from the system.part_log table of a given node.""" + + output = node.query( + "SELECT part_name FROM system.part_log WHERE event_type = 'ExportPart'", + exitcode=0, + steps=True, + ).output.splitlines() + + return output - node.query( - f"ALTER TABLE {table_name} DROP 
COLUMN {column_name}", exitcode=0, steps=True - ) + +@TestStep(When) +def get_system_exports(self, node): + """Get the system.exports source and destination table columns for all ongoing exports.""" + + exports = node.query( + "SELECT source_table, destination_table FROM system.exports", + exitcode=0, + steps=True, + ).output.splitlines() + + return [line.strip().split("\t") for line in exports] @TestStep(Then) @@ -201,3 +329,41 @@ def source_matches_destination( table_name=destination_table, node=destination_node ) assert source_data == destination_data, error() + + +@TestStep(Then) +def verify_export_concurrency(self, node, source_tables): + """Verify exget_export_eventsports from different tables ran concurrently by checking overlapping execution times. + + Checks that for each table, there's at least one pair of consecutive exports from that table + with an export from another table in between, confirming concurrent execution. + """ + + table_filter = " OR ".join([f"table = '{table}'" for table in source_tables]) + + query = f""" + SELECT + table + FROM system.part_log + WHERE event_type = 'ExportPart' + AND ({table_filter}) + ORDER BY event_time_microseconds + """ + + result = node.query(query, exitcode=0, steps=True) + + exports = [line for line in result.output.strip().splitlines()] + + tables_done = set() + + for i in range(len(exports) - 1): + current_table = exports[i] + next_table = exports[i + 1] + + if current_table != next_table and current_table not in tables_done: + for j in range(i + 2, len(exports)): + if exports[j] == current_table: + tables_done.add(current_table) + break + + assert len(tables_done) == len(source_tables), error() From 57178ddabbe1de61e0b2d515bbd39c4f38b9b350 Mon Sep 17 00:00:00 2001 From: selfeer Date: Mon, 10 Nov 2025 17:39:27 +0400 Subject: [PATCH 95/99] add specification --- s3/tests/export_partition/feature.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/s3/tests/export_partition/feature.py b/s3/tests/export_partition/feature.py index fb96dc454..b04d2bcf7 100644 --- a/s3/tests/export_partition/feature.py +++ b/s3/tests/export_partition/feature.py @@ -1,8 +1,9 @@ from testflows.core import * +from s3.requirements.export_partition import * @TestFeature -@Specifications() +@Specifications(SRS_016_ClickHouse_Export_Partition_to_S3) @Requirements() @Name("export partition") def minio(self, uri, bucket_prefix): @@ -12,9 +13,9 @@ def minio(self, uri, bucket_prefix): self.context.bucket_prefix = bucket_prefix Feature(run=load("s3.tests.export_partition.sanity", "feature")) - # Feature(run=load("s3.tests.export_part.error_handling", "feature")) - # Feature(run=load("s3.tests.export_part.clusters_nodes", "feature")) - # Feature(run=load("s3.tests.export_part.engines_volumes", "feature")) - # Feature(run=load("s3.tests.export_part.datatypes", "feature")) - # Feature(run=load("s3.tests.export_part.concurrency_networks", "feature")) - # Feature(run=load("s3.tests.export_part.system_monitoring", "feature")) + Feature(run=load("s3.tests.export_partition.error_handling", "feature")) + Feature(run=load("s3.tests.export_partition.clusters_nodes", "feature")) + Feature(run=load("s3.tests.export_partition.engines_volumes", "feature")) + Feature(run=load("s3.tests.export_partition.datatypes", "feature")) + Feature(run=load("s3.tests.export_partition.concurrency_networks", "feature")) + Feature(run=load("s3.tests.export_partition.system_monitoring", "feature")) From d0881f51147251ca4ed19b293d9c7020cdcd2240 Mon Sep 17 00:00:00 2001 
From: selfeer Date: Mon, 10 Nov 2025 17:44:37 +0400 Subject: [PATCH 96/99] fix sanity --- s3/tests/export_partition/feature.py | 1 + s3/tests/export_partition/sanity.py | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/s3/tests/export_partition/feature.py b/s3/tests/export_partition/feature.py index b04d2bcf7..60053f5f8 100644 --- a/s3/tests/export_partition/feature.py +++ b/s3/tests/export_partition/feature.py @@ -11,6 +11,7 @@ def minio(self, uri, bucket_prefix): self.context.uri_base = uri self.context.bucket_prefix = bucket_prefix + self.context.default_settings = [("allow_experimental_export_merge_tree_part", 1)] Feature(run=load("s3.tests.export_partition.sanity", "feature")) Feature(run=load("s3.tests.export_partition.error_handling", "feature")) diff --git a/s3/tests/export_partition/sanity.py b/s3/tests/export_partition/sanity.py index 98414e622..3bba982c2 100644 --- a/s3/tests/export_partition/sanity.py +++ b/s3/tests/export_partition/sanity.py @@ -8,6 +8,7 @@ from alter.table.replace_partition.partition_types import ( table_with_compact_and_wide_parts, ) +from s3.tests.export_partition.steps import export_partitions @TestScenario @@ -234,11 +235,11 @@ def large_export(self): def feature(self): """Check basic functionality of exporting data parts to S3 storage.""" - # Scenario(run=empty_table) + Scenario(run=empty_table) Scenario(run=basic_table) - # Scenario(run=no_partition_by) - # Scenario(run=mismatched_columns) - # Scenario(run=wide_and_compact_parts) + Scenario(run=no_partition_by) + Scenario(run=mismatched_columns) + Scenario(run=wide_and_compact_parts) # if self.context.stress: # Scenario(run=large_export) # Scenario(run=export_setting) # This test fails because of an actual bug in the export setting From df4570b01979bdb2bfbd5529d0f8954514f573d3 Mon Sep 17 00:00:00 2001 From: selfeer Date: Mon, 10 Nov 2025 17:48:08 +0400 Subject: [PATCH 97/99] fix sanity 2 --- s3/tests/export_partition/sanity.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/s3/tests/export_partition/sanity.py b/s3/tests/export_partition/sanity.py index 3bba982c2..3463bdd8d 100644 --- a/s3/tests/export_partition/sanity.py +++ b/s3/tests/export_partition/sanity.py @@ -240,6 +240,5 @@ def feature(self): Scenario(run=no_partition_by) Scenario(run=mismatched_columns) Scenario(run=wide_and_compact_parts) - # if self.context.stress: - # Scenario(run=large_export) - # Scenario(run=export_setting) # This test fails because of an actual bug in the export setting + if self.context.stress: + Scenario(run=large_export) From e23ed61cbcdd2d851470843a75835efa6e5f4d91 Mon Sep 17 00:00:00 2001 From: Julian Huang Date: Mon, 10 Nov 2025 09:46:57 -0500 Subject: [PATCH 98/99] Network rate limit for consistency in system exports --- s3/tests/export_part/system_monitoring.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/s3/tests/export_part/system_monitoring.py b/s3/tests/export_part/system_monitoring.py index 49054c420..cfbd555bf 100644 --- a/s3/tests/export_part/system_monitoring.py +++ b/s3/tests/export_part/system_monitoring.py @@ -1,8 +1,11 @@ +from time import sleep + from testflows.core import * from testflows.asserts import error from s3.tests.export_part.steps import * from s3.requirements.export_part import * -from time import sleep +from alter.stress.tests.tc_netem import * + @TestScenario @@ -104,6 +107,9 @@ def system_exports_logging(self): number_of_values=1000000, ) s3_table_name = create_s3_table(table_name="s3", create_new_bucket=True) 
+ + with And("I slow down the network speed"): + network_packet_rate_limit(node=self.context.node, rate_mbit=250) with When("I export parts to the S3 table"): export_parts( From 98acba4ae0fbc9e044fe524036ccf0bc62342dc7 Mon Sep 17 00:00:00 2001 From: Selfeer Date: Tue, 11 Nov 2025 16:55:30 +0400 Subject: [PATCH 99/99] update skips --- s3/regression.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/s3/regression.py b/s3/regression.py index 274a168dd..049f8cb7f 100755 --- a/s3/regression.py +++ b/s3/regression.py @@ -501,6 +501,16 @@ "doesn't work <22.8", check_clickhouse_version("<22.8"), ), + "/:/:/part 3/export part/*": ( + Skip, + "Export part introduced in Antalya build", + check_if_not_antalya_build, + ), + "/:/:/part 3/export partition/*": ( + Skip, + "Export partition introduced in Antalya build", + check_if_not_antalya_build, + ), } @@ -552,9 +562,9 @@ def minio_regression( for node in nodes["clickhouse"]: experimental_analyzer(node=cluster.node(node), with_analyzer=with_analyzer) - with And("I install tc-netem on all clickhouse nodes"): - for node in self.context.nodes: - node.command("apt install --yes iproute2 procps") + # with And("I install tc-netem on all clickhouse nodes"): + # for node in self.context.nodes: + # node.command("apt install --yes iproute2 procps") with And("allow higher cpu_wait_ratio "): if check_clickhouse_version(">=25.4")(self):