Skip to content

Commit 9666347

Browse files
authored
Fix MSSQLToGCSOperator MSSQL BIT data type conversion to Parquet boolean (#57514)
* Fix MSSQLToGCSOperator MSSQL BIT data type conversion to Parquet boolean

  Changes type_map from "BOOLEAN" to "BOOL" to match the expected schema type in BaseSQLToGCSOperator._convert_parquet_schema(). This fixes issue #57461, where exporting MSSQL bit fields to Parquet format would raise `ArrowTypeError: Expected bytes, got a 'bool' object`.

* Add a test case to verify that MSSQL BIT fields are mapped to boolean in the Parquet schema
1 parent 51e2c88 commit 9666347

File tree

2 files changed

+55
-4
lines changed

2 files changed

+55
-4
lines changed

providers/google/src/airflow/providers/google/cloud/transfers/mssql_to_gcs.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -67,7 +67,7 @@ class MSSQLToGCSOperator(BaseSQLToGCSOperator):
6767

6868
ui_color = "#e0a98c"
6969

70-
type_map = {2: "BOOLEAN", 3: "INTEGER", 4: "TIMESTAMP", 5: "NUMERIC"}
70+
type_map = {2: "BOOL", 3: "INTEGER", 4: "TIMESTAMP", 5: "NUMERIC"}
7171

7272
def __init__(
7373
self,

providers/google/tests/unit/google/cloud/transfers/test_mssql_to_gcs.py

Lines changed: 54 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@
3535
SQL = "select 1"
3636
BUCKET = "gs://test"
3737
JSON_FILENAME = "test_{}.ndjson"
38+
PARQUET_FILENAME = "test_{}.parquet"
3839
GZIP = False
3940

4041
ROWS = [
@@ -57,14 +58,14 @@
5758
SCHEMA_JSON = [
5859
b'[{"mode": "NULLABLE", "name": "some_str", "type": "STRING"}, ',
5960
b'{"mode": "NULLABLE", "name": "some_num", "type": "INTEGER"}, ',
60-
b'{"mode": "NULLABLE", "name": "some_binary", "type": "BOOLEAN"}, ',
61-
b'{"mode": "NULLABLE", "name": "some_bit", "type": "BOOLEAN"}]',
61+
b'{"mode": "NULLABLE", "name": "some_binary", "type": "BOOL"}, ',
62+
b'{"mode": "NULLABLE", "name": "some_bit", "type": "BOOL"}]',
6263
]
6364

6465
SCHEMA_JSON_BIT_FIELDS = [
6566
b'[{"mode": "NULLABLE", "name": "some_str", "type": "STRING"}, ',
6667
b'{"mode": "NULLABLE", "name": "some_num", "type": "INTEGER"}, ',
67-
b'{"mode": "NULLABLE", "name": "some_binary", "type": "BOOLEAN"}, ',
68+
b'{"mode": "NULLABLE", "name": "some_binary", "type": "BOOL"}, ',
6869
b'{"mode": "NULLABLE", "name": "some_bit", "type": "INTEGER"}]',
6970
]
7071

@@ -254,3 +255,53 @@ def db_hook(self):
254255
assert len(lineage.job_facets) == 1
255256
assert lineage.job_facets["sql"].query == sql
256257
assert lineage.run_facets == {}
258+
259+
@mock.patch("airflow.providers.google.cloud.transfers.mssql_to_gcs.MsSqlHook")
260+
@mock.patch("airflow.providers.google.cloud.transfers.sql_to_gcs.GCSHook")
261+
def test_bit_to_boolean_field_conversion(self, gcs_hook_mock_class, mssql_hook_mock_class):
262+
"""Test successful run of execute function for Parquet format with boolean fields.
263+
264+
This test verifies that MSSQL tables with columns of type "BIT" can be exported
265+
using the bit_fields parameter, resulting in boolean fields in the Parquet file.
266+
"""
267+
import pyarrow
268+
269+
op = MSSQLToGCSOperator(
270+
task_id=TASK_ID,
271+
mssql_conn_id=MSSQL_CONN_ID,
272+
sql=SQL,
273+
bucket=BUCKET,
274+
filename=PARQUET_FILENAME,
275+
export_format="parquet",
276+
bit_fields=["some_binary", "some_bit"],
277+
)
278+
279+
mssql_hook_mock = mssql_hook_mock_class.return_value
280+
mssql_hook_mock.get_conn().cursor().__iter__.return_value = iter(ROWS)
281+
mssql_hook_mock.get_conn().cursor().description = CURSOR_DESCRIPTION
282+
283+
gcs_hook_mock = gcs_hook_mock_class.return_value
284+
285+
upload_called = False
286+
287+
def _assert_upload(bucket, obj, tmp_filename, mime_type=None, gzip=False, metadata=None):
288+
nonlocal upload_called
289+
upload_called = True
290+
assert bucket == BUCKET
291+
assert obj == PARQUET_FILENAME.format(0)
292+
assert mime_type == "application/octet-stream"
293+
assert gzip == GZIP
294+
295+
parquet_file = pyarrow.parquet.ParquetFile(tmp_filename)
296+
schema = parquet_file.schema_arrow
297+
# Verify that bit fields are mapped to boolean type in parquet schema
298+
assert schema.field("some_binary").type.equals(pyarrow.bool_())
299+
assert schema.field("some_bit").type.equals(pyarrow.bool_())
300+
301+
gcs_hook_mock.upload.side_effect = _assert_upload
302+
303+
op.execute(None)
304+
305+
assert upload_called, "Expected upload to be called"
306+
mssql_hook_mock_class.assert_called_once_with(mssql_conn_id=MSSQL_CONN_ID)
307+
mssql_hook_mock.get_conn().cursor().execute.assert_called_once_with(SQL)

0 commit comments

Comments
 (0)