66from datetime import datetime
77
88import sqlalchemy
9+ import sqlalchemy as sa
910from crate .client .sqlalchemy .types import ObjectType , ObjectTypeImpl , _ObjectArray
1011from singer_sdk import typing as th
11- from sqlalchemy . dialects . postgresql import ARRAY , BIGINT
12+ from singer_sdk . helpers . _typing import is_array_type , is_boolean_type , is_integer_type , is_number_type , is_object_type
1213from sqlalchemy .types import (
14+ ARRAY ,
15+ BIGINT ,
1316 BOOLEAN ,
1417 DATE ,
1518 DATETIME ,
1619 DECIMAL ,
20+ FLOAT ,
1721 INTEGER ,
1822 TEXT ,
1923 TIME ,
2226)
2327from target_postgres .connector import NOTYPE , PostgresConnector
2428
25- from target_cratedb .patch import polyfill_refresh_after_dml_engine
29+ from target_cratedb .sqlalchemy .patch import polyfill_refresh_after_dml_engine
30+ from target_cratedb .sqlalchemy .vector import FloatVector
2631
2732
2833class CrateDBConnector (PostgresConnector ):
@@ -111,8 +116,52 @@ def pick_individual_type(jsonschema_type: dict):
111116 if "object" in jsonschema_type ["type" ]:
112117 return ObjectType
113118 if "array" in jsonschema_type ["type" ]:
114- # TODO: Handle other inner-types as well?
119+ # Select between different kinds of `ARRAY` data types.
120+ #
121+ # This currently leverages an unspecified definition for the Singer SCHEMA,
122+ # using the `additionalProperties` attribute to convey additional type
123+ # information, agnostic of the target database.
124+ #
125+ # In this case, it is about telling different kinds of `ARRAY` types apart:
126+ # Either it is a vanilla `ARRAY`, to be stored into a `jsonb[]` type, or,
127+ # alternatively, it can be a "vector" kind `ARRAY` of floating point
128+ # numbers, effectively what pgvector is storing in its `VECTOR` type.
129+ #
130+ # Still, `type: "vector"` is only a surrogate label here, because other
131+ # database systems may use different types for implementing the same thing,
132+ # and need to translate accordingly.
133+ """
134+ Schema override rule in `meltano.yml`:
135+
136+ type: "array"
137+ items:
138+ type: "number"
139+ additionalProperties:
140+ storage:
141+ type: "vector"
142+ dim: 4
143+
144+ Produced schema annotation in `catalog.json`:
145+
146+ {"type": "array",
147+ "items": {"type": "number"},
148+ "additionalProperties": {"storage": {"type": "vector", "dim": 4}}}
149+ """
150+ if "additionalProperties" in jsonschema_type and "storage" in jsonschema_type ["additionalProperties" ]:
151+ storage_properties = jsonschema_type ["additionalProperties" ]["storage" ]
152+ if "type" in storage_properties and storage_properties ["type" ] == "vector" :
153+ # On PostgreSQL/pgvector, use the corresponding type definition
154+ # from its SQLAlchemy dialect.
155+ return FloatVector (storage_properties ["dim" ])
156+
157+ # Discover/translate inner types.
158+ inner_type = resolve_array_inner_type (jsonschema_type )
159+ if inner_type is not None :
160+ return ARRAY (inner_type )
161+
162+ # When type discovery fails, assume `TEXT`.
115163 return ARRAY (TEXT ())
164+
116165 if jsonschema_type .get ("format" ) == "date-time" :
117166 return TIMESTAMP ()
118167 individual_type = th .to_sql_type (jsonschema_type )
@@ -139,20 +188,18 @@ def pick_best_sql_type(sql_type_array: list):
139188 DATE ,
140189 TIME ,
141190 DECIMAL ,
191+ FLOAT ,
142192 BIGINT ,
143193 INTEGER ,
144194 BOOLEAN ,
145195 NOTYPE ,
146196 ARRAY ,
147- ObjectType ,
197+ FloatVector ,
198+ ObjectTypeImpl ,
148199 ]
149200
150201 for sql_type in precedence_order :
151202 for obj in sql_type_array :
152- # FIXME: Workaround. Currently, ObjectType can not be resolved back to a type?
153- # TypeError: isinstance() arg 2 must be a type, a tuple of types, or a union
154- if isinstance (sql_type , ObjectTypeImpl ):
155- return ObjectType
156203 if isinstance (obj , sql_type ):
157204 return obj
158205 return TEXT ()
@@ -188,6 +235,8 @@ def _get_type_sort_key(
188235
189236 if isinstance (sql_type , _ObjectArray ):
190237 return 0 , _len
238+ if isinstance (sql_type , FloatVector ):
239+ return 0 , _len
191240 if isinstance (sql_type , NOTYPE ):
192241 return 0 , _len
193242
@@ -245,3 +294,18 @@ def prepare_schema(self, schema_name: str) -> None:
245294 Don't emit `CREATE SCHEMA` statements to CrateDB.
246295 """
247296 pass
297+
298+
def resolve_array_inner_type(jsonschema_type: dict) -> t.Union[sa.types.TypeEngine, None]:
    """
    Translate the `items` definition of a JSON Schema array into an SQL type.

    :param jsonschema_type: JSON Schema fragment describing an array,
        for example `{"type": "array", "items": {"type": "number"}}`.
    :return: An SQLAlchemy type instance for the array's element type, or
        `None` when there is no usable `items` definition or its type is
        not one of boolean, number, integer, object, or array.

    For nested arrays, the element type of the innermost array is returned.
    SQLAlchemy's `ARRAY` expects the scalar base type also for
    multi-dimensional arrays, so the result can be handed to `ARRAY(...)`
    as-is.
    """
    items = jsonschema_type.get("items") if isinstance(jsonschema_type, dict) else None

    # Only a schema dictionary can be inspected by the type helpers below.
    # Anything else counts as "type discovery failed".
    if not isinstance(items, dict):
        return None

    if is_boolean_type(items):
        return BOOLEAN()
    if is_number_type(items):
        return FLOAT()
    if is_integer_type(items):
        return BIGINT()
    if is_object_type(items):
        return ObjectType()
    if is_array_type(items):
        # Recurse into the nested array's schema. Passing `items["type"]`
        # (the plain type label) would never find an `items` key, so nested
        # arrays could not be resolved, and it would raise a `KeyError` for
        # schemas without a `type` attribute.
        return resolve_array_inner_type(items)
    return None
0 commit comments