
Commit 6aac778

use TypeInfo to serialize to MetacatType instead of ObjectInspector (#585)

Authored by yingjianwu98 and Yingjian Wu
Co-authored-by: Yingjian Wu <[email protected]>
1 parent 45f38e0

File tree

4 files changed: +333, -89 lines

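In short: the converter now walks Hive's TypeInfo tree directly instead of materializing an ObjectInspector first. The standard struct ObjectInspector lower-cases field names, which is why the old code needed the SameCaseStandardStructObjectInspector wrapper removed below; a TypeInfo keeps field names exactly as parsed. A minimal sketch of that difference, assuming a hypothetical demo class and type string that are not part of the commit:

import java.util.List;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

// Hypothetical demo class, not part of this commit.
public final class FieldCaseDemo {
    public static void main(final String[] args) {
        final TypeInfo typeInfo =
            TypeInfoUtils.getTypeInfoFromTypeString("struct<Field1:bigint,nestedField2:string>");

        // TypeInfo keeps the field names as written in the type string.
        final List<String> typeInfoNames = ((StructTypeInfo) typeInfo).getAllStructFieldNames();
        System.out.println(typeInfoNames); // expected: [Field1, nestedField2]

        // The standard struct ObjectInspector lower-cases them, which is what the
        // removed SameCaseStandardStructObjectInspector wrapper worked around.
        final ObjectInspector oi =
            TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
        ((StandardStructObjectInspector) oi).getAllStructFieldRefs()
            .forEach(f -> System.out.println(f.getFieldName())); // expected: field1, nestedfield2
    }
}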

metacat-connector-hive/src/main/java/com/netflix/metacat/connector/hive/converters/HiveTypeConverter.java

Lines changed: 40 additions & 87 deletions
@@ -31,20 +31,16 @@
 import com.netflix.metacat.common.type.VarcharType;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.hadoop.hive.serde.serdeConstants;
-import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
-import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.StructField;
-import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
 import org.apache.iceberg.PartitionField;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.types.Types;
@@ -78,24 +74,20 @@ public class HiveTypeConverter implements ConnectorTypeConverter {
     private static final Pattern DECIMAL_TYPE
         = Pattern.compile(DECIMAL_WITH_SCALE + "|" + DECIMAL_WITH_SCALE_AND_PRECISION, Pattern.CASE_INSENSITIVE);

-    private static Type getPrimitiveType(final ObjectInspector fieldInspector) {
-        final PrimitiveCategory primitiveCategory = ((PrimitiveObjectInspector) fieldInspector)
-            .getPrimitiveCategory();
+    private static Type getPrimitiveType(final TypeInfo typeInfo) {
+        final PrimitiveCategory primitiveCategory = ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory();
         if (HiveTypeMapping.getHIVE_TO_CANONICAL().containsKey(primitiveCategory.name())) {
             return HiveTypeMapping.getHIVE_TO_CANONICAL().get(primitiveCategory.name());
         }
         switch (primitiveCategory) {
             case DECIMAL:
-                final DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) ((PrimitiveObjectInspector) fieldInspector)
-                    .getTypeInfo();
+                final DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo;
                 return DecimalType.createDecimalType(decimalTypeInfo.precision(), decimalTypeInfo.getScale());
             case CHAR:
-                final int cLength = ((CharTypeInfo) ((PrimitiveObjectInspector)
-                    fieldInspector).getTypeInfo()).getLength();
+                final int cLength = ((CharTypeInfo) typeInfo).getLength();
                 return CharType.createCharType(cLength);
             case VARCHAR:
-                final int vLength = ((VarcharTypeInfo) ((PrimitiveObjectInspector) fieldInspector)
-                    .getTypeInfo()).getLength();
+                final int vLength = ((VarcharTypeInfo) typeInfo).getLength();
                 return VarcharType.createVarcharType(vLength);
             default:
                 return null;
@@ -106,17 +98,7 @@ private static Type getPrimitiveType(final ObjectInspector fieldInspector) {
     public Type toMetacatType(final String type) {
         // Hack to fix presto "varchar" type coming in with no length which is required by Hive.
         final TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(sanitizeType(type));
-        ObjectInspector oi = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
-        // The standard struct object inspector forces field names to lower case, however in Metacat we need to preserve
-        // the original case of the struct fields so we wrap it with our wrapper to force the fieldNames to keep
-        // their original case
-        if (typeInfo.getCategory().equals(ObjectInspector.Category.STRUCT)) {
-            final StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
-            final StandardStructObjectInspector objectInspector = (StandardStructObjectInspector) oi;
-            oi = new HiveTypeConverter.SameCaseStandardStructObjectInspector(
-                structTypeInfo.getAllStructFieldNames(), objectInspector);
-        }
-        return getCanonicalType(oi);
+        return getCanonicalType(typeInfo);
     }

     /**
@@ -305,43 +287,48 @@ public static String sanitizeType(final String type) {
     /**
      * Returns the canonical type.
      *
-     * @param fieldInspector inspector
-     * @return type
+     * @param typeInfo typeInfo
+     * @return Metacat Type
      */
-    Type getCanonicalType(final ObjectInspector fieldInspector) {
-        switch (fieldInspector.getCategory()) {
+    Type getCanonicalType(final TypeInfo typeInfo) {
+        switch (typeInfo.getCategory()) {
             case PRIMITIVE:
-                return getPrimitiveType(fieldInspector);
+                return getPrimitiveType(typeInfo);
             case MAP:
-                final MapObjectInspector mapObjectInspector =
-                    TypeUtils.checkType(fieldInspector, MapObjectInspector.class,
-                        "fieldInspector");
-                final Type keyType = getCanonicalType(mapObjectInspector.getMapKeyObjectInspector());
-                final Type valueType = getCanonicalType(mapObjectInspector.getMapValueObjectInspector());
+                final MapTypeInfo mapTypeInfo =
+                    TypeUtils.checkType(typeInfo, MapTypeInfo.class, "typeInfo");
+                final Type keyType = getCanonicalType(mapTypeInfo.getMapKeyTypeInfo());
+                final Type valueType = getCanonicalType(mapTypeInfo.getMapValueTypeInfo());
                 if (keyType == null || valueType == null) {
                     return null;
                 }
                 return TypeRegistry.getTypeRegistry().getParameterizedType(TypeEnum.MAP,
                     ImmutableList.of(keyType.getTypeSignature(), valueType.getTypeSignature()), ImmutableList.of());
             case LIST:
-                final ListObjectInspector listObjectInspector =
-                    TypeUtils.checkType(fieldInspector, ListObjectInspector.class,
-                        "fieldInspector");
+                final ListTypeInfo listTypeInfo =
+                    TypeUtils.checkType(typeInfo, ListTypeInfo.class, "typeInfo");
                 final Type elementType =
-                    getCanonicalType(listObjectInspector.getListElementObjectInspector());
+                    getCanonicalType(listTypeInfo.getListElementTypeInfo());
                 if (elementType == null) {
                     return null;
                 }
                 return TypeRegistry.getTypeRegistry().getParameterizedType(TypeEnum.ARRAY,
                     ImmutableList.of(elementType.getTypeSignature()), ImmutableList.of());
             case STRUCT:
-                final StructObjectInspector structObjectInspector =
-                    TypeUtils.checkType(fieldInspector, StructObjectInspector.class, "fieldInspector");
-                final List<TypeSignature> fieldTypes = new ArrayList<>();
-                final List<Object> fieldNames = new ArrayList<>();
-                for (StructField field : structObjectInspector.getAllStructFieldRefs()) {
-                    fieldNames.add(field.getFieldName());
-                    final Type fieldType = getCanonicalType(field.getFieldObjectInspector());
+                final StructTypeInfo structTypeInfo =
+                    TypeUtils.checkType(typeInfo, StructTypeInfo.class, "typeInfo");
+                // Hive struct type infos
+                final List<String> structFieldNames = structTypeInfo.getAllStructFieldNames();
+                final List<TypeInfo> structFieldTypeInfos = structTypeInfo.getAllStructFieldTypeInfos();
+                final int structInfoCounts = structFieldNames.size();
+
+                // Metacat canonical type infos
+                final List<TypeSignature> fieldTypes = new ArrayList<>(structInfoCounts);
+                final List<Object> fieldNames = new ArrayList<>(structInfoCounts);
+
+                for (int i = 0; i < structInfoCounts; i++) {
+                    fieldNames.add(structFieldNames.get(i));
+                    final Type fieldType = getCanonicalType(structFieldTypeInfos.get(i));
                     if (fieldType == null) {
                         return null;
                     }
@@ -350,42 +337,8 @@ Type getCanonicalType(final ObjectInspector fieldInspector) {
                 return TypeRegistry.getTypeRegistry()
                     .getParameterizedType(TypeEnum.ROW, fieldTypes, fieldNames);
             default:
-                log.info("Currently unsupported type {}, returning Unknown type", fieldInspector.getTypeName());
+                log.info("Currently unsupported type {}, returning Unknown type", typeInfo.getTypeName());
                 return BaseType.UNKNOWN;
         }
     }
-
-    // This is protected and extends StandardStructObjectInspector so it can reference MyField
-    protected static class SameCaseStandardStructObjectInspector extends StandardStructObjectInspector {
-        private final List<String> realFieldNames;
-        private final StandardStructObjectInspector structObjectInspector;
-
-        public SameCaseStandardStructObjectInspector(final List<String> realFieldNames,
-                                                     final StandardStructObjectInspector structObjectInspector) {
-            this.realFieldNames = realFieldNames;
-            this.structObjectInspector = structObjectInspector;
-        }
-
-        @Override
-        public List<? extends StructField> getAllStructFieldRefs() {
-            return structObjectInspector.getAllStructFieldRefs()
-                .stream()
-                .map(structField -> (MyField) structField)
-                .map(field -> new HiveTypeConverter.
-                    SameCaseStandardStructObjectInspector.SameCaseMyField(field.getFieldID(),
-                    realFieldNames.get(field.getFieldID()),
-                    field.getFieldObjectInspector(), field.getFieldComment()))
-                .collect(Collectors.toList());
-        }
-
-        protected static class SameCaseMyField extends MyField {
-            public SameCaseMyField(final int fieldID, final String fieldName,
-                                   final ObjectInspector fieldObjectInspector,
-                                   final String fieldComment) {
-                super(fieldID, fieldName, fieldObjectInspector, fieldComment);
-                // Since super lower cases fieldName, this is to restore the original case
-                this.fieldName = fieldName;
-            }
-        }
-    }
 }
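Taken together, toMetacatType now parses the type string into a TypeInfo and hands it straight to getCanonicalType, so struct field case survives the conversion. A rough usage sketch of the converter's public surface as it appears in this diff (package names inferred from the imports above; the type string is illustrative):

import com.netflix.metacat.common.type.Type;
import com.netflix.metacat.connector.hive.converters.HiveTypeConverter;

// Illustrative only; mirrors what the new Groovy test below asserts.
public final class ConverterRoundTrip {
    public static void main(final String[] args) {
        final HiveTypeConverter converter = new HiveTypeConverter();
        // Struct field names keep their original case in the resulting ROW type.
        final Type rowType =
            converter.toMetacatType("struct<Field1:bigint,field3:struct<nested_Field1:bigint>>");
        // Serializing back out preserves the case as well.
        System.out.println(converter.fromMetacatTypeToJson(rowType));
    }
}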

metacat-connector-hive/src/test/groovy/com/netflix/metacat/connector/hive/converters/HiveTypeConverterSpec.groovy

Lines changed: 22 additions & 0 deletions
@@ -119,6 +119,11 @@ class HiveTypeConverterSpec extends Specification {
             "struct<prediction_date:int,lower_confidence_amt:double,upper_confidence_amt:double,model_short_name:string>",
             "struct<prediction_date:int,lower_confidence_amt:int,upper_confidence_amt:int,model_short_name:string>",
             "struct<prediction_date:int,prediction_source:string>",
+
+            // Nested Type with UpperCase
+            'array<struct<date:string,countryCodes:array<string>,source:string>>',
+            "struct<Field3:struct<Nested_Field1:bigint,Nested_Field2:bigint>>",
+            "struct<Field1:bigint,Field2:bigint,field3:struct<NESTED_Field1:bigint,NesteD_Field2:bigint>>"
         ]
     }

@@ -218,6 +223,10 @@ class HiveTypeConverterSpec extends Specification {
         "struct<prediction_date:int,lower_confidence_amt:double,upper_confidence_amt:double,model_short_name:string>" || "struct<prediction_date:int,lower_confidence_amt:double,upper_confidence_amt:double,model_short_name:string>"
         "struct<prediction_date:int,lower_confidence_amt:int,upper_confidence_amt:int,model_short_name:string>" || "struct<prediction_date:int,lower_confidence_amt:int,upper_confidence_amt:int,model_short_name:string>"
         "struct<prediction_date:int,prediction_source:string>" || "struct<prediction_date:int,prediction_source:string>"
+
+        'array<struct<field2:decimal(38 ),countryCodes:array<string>,source:string>>' || 'array<struct<field2:decimal(38),countryCodes:array<string>,source:string>>'
+        "struct<Field3:struct<Nested_Field1:bigint,Nested_Field2:bigint,Nested_FIELD3:decimal( 38, 9)>>" || "struct<Field3:struct<Nested_Field1:bigint,Nested_Field2:bigint,Nested_FIELD3:decimal(38,9)>>"
+        "struct<Field1:decimal (38,9 ),Field2:bigint,field3:struct<NESTED_Field1:decimal ( 38,9 ),NesteD_Field2:bigint>>" || "struct<Field1:decimal(38,9),Field2:bigint,field3:struct<NESTED_Field1:decimal(38,9),NesteD_Field2:bigint>>"
     }

     @Unroll
@@ -233,4 +242,17 @@
         ]
     }

+    @Unroll
+    def 'case reserve fieldName Fidelity'(String typeString, String expectedString) {
+        expect:
+        def result = converter.fromMetacatTypeToJson(converter.toMetacatType(typeString)).toString()
+
+        assert result == expectedString
+
+        where:
+        typeString | expectedString
+        "struct<Field1:bigint,Field2:bigint,field3:struct<nested_Field1:bigint,nested_Field2:bigint>>" | """{"type":"row","fields":[{"name":"Field1","type":"bigint"},{"name":"Field2","type":"bigint"},{"name":"field3","type":{"type":"row","fields":[{"name":"nested_Field1","type":"bigint"},{"name":"nested_Field2","type":"bigint"}]}}]}"""
+        "array<struct<date:string,countryCodes:array<string>,source:string>>" | """{"type":"array","elementType":{"type":"row","fields":[{"name":"date","type":"string"},{"name":"countryCodes","type":{"type":"array","elementType":"string"}},{"name":"source","type":"string"}]}}"""
+        "array<struct<Date:string,nestedArray:array<struct<date:string,countryCodes:array<string>,source:string>>>>" | """{"type":"array","elementType":{"type":"row","fields":[{"name":"Date","type":"string"},{"name":"nestedArray","type":{"type":"array","elementType":{"type":"row","fields":[{"name":"date","type":"string"},{"name":"countryCodes","type":{"type":"array","elementType":"string"}},{"name":"source","type":"string"}]}}}]}}"""
+    }
 }
Lines changed: 157 additions & 0 deletions
@@ -0,0 +1,157 @@
{
  "format-version" : 1,
  "table-uuid" : "6d9ede8f-61cb-469e-a590-4757602df691",
  "location" : "file:/tmp/data",
  "last-updated-ms" : 1712701649817,
  "last-column-id" : 9,
  "schema" : {
    "type" : "struct",
    "schema-id" : 0,
    "fields" : [ {
      "id" : 1,
      "name" : "dateint",
      "required" : false,
      "type" : "long"
    }, {
      "id" : 2,
      "name" : "info",
      "required" : false,
      "type" : {
        "type" : "struct",
        "fields" : [ {
          "id" : 3,
          "name" : "name",
          "required" : false,
          "type" : "string"
        }, {
          "id" : 4,
          "name" : "address",
          "required" : false,
          "type" : {
            "type" : "struct",
            "fields" : [ {
              "id" : 6,
              "name" : "NAME",
              "required" : false,
              "type" : "string"
            } ]
          }
        }, {
          "id" : 5,
          "name" : "nestedArray",
          "required" : false,
          "type" : {
            "type" : "list",
            "element-id" : 7,
            "element" : {
              "type" : "struct",
              "fields" : [ {
                "id" : 8,
                "name" : "FIELD1",
                "required" : false,
                "type" : "string"
              }, {
                "id" : 9,
                "name" : "field2",
                "required" : false,
                "type" : "string"
              } ]
            },
            "element-required" : false
          }
        } ]
      }
    } ]
  },
  "current-schema-id" : 0,
  "schemas" : [ {
    "type" : "struct",
    "schema-id" : 0,
    "fields" : [ {
      "id" : 1,
      "name" : "dateint",
      "required" : false,
      "type" : "long"
    }, {
      "id" : 2,
      "name" : "info",
      "required" : false,
      "type" : {
        "type" : "struct",
        "fields" : [ {
          "id" : 3,
          "name" : "name",
          "required" : false,
          "type" : "string"
        }, {
          "id" : 4,
          "name" : "address",
          "required" : false,
          "type" : {
            "type" : "struct",
            "fields" : [ {
              "id" : 6,
              "name" : "NAME",
              "required" : false,
              "type" : "string"
            } ]
          }
        }, {
          "id" : 5,
          "name" : "nestedArray",
          "required" : false,
          "type" : {
            "type" : "list",
            "element-id" : 7,
            "element" : {
              "type" : "struct",
              "fields" : [ {
                "id" : 8,
                "name" : "FIELD1",
                "required" : false,
                "type" : "string"
              }, {
                "id" : 9,
                "name" : "field2",
                "required" : false,
                "type" : "string"
              } ]
            },
            "element-required" : false
          }
        } ]
      }
    } ]
  } ],
  "partition-spec" : [ {
    "name" : "dateint",
    "transform" : "identity",
    "source-id" : 1,
    "field-id" : 1000
  } ],
  "default-spec-id" : 0,
  "partition-specs" : [ {
    "spec-id" : 0,
    "fields" : [ {
      "name" : "dateint",
      "transform" : "identity",
      "source-id" : 1,
      "field-id" : 1000
    } ]
  } ],
  "last-partition-id" : 1000,
  "default-sort-order-id" : 0,
  "sort-orders" : [ {
    "order-id" : 0,
    "fields" : [ ]
  } ],
  "properties" : {
    "field.metadata.json" : "{\"1\":{},\"2\":{},\"3\":{},\"4\":{},\"5\":{},\"6\":{},\"8\":{},\"9\":{}}"
  },
  "current-snapshot-id" : -1,
  "refs" : { },
  "snapshots" : [ ],
  "statistics" : [ ],
  "snapshot-log" : [ ],
  "metadata-log" : [ ]
}
