// Iceberg partition-transform helpers; only strictly needed for the DataFrame
// writeTo(...).partitionedBy(...) form sketched below, since the SQL DDL declares its own transforms
import org.apache.spark.sql.functions.{years, months, days, hours, bucket}
// Read the raw matches CSV, letting Spark infer the column types
val matches = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("/home/iceberg/data/matches.csv")
spark.sql(f"""DROP TABLE IF EXISTS bootcamp.matches_bucketed PURGE""")
spark.sql(f"""
CREATE OR REPLACE TABLE bootcamp.matches_bucketed (
match_id STRING,
is_team_game BOOLEAN,
playlist_id STRING,
completion_date TIMESTAMP
)
USING iceberg
PARTITIONED BY (years(completion_date), bucket(16, match_id));
""")
// Write only the columns the table declares; overwritePartitions() replaces any partitions
// touched by this batch and leaves all others intact
matches.select($"match_id", $"is_team_game", $"playlist_id", $"completion_date")
  .writeTo("bootcamp.matches_bucketed")
  .overwritePartitions()
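// Alternative sketch (not run here): the same table and partitioning can be declared straight from
// the DataFrame API with the transform functions imported above, skipping the SQL DDL entirely.
// matches.select($"match_id", $"is_team_game", $"playlist_id", $"completion_date")
//   .writeTo("bootcamp.matches_bucketed")
//   .partitionedBy(years($"completion_date"), bucket(16, $"match_id"))
//   .createOrReplace()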
// FOR VALIDATION: the matches span 2 years, so with one data file per partition
// we expect roughly 16 buckets * 2 years = 32 files
spark.sql(f"""
SELECT
COUNT(*)
FROM bootcamp.matches_bucketed .files
""").show()