|
#' Read files using DuckDB
#'
#' @description
#' `read_file_duckdb()` uses arbitrary readers to read data.
#' See <https://duckdb.org/docs/data/overview> for documentation
#' of the available functions and their options.
#' To read multiple files with the same schema,
#' pass a wildcard or a character vector to the `path` argument.
#'
#' @inheritParams rlang::args_dots_empty
#'
#' @param path Path to files, glob patterns `*` and `?` are supported.
#'   Must be a character vector with at least one element.
#' @param table_function The name of a table-valued
#'   DuckDB function such as `"read_parquet"`,
#'   `"read_csv"`, `"read_csv_auto"` or `"read_json"`.
#' @param prudence Memory protection, controls if DuckDB may convert
#'   intermediate results in DuckDB-managed memory to data frames in R memory.
#'
#'   - `"thrifty"`: up to a maximum size of 1 million cells,
#'   - `"lavish"`: regardless of size,
#'   - `"stingy"`: never.
#'
#'   The default is `"thrifty"` for the ingestion functions,
#'   and may be different for other functions.
#'   See `vignette("prudence")` for more information.
#'
#' @param options Arguments to the DuckDB function
#'   indicated by `table_function`.
#'
#' @inheritSection duckdb_tibble Fine-tuning prudence
#'
#' @return A duckplyr frame, see [as_duckdb_tibble()] for details.
#'
#' @seealso [read_csv_duckdb()], [read_parquet_duckdb()], [read_json_duckdb()]
#'
#' @rdname read_file_duckdb
#' @export
read_file_duckdb <- function(
  path,
  table_function,
  ...,
  prudence = c("thrifty", "lavish", "stingy"),
  options = list()
) {
  check_dots_empty()

  if (!rlang::is_character(path)) {
    cli::cli_abort("{.arg path} must be a character vector.")
  }

  # A zero-length vector names no file at all. Fail here with a clear
  # message instead of forwarding an empty file list to DuckDB.
  if (length(path) == 0) {
    cli::cli_abort("{.arg path} must not be empty.")
  }

  # A single path is forwarded as a plain string; several paths are wrapped
  # in a list so that they travel as one positional argument.
  if (length(path) != 1) {
    path <- list(path)
  }

  # The (possibly wrapped) path goes first, followed by the reader options.
  duckfun(table_function, c(list(path), options), prudence = prudence)
}
| 57 | + |
# Internal helper: create a relation from a DuckDB table function call and
# hand it to `rel_to_df()`.
#
# `args` must be a non-empty list. Its first element is used as the path
# argument, all remaining elements are passed on as options for
# `table_function`. `prudence` is forwarded unchanged to `rel_to_df()`.
duckfun <- function(table_function, args, ..., prudence) {
  if (!is.list(args)) {
    cli::cli_abort("{.arg args} must be a list.")
  } else if (length(args) == 0) {
    cli::cli_abort("{.arg args} must not be empty.")
  }

  # FIXME: Provide better duckdb API
  # Split the argument list into the leading path and the trailing options.
  file_arg <- args[[1]]
  reader_options <- args[-1]

  # FIXME: For some reason, it's important to create an alias here
  conn <- get_default_duckdb_connection()

  relation <- duckdb$rel_from_table_function(
    conn,
    table_function,
    list(file_arg),
    reader_options
  )

  # Register the file source for this relation before converting it.
  meta_rel_register_file(relation, table_function, file_arg, reader_options)

  rel_to_df(relation, prudence = prudence)
}
0 commit comments