Add row_sums() (#552)

strengejacke · web-flow · commit 213b9d521eaa · 2024-10-11T11:50:50.000+02:00
* Draft `row_sums()` as complement to `row_means()`

* Bump version to 0.13.0.7
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.13.0.6
+Version: 0.13.0.7
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531")),
diff --git a/NAMESPACE b/NAMESPACE
@@ -298,6 +298,7 @@ export(reverse)
 export(reverse_scale)
 export(row_count)
 export(row_means)
+export(row_sums)
 export(row_to_colnames)
 export(rowid_as_column)
 export(rownames_as_column)
diff --git a/NEWS.md b/NEWS.md
@@ -11,6 +11,9 @@ CHANGES
   variables, can now also be a character vector with quoted variable names,
   including a colon to indicate a range of several variables (e.g. `"cyl:gear"`).
 
+* New function `row_sums()`, to calculate row sums (optionally with minimum
+  amount of valid values), as complement to `row_means()`.
+
 * New function `row_count()`, to count specific values row-wise.
 
 BUG FIXES
diff --git a/R/row_means.R b/R/row_means.R
@@ -1,37 +1,41 @@
-#' @title Row means (optionally with minimum amount of valid values)
+#' @title Row means or sums (optionally with minimum amount of valid values)
 #' @name row_means
-#' @description This function is similar to the SPSS `MEAN.n` function and computes
-#' row means from a data frame or matrix if at least `min_valid` values of a row are
-#' valid (and not `NA`).
+#' @description This function is similar to the SPSS `MEAN.n` or `SUM.n`
+#' function and computes row means or row sums from a data frame or matrix if at
+#' least `min_valid` values of a row are valid (and not `NA`).
 #'
-#' @param data A data frame with at least two columns, where row means are applied.
+#' @param data A data frame with at least two columns, where row means or row
+#' sums are applied.
 #' @param min_valid Optional, a numeric value of length 1. May either be
 #' - a numeric value that indicates the amount of valid values per row to
-#'   calculate the row mean;
+#'   calculate the row mean or row sum;
 #' - or a value between `0` and `1`, indicating a proportion of valid values per
-#'   row to calculate the row mean (see 'Details').
+#'   row to calculate the row mean or row sum (see 'Details').
 #' - `NULL` (default), in which all cases are considered.
 #'
 #' If a row's sum of valid values is less than `min_valid`, `NA` will be returned.
 #' @param digits Numeric value indicating the number of decimal places to be
 #' used for rounding mean values. Negative values are allowed (see 'Details').
 #' By default, `digits = NULL` and no rounding is used.
 #' @param remove_na Logical, if `TRUE` (default), removes missing (`NA`) values
-#' before calculating row means. Only applies if `min_valuid` is not specified.
+#' before calculating row means or row sums. Only applies if `min_valid` is not
+#' specified.
 #' @param verbose Toggle warnings.
 #' @inheritParams extract_column_names
 #'
-#' @return A vector with row means for those rows with at least `n` valid values.
+#' @return A vector with row means (for `row_means()`) or row sums (for
+#' `row_sums()`) for those rows with at least `min_valid` valid values.
 #'
-#' @details Rounding to a negative number of `digits` means rounding to a power of
-#' ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest hundred.
-#' For `min_valid`, if not `NULL`, `min_valid` must be a numeric value from `0`
-#' to `ncol(data)`. If a row in the data frame has at least `min_valid`
-#' non-missing values, the row mean is returned. If `min_valid` is a non-integer
-#' value from 0 to 1, `min_valid` is considered to indicate the proportion of
-#' required non-missing values per row. E.g., if `min_valid = 0.75`, a row must
-#' have at least `ncol(data) * min_valid` non-missing values for the row mean
-#' to be calculated. See 'Examples'.
+#' @details Rounding to a negative number of `digits` means rounding to a power
+#' of ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest
+#' hundred. For `min_valid`, if not `NULL`, `min_valid` must be a numeric value
+#' from `0` to `ncol(data)`. If a row in the data frame has at least `min_valid`
+#' non-missing values, the row mean or row sum is returned. If `min_valid` is a
+#' non-integer value from 0 to 1, `min_valid` is considered to indicate the
+#' proportion of required non-missing values per row. E.g., if
+#' `min_valid = 0.75`, a row must have at least `ncol(data) * min_valid`
+#' non-missing values for the row mean or row sum to be calculated. See
+#' 'Examples'.
 #'
 #' @examples
 #' dat <- data.frame(
@@ -49,6 +53,7 @@
 #'
 #' # needs at least 4 non-missing values per row
 #' row_means(dat, min_valid = 4) # 1 valid return value
+#' row_sums(dat, min_valid = 4) # 1 valid return value
 #'
 #' # needs at least 3 non-missing values per row
 #' row_means(dat, min_valid = 3) # 2 valid return values
@@ -61,6 +66,7 @@
 #'
 #' # needs at least 50% of non-missing values per row
 #' row_means(dat, min_valid = 0.5) # 3 valid return values
+#' row_sums(dat, min_valid = 0.5)
 #'
 #' # needs at least 75% of non-missing values per row
 #' row_means(dat, min_valid = 0.75) # 2 valid return values
@@ -84,34 +90,52 @@ row_means <- function(data,
     verbose = verbose
   )
 
-  if (is.null(select) || length(select) == 0) {
-    insight::format_error("No columns selected.")
-  }
+  # prepare data, sanity checks
+  data <- .prepare_row_data(data, select, min_valid, verbose)
 
-  data <- .coerce_to_dataframe(data[select])
+  # calculate row means
+  .row_sums_or_means(data, min_valid, digits, remove_na, fun = "mean")
+}
 
-  # n must be a numeric, non-missing value
-  if (!is.null(min_valid) && (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1)) {
-    insight::format_error("`min_valid` must be a numeric value of length 1.")
-  }
 
-  # make sure we only have numeric values
-  numeric_columns <- vapply(data, is.numeric, TRUE)
-  if (!all(numeric_columns)) {
-    if (verbose) {
-      insight::format_alert("Only numeric columns are considered for calculation.")
-    }
-    data <- data[numeric_columns]
-  }
+#' @rdname row_means
+#' @export
+row_sums <- function(data,
+                     select = NULL,
+                     exclude = NULL,
+                     min_valid = NULL,
+                     digits = NULL,
+                     ignore_case = FALSE,
+                     regex = FALSE,
+                     remove_na = FALSE,
+                     verbose = TRUE) {
+  # evaluate arguments
+  select <- .select_nse(select,
+    data,
+    exclude,
+    ignore_case = ignore_case,
+    regex = regex,
+    verbose = verbose
+  )
+
+  # prepare data, sanity checks
+  data <- .prepare_row_data(data, select, min_valid, verbose)
+
+  # calculate row sums
+  .row_sums_or_means(data, min_valid, digits, remove_na, fun = "sum")
+}
 
-  # check if we have a data framme with at least two columns
-  if (ncol(data) < 2) {
-    insight::format_error("`data` must be a data frame with at least two numeric columns.")
-  }
 
-  # proceed here if min_valid is not NULL
+# helper ------------------------
+
+# calculate row means or sums
+.row_sums_or_means <- function(data, min_valid, digits, remove_na, fun) {
   if (is.null(min_valid)) {
-    out <- rowMeans(data, na.rm = remove_na)
+    # calculate row means or sums for complete data
+    out <- switch(fun,
+      mean = rowMeans(data, na.rm = remove_na),
+      rowSums(data, na.rm = remove_na)
+    )
   } else {
     # is 'min_valid' indicating a proportion?
     decimals <- min_valid %% 1
@@ -124,9 +148,12 @@ row_means <- function(data,
       insight::format_error("`min_valid` must be smaller or equal to number of columns in data frame.")
     }
 
-    # row means
+    # row means or sums
     to_na <- rowSums(is.na(data)) > ncol(data) - min_valid
-    out <- rowMeans(data, na.rm = TRUE)
+    out <- switch(fun,
+      mean = rowMeans(data, na.rm = TRUE),
+      rowSums(data, na.rm = TRUE)
+    )
     out[to_na] <- NA
   }
 
@@ -137,3 +164,34 @@ row_means <- function(data,
 
   out
 }
+
+
+# check that data is in shape for row means or row sums
+.prepare_row_data <- function(data, select, min_valid, verbose) {
+  if (is.null(select) || length(select) == 0) {
+    insight::format_error("No columns selected.")
+  }
+
+  data <- .coerce_to_dataframe(data[select])
+
+  # `min_valid` must be a numeric, non-missing value of length 1
+  if (!is.null(min_valid) && (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1)) {
+    insight::format_error("`min_valid` must be a numeric value of length 1.")
+  }
+
+  # make sure we only have numeric values
+  numeric_columns <- vapply(data, is.numeric, TRUE)
+  if (!all(numeric_columns)) {
+    if (verbose) {
+      insight::format_alert("Only numeric columns are considered for calculation.")
+    }
+    data <- data[numeric_columns]
+  }
+
+  # check if we have a data frame with at least two columns
+  if (ncol(data) < 2) {
+    insight::format_error("`data` must be a data frame with at least two numeric columns.")
+  }
+
+  data
+}
diff --git a/man/row_means.Rd b/man/row_means.Rd
diff --git a/tests/testthat/test-row_means.R b/tests/testthat/test-row_means.R
@@ -1,4 +1,4 @@
-test_that("row_means", {
+test_that("row_means/sums", {
   d_mn <- data.frame(
     c1 = c(1, 2, NA, 4),
     c2 = c(NA, 2, NA, 5),
@@ -14,14 +14,21 @@ test_that("row_means", {
   expect_equal(row_means(d_mn, min_valid = 2, digits = 1), c(1.5, 2.8, NA, 5.7), tolerance = 1e-1)
   expect_message(row_means(iris), regex = "Only numeric")
   expect_equal(row_means(iris, verbose = FALSE), rowMeans(iris[, 1:4]), tolerance = 1e-3, ignore_attr = TRUE)
+  expect_equal(row_sums(d_mn, min_valid = 4), c(NA, 11, NA, NA), tolerance = 1e-3)
+  expect_equal(row_sums(d_mn, min_valid = 3), c(NA, 11, NA, 17), tolerance = 1e-3)
+  expect_message(row_sums(iris), regex = "Only numeric")
 })
 
-test_that("row_means, errors or messages", {
+test_that("row_means/sums, errors or messages", {
   data(iris)
   expect_error(expect_warning(row_means(iris, select = "abc")), regex = "No columns")
+  expect_error(expect_warning(row_sums(iris, select = "abc")), regex = "No columns")
   expect_error(row_means(iris[1], min_valid = 1), regex = "two numeric")
   expect_error(row_means(iris, min_valid = 1:4), regex = "numeric value")
   expect_error(row_means(iris, min_valid = "a"), regex = "numeric value")
   expect_message(row_means(iris[1:3, ], min_valid = 3), regex = "Only numeric")
   expect_silent(row_means(iris[1:3, ], min_valid = 3, verbose = FALSE))
+  expect_error(row_sums(iris[1], min_valid = 1), regex = "two numeric")
+  expect_message(row_sums(iris[1:3, ], min_valid = 3), regex = "Only numeric")
+  expect_silent(row_sums(iris[1:3, ], min_valid = 3, verbose = FALSE))
 })