SciViews functions (mainly from collapse and data.table) to manipulate data frames

A SciViews::R version of the tidyverse functions in {dplyr} and {tidyr} with standard evaluation, and non-standard evaluation through formulas. These functions end with an underscore _. Avoid mixing tidy, speedy and SciViews functions in the same pipeline.

list_sciviews_functions()

all_of(x)

as.grouped_df(x, ...)

as_grouped_df(x, ...)

# Default S3 method
as.grouped_df(x, ...)

# S3 method for class 'grouped_df'
as.grouped_df(x, ...)

# S3 method for class 'GRP_df'
as.grouped_df(x, ...)

# S3 method for class 'grouped_df'
print(x, ...)

group_vars_(.data = (.), return = "names")

group_rows_(.data = (.))

group_data_(.data = (.))

group_indices_(.data = (.), ...)

group_keys_(.data = (.), ...)

groups_(.data = (.))

group_size_(.data = (.))

n_groups_(.data = (.))

group_by_(
  .data = (.),
  ...,
  .add = FALSE,
  .drop = NULL,
  .sort = get_collapse("sort"),
  .decreasing = FALSE,
  .na.last = TRUE,
  .return.groups = TRUE,
  .return.order = .sort,
  .method = "auto"
)

ungroup_(.data = (.), ..., .na.last = TRUE, .method = "auto")

rename_(.data = (.), ...)

rename_with_(.data = (.), .fn, .cols = ~everything(), ...)

filter_(.data = (.), ..., .by = NULL, .preserve = FALSE)

select_(.data = (.), ...)

mutate_(
  .data = (.),
  ...,
  .by = NULL,
  .keep = "all",
  .before = NULL,
  .after = NULL,
  .cols = NULL
)

transmute_(.data, ...)

summarise_(
  .data = (.),
  ...,
  .by = NULL,
  .groups = "drop_last",
  .keep.group_vars = TRUE,
  .cols = NULL
)

summarize_(
  .data = (.),
  ...,
  .by = NULL,
  .groups = "drop_last",
  .keep.group_vars = TRUE,
  .cols = NULL
)

reframe_(
  .data,
  ...,
  .by = NULL,
  .groups = "drop",
  .keep.group_vars = TRUE,
  .cols = NULL
)

arrange_(
  .data = (.),
  ...,
  .by_group = FALSE,
  .locale = "C",
  .decreasing = FALSE
)

pull_(.data = (.), var = -1, name = NULL, ...)

join_(
  x,
  y,
  by = NULL,
  copy = FALSE,
  suffix = c(".x", ".y"),
  ...,
  keep = NULL,
  na_matches = c("na", "never"),
  multiple = "all",
  unmatched = "drop",
  relationship = NULL,
  sort = FALSE,
  verbose = 0,
  column = NULL,
  attr = NULL,
  how = "full"
)

right_join_(
  x = (.),
  y,
  by = NULL,
  copy = FALSE,
  suffix = c(".x", ".y"),
  ...,
  keep = NULL,
  na_matches = c("na", "never"),
  multiple = "all",
  unmatched = "drop",
  relationship = NULL,
  sort = FALSE,
  verbose = 0,
  column = NULL,
  attr = NULL
)

full_join_(
  x = (.),
  y,
  by = NULL,
  copy = FALSE,
  suffix = c(".x", ".y"),
  ...,
  keep = NULL,
  na_matches = c("na", "never"),
  multiple = "all",
  relationship = NULL,
  sort = FALSE,
  verbose = 0,
  column = NULL,
  attr = NULL
)

left_join_(
  x = (.),
  y,
  by = NULL,
  copy = FALSE,
  suffix = c(".x", ".y"),
  ...,
  keep = NULL,
  na_matches = c("na", "never"),
  multiple = "all",
  unmatched = "drop",
  relationship = NULL,
  sort = FALSE,
  verbose = 0,
  column = NULL,
  attr = NULL
)

inner_join_(
  x = (.),
  y,
  by = NULL,
  copy = FALSE,
  suffix = c(".x", ".y"),
  ...,
  keep = NULL,
  na_matches = c("na", "never"),
  multiple = "all",
  unmatched = "drop",
  relationship = NULL,
  sort = FALSE,
  verbose = 0,
  column = NULL,
  attr = NULL
)

semi_join_(
  x = (.),
  y,
  by = NULL,
  copy = FALSE,
  ...,
  na_matches = c("na", "never"),
  sort = FALSE,
  verbose = 0,
  column = NULL,
  attr = NULL
)

anti_join_(
  x = (.),
  y,
  by = NULL,
  copy = FALSE,
  ...,
  na_matches = c("na", "never"),
  sort = FALSE,
  verbose = 0,
  column = NULL,
  attr = NULL
)

bind_rows_(..., .id = NULL, .use_names = TRUE, .fill = TRUE)

bind_cols_(
  ...,
  .name_repair = c("unique", "universal", "check_unique", "minimal")
)

slice_(.data = (.), ..., .by = NULL, .preserve = NULL)

slice_head_(.data = (.), ..., n = 1L, prop, by = NULL, sort = TRUE)

slice_tail_(.data = (.), ..., n = 1L, prop, by = NULL, sort = TRUE)

count_(
  .data = (.),
  ...,
  wt = NULL,
  name = "n",
  sort = FALSE,
  decreasing = TRUE,
  .drop = TRUE,
  add = FALSE
)

tally_(.data = (.), wt = NULL, name = "n", sort = FALSE, decreasing = TRUE)

add_count_(
  .data = (.),
  ...,
  wt = NULL,
  name = "n",
  sort = FALSE,
  decreasing = TRUE,
  .drop = TRUE
)

add_tally_(.data = (.), wt = NULL, name = "n", sort = FALSE, decreasing = TRUE)

distinct_(.data = (.), ..., .keep_all = FALSE, .method = "auto")

drop_na_(.data = (.), ..., .na.attr = FALSE, .prop = 0)

replace_na_(.data = (.), replace, ..., v = NULL)

pivot_longer_(
  .data = (.),
  cols,
  ...,
  cols_vary = "fastest",
  names_to = "name",
  names_prefix = NULL,
  values_to = "value",
  values_drop_na = FALSE,
  factor = FALSE
)

pivot_wider_(
  .data = (.),
  ...,
  id_cols = NULL,
  id_expand = FALSE,
  names_from = name,
  names_prefix = "",
  names_vary = "fastest",
  values_from = value,
  values_fill = NULL,
  values_fn = "last",
  drop = TRUE,
  sort = FALSE
)

uncount_(.data = (.), weights, ..., .remove = TRUE, .id = NULL)

unite_(.data = (.), col, ..., sep = "_", remove = TRUE, na.rm = FALSE)

fill_(.data = (.), ..., .direction = "down")

separate_(
  .data = (.),
  col,
  into,
  sep = "[^[:alnum:]]+",
  remove = TRUE,
  convert = FALSE,
  extra = "warn",
  fill = "warn",
  fixed = FALSE,
  ...
)

Arguments

x: A data frame (data.frame, data.table or tibble's tbl_df).
...: Arguments dependent on the context of the function and, most of the time, not evaluated in a standard way (cf. the tidyverse approach).
.data: A data frame (data.frame, data.table or tibble's tbl_df)
return: What to return: "data" or 1, "unique" or 2 for unique rows of grouping columns, "names" or 3 (default) for names of grouping columns, "indices" or 4 for integer indices of grouping columns, "named_indices" or 5 for named indices, "logical" or 6 for logical selection vector of grouping columns, or "named_logical" or 7 for named logical.
.add: If TRUE, the grouping variables are added to the existing ones.
.drop: Are levels with no observations dropped (TRUE by default).
.sort: If TRUE groups are sorted.
.decreasing: Is sorting done in decreasing order (FALSE by default)?
.na.last: How to treat missing values in groups? Assign them to the last group by default (TRUE).
.return.groups: If TRUE, the grouping variables are returned in the GRP object (default).
.return.order: If TRUE, the order of the grouping variables is returned in the object (by default, same value as .sort=).
.method: The algorithm to use for grouping: "radix", "hash", or "auto" (by default). "auto" chooses "radix" when .sort = TRUE and "hash" otherwise.
.fn: A function to use.
.cols: The list of the columns on which to apply the transformation. For the moment, only all existing columns, which means .cols = everything(), is implemented.
.by: A list of names of the columns to use for grouping the data.
.preserve: When data is grouped, do we preserve grouping or recalculate it according to the new data frame obtained?
.keep: Which columns to keep. The default is "all", possible values are "used", "unused", or "none" (see mutate()).
.before: Place new columns before this one.
.after: Place new columns after this one.
.groups: How to treat the grouping variables in the result? Possible values are "drop_last" (default), "drop" (no grouping variables), "keep" (keep all grouping variables), or "rowwise" (not implemented yet).
.keep.group_vars: If TRUE (by default), the grouping variables are kept in the result.
.by_group: Logical. If TRUE, rows are first arranged by the grouping variables, if any. FALSE by default.
.locale: The locale to sort character vectors in. The default is "C"; if NULL, the "C" locale is used too.
var: A variable specified as a name, a positive or a negative integer (counting from the end). The default is -1 and returns last variable.
name: The name of the new column in the output (n by default, and no existing column must have this name, or an error is generated).
y: A second data frame.
by: A list of names of the columns to use for joining the two data frames. Could also be a join specification created with dplyr::join_by(), but in this case, calculation is delegated to dplyr's join methods.
copy: This argument is there for compatibility with the "t" matching functions, but it is not used here.
suffix: The suffix to the column names to use to differentiate the columns that come from the first or the second data frame. By default it is c(".x", ".y").
keep: Should the join keys from both x and y be preserved in the output? If NULL, the default, joins on equality retain only the keys from x, while joins on inequality retain the keys from both inputs. If TRUE, all keys from both inputs are retained. If FALSE, only keys from x are retained. For right and full joins, the data in key columns corresponding to rows that only exist in y are merged into the key columns from x. Can't be used when joining on inequality conditions. If keep = TRUE, calculation is delegated to dplyr join methods.
na_matches: Should two NA or two NaN values match? "na", the default, treats two NA or two NaN values as equal, like %in%, match(), and merge(). "never" treats two NA or two NaN values as different, and will never match them together or to any other values. This is similar to joins for database sources and to base::merge(incomparables = NA). If "never", calculation is delegated to dplyr join methods.
multiple: Handling of rows in x with multiple matches in y. For each row of x: "all", the default, returns every match detected in y. This is the same behavior as SQL. "any" returns one match detected in y, with no guarantees on which match will be returned. It is often faster than "first" and "last" in dplyr, but avoid it here. "first" returns the first match detected in y. "last" returns the last match detected in y. For "any" and "last", calculation is delegated to dplyr join methods, and in the case of right join, also for "first".
unmatched: How should unmatched keys that would result in dropped rows be handled? "drop" drops unmatched keys from the result. "error" throws an error if unmatched keys are detected. Also, a named list of the form list(x = 1, y = 0.5, fail = "warning") can be used when calculation is not delegated to dplyr. The first two elements are the proportions that must match, and the third element is "message", "warning", or "error".
relationship: Handling of the expected relationship between the keys of x and y. If the expectations chosen from the list below are invalidated, an error is thrown. NULL, the default, doesn't expect there to be any relationship between x and y. However, for equality joins it will check for a many-to-many relationship (which is typically unexpected) and will warn if one occurs, encouraging you to either take a closer look at your inputs or make this relationship explicit by specifying "many-to-many". "one-to-one" expects: Each row in x matches at most 1 row in y. Each row in y matches at most 1 row in x. "one-to-many" expects: Each row in y matches at most 1 row in x. "many-to-one" expects: Each row in x matches at most 1 row in y. "many-to-many" doesn't perform any relationship checks, but is provided to allow you to be explicit about this relationship if you know it exists. relationship doesn't handle cases where there are zero matches. For that, see unmatched.
sort: If TRUE largest group will be shown on top.
verbose: integer. Prints information about the join. One of 0 (off, the default), 1 or 2 (additionally prints the classes of the by columns).
column: name for an extra column to generate in the output indicating which dataset a record came from. TRUE calls this column ".join", or give another name.
attr: name for attribute providing information about the join performed (including the output of collapse::fmatch()) to the result. TRUE calls this attribute "join.match" or give your own name. Note: this also invokes the count argument to collapse::fmatch().
how: Can be "full" (default), "inner", "left", "right", "semi", or "anti".
.id: The name of the column for the origin id, either names if all other arguments are named, or numbers.
.use_names: If TRUE (default), bind by matching names, if FALSE, bind by position. If NULL, warns if all items do not have the same name in the same order, and then proceeds as if FALSE (but will be as if TRUE in the future).
.fill: If TRUE (default), fills missing columns with NA or NULL for missing list columns, if FALSE, do not fill.
.name_repair: How should the name be "repaired" to avoid duplicate column names? See dplyr::bind_cols() for more details.
n: Number of rows to keep
prop: Proportion of rows to keep, between 0 and 1. Provide either n, or prop but not both simultaneously. If none is provided, n = 1 is used.
wt: Frequency weights. Can be NULL or a variable. Use data masking.
decreasing: Is sorting done in decreasing order (TRUE by default)?
add: Add counts to the data frame (FALSE by default).
.keep_all: If TRUE keep all variables in .data.
.na.attr: logical. TRUE adds an attribute containing the removed cases. For compatibility reasons this is exactly the same format as na.omit(), i.e. the attribute is called "na.action" and is of class "omit".
.prop: numeric. The proportion of missing values in each case for the case to be considered as missing.
replace: If data is a vector, a unique value to replace NAs, otherwise, a list of values, one per column of the data frame.
v: a vector where to replace NAs.
cols: A selection of the columns using tidy-select syntax, see tidyr::pivot_longer().
cols_vary: character. Either "fastest" or "slowest". If "fastest" (default), keep individual rows from cols close together. If "slowest", keeps individual columns from cols close together.
names_to: A character vector with the name or names of the columns for the names.
names_prefix: character. A regular expression used to remove matching text from the start of each variable name.
values_to: A string with the name of the column that receives the values.
values_drop_na: logical. If TRUE, drop rows with only NAs in the values_to column.
factor: logical. If TRUE, convert the names and labels into factors, if FALSE (default) leave them as character strings (but slower for subsequent filtering).
id_cols: A set of columns that uniquely identify each observation.
id_expand: logical. If TRUE, expand the id_cols.
names_from: The column or columns containing the names (use tidy selection and do not quote the names).
names_vary: character. How the various column names are made: "fastest" (default), "slowest", "transpose", or "slowtranspose".
values_from: Idem for the column or columns that contain the values.
values_fill: Optionally, a scalar value to use for missing values.
values_fn: Either the name of an internal function (as a string): "first", "last" (default), "count", "sum", "mean", "min", or "max". Could also be a formula calling an external function with first argument being .x like ~fmedian(.x, na.rm = TRUE).
drop: Drop unused factor levels or not.
weights: A vector of weights to use to "uncount" data.
.remove: If TRUE, and weights is the name of a column, that column is removed from data.
col: The name quoted or not of the new column with united variable.
sep: Separator to use between values for united or separated columns.
remove: If TRUE the initial columns that are separated are also removed from data.
na.rm: If TRUE, NAs are eliminated before uniting the values.
.direction: Direction in which to fill missing data: "down" (by default), "up", or "downup" (first down, then up), "updown" (the opposite).
into: Name of the new column to put separated variables. Use NA for items to drop.
convert: If TRUE, resulting values are converted into numeric, integer or logical.
extra: When sep is a character vector what happens when there are too many pieces: "warn" (default) issue a warning and drop extra items, "drop" does the same without warning and "merge" merges the extra items with the last one.
fill: When sep is a character vector what happens when there are not enough pieces: "warn" (default) issues a warning and fills with NAs at right, "right" does the same without warning, and "left" fills with NAs at left.
fixed: logical. If TRUE, sep is a fixed string, otherwise it is a (perl) regular expression.
data: A data frame, or for replace_na() a vector or a data frame.

Value

See corresponding "non-SciViews" function for the full help page with indication of the return values.

Note

The summarise_() function does not support n() as does dplyr::summarise(). You can use svBase::fn() instead, but then, you must give a variable name as argument. The svBase::fn() alternative can also be used in dplyr::summarise() for homogeneous syntax between the two. From {dplyr}, the slice_min(), slice_max() and slice_sample() functions are not added yet. From {tidyr} tidyr::expand(), tidyr::chop(), tidyr::unchop(), tidyr::nest(), tidyr::unnest(), tidyr::unnest_longer(), tidyr::unnest_wider(), tidyr::hoist(), tidyr::pack() and tidyr::unpack() are not implemented yet.

Examples

# TODO...