loading...

One goal of the {svTidy} package is to provide an interface similar to that of {dplyr} and {tidyr} on top of fast code, possibly using the {data.table} or {collapse} packages under the hood.

In this document, we compare speed and memory use of {svTidy} with {dplyr} and {tidyr}. Note: this document is a work in progress and is lacking many important parts in its present state.

# Let {data.table} use 75% of the available threads, then read back the
# resulting thread count so the same value can be handed to {collapse}.
data.table::setDTthreads(percent = 75)
.nthreads <- data.table::getDTthreads()
print(.nthreads)
#> [1] 3
# {collapse} settings, applied in a single call.
options(
  collapse_nthreads = .nthreads,
  collapse_na.rm    = FALSE,
  collapse_mask     = "all"
)
#SciViews::R
#iris |>
#  sgroup_by(Species) |>
#  ssummarise(n = n(), mean = fmean(Sepal.Length))

arrange_()

# Sort a small data frame by ascending `cyl`, then descending `vs`, with
# dplyr and with the standard-evaluation (character) and formula interfaces
# of arrange_(). Result equality is not checked.
data("mtcars")
#mtcars_ <- svBase::as_dtrm(mtcars)
#mtcars <- tibble::as_tibble(mtcars_)
bench::mark(
  dplyr     = arrange(mtcars, cyl, desc(vs)),
  svTidySE  = arrange_(mtcars, "cyl", "-vs"),
  svTidyNSE = arrange_(mtcars, ~cyl, ~ -vs),
  check = FALSE
)
#> # A tibble: 3 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr           2ms   2.07ms      480.    1.81MB     8.38
#> 2 svTidySE       45µs  49.14µs    19745.  312.84KB     8.51
#> 3 svTidyNSE    75.6µs  81.63µs    11908.   87.22KB    10.5
# Same sorting comparison on the larger `babynames` dataset:
# ascending `sex`, then descending `n`.
data("babynames", package = "babynames")
bench::mark(
  dplyr     = arrange(babynames, sex, desc(n)),
  svTidySE  = arrange_(babynames, "sex", "-n"),
  svTidyNSE = arrange_(babynames, ~sex, ~ -n),
  check = FALSE
)
#> # A tibble: 3 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr        74.7ms   74.7ms      13.4   343.5MB     66.9
#> 2 svTidySE     25.6ms   26.3ms      36.0    73.4MB     43.2
#> 3 svTidyNSE    25.3ms   26.5ms      36.6    73.4MB     48.7

bind_rows_() and bind_cols_()

# Row-bind a tiny two-row data frame with itself. bind_rows_() is called
# both with separate arguments and with a single list of data frames.
df1 <- data.frame(
  x = 1:2,
  y = letters[1:2]
)
bench::mark(
  dplyr      = bind_rows(df1, df1),
  svTidy     = bind_rows_(df1, df1),
  svTidy2    = bind_rows_(list(df1, df1)),
  data.table = rbindlist(list(df1, df1)),
  base       = rbind(df1, df1),
  check = FALSE
)
#> # A tibble: 5 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr          65µs   71.2µs    13503.   96.35KB     9.20
#> 2 svTidy      143.5µs  153.8µs     6294.    1.94MB     8.50
#> 3 svTidy2     143.2µs  153.2µs     6360.   32.25KB     8.49
#> 4 data.table   81.8µs   86.4µs    11228.   16.12KB     8.56
#> 5 base         56.8µs     61µs    15691.        0B    10.7
# Row-bind the large `babynames` dataset with itself using each
# implementation. Result equality is not checked.
data("babynames", package = "babynames")
bench::mark(
  dplyr      = bind_rows(babynames, babynames),
  svTidy     = bind_rows_(babynames, babynames),
  svTidy2    = bind_rows_(list(babynames, babynames)),
  data.table = rbindlist(list(babynames, babynames)),
  base       = rbind(babynames, babynames),
  check = FALSE
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 5 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr       87.62ms 113.19ms     6.93      362MB    5.19 
#> 2 svTidy        1.45s    1.45s     0.690     368MB    1.38 
#> 3 svTidy2       1.39s    1.39s     0.721     338MB    0.721
#> 4 data.table  42.92ms  68.11ms    15.2       132MB    7.62 
#> 5 base       166.14ms 183.71ms     5.53      264MB    3.69
# Column-bind two tiny data frames that have distinct column names.
df1 <- data.frame(
  x = 1:2,
  y = letters[1:2]
)
df2 <- data.frame(
  z = 10:11,
  w = factor(5:6)
)
# NOTE(review): the `data.table` and `base` entries are the same cbind()
# call on plain data frames -- confirm whether a data.table-specific
# column bind was intended for the former.
bench::mark(
  dplyr      = bind_cols(df1, df2),
  svTidy     = bind_cols_(df1, df2),
  data.table = cbind(df1, df2),
  base       = cbind(df1, df2),
  check = FALSE
)
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr        97.1µs  105.2µs     9294.    32.9KB     2.13
#> 2 svTidy        101µs  110.1µs     8870.    13.9KB     4.39
#> 3 data.table   25.8µs   27.9µs    35058.    17.7KB     3.51
#> 4 base         25.7µs   27.9µs    35096.        0B     3.51
data(babynames, package = 'babynames')
# Column-binding benchmark on the large `babynames` dataset.
# The data.table and base entries previously called rbind(), which measures
# row-binding instead; they now call cbind(), as in the small-data
# column-binding comparison.
# NOTE: the data.table and base timings recorded below were measured with
# rbind() and must be regenerated.
# NOTE(review): the data.table entry is the same call as base -- confirm
# whether a data.table-specific column bind was intended.
bench::mark(check = FALSE,
  dplyr      = bind_cols(babynames, babynames),
  svTidy     = bind_cols_(babynames, babynames),
  data.table = cbind(babynames, babynames),
  base       = cbind(babynames, babynames))
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#>  `year` -> `year...1`
#>  `sex` -> `sex...2`
#>  `name` -> `name...3`
#>  `n` -> `n...4`
#>  `prop` -> `prop...5`
#>  `year` -> `year...6`
#>  `sex` -> `sex...7`
#>  `name` -> `name...8`
#>  `n` -> `n...9`
#>  `prop` -> `prop...10`
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr        67.7ms     68ms     13.9      232MB     1.99
#> 2 svTidy       68.1ms     69ms      9.11     356KB     1.82
#> 3 data.table  171.2ms    207ms      4.96     264MB     4.96
#> 4 base        243.6ms    284ms      3.52     264MB     5.28

filter_()

# Keep the rows of `mtcars` where `mpg` exceeds 20, using dplyr and
# filter_() with either a precomputed logical vector or a formula.
data("mtcars")
bench::mark(
  dplyr     = filter(mtcars, mpg > 20),
  svTidySE  = filter_(mtcars, mtcars$mpg > 20),
  svTidyNSE = filter_(mtcars, ~mpg > 20),
  check = FALSE
)
#> # A tibble: 3 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr       529.9µs  574.2µs     1730.   208.1KB     2.13
#> 2 svTidySE     32.4µs   35.5µs    27522.    45.5KB     2.75
#> 3 svTidyNSE    36.9µs   40.4µs    24206.    42.7KB     4.84
# Filter the large `babynames` dataset on `n > 1000`. A data.table copy is
# built once, outside the benchmark, for the native data.table comparison.
data("babynames", package = "babynames")
babynames_dt <- data.table::as.data.table(babynames)
bench::mark(
  dplyr      = filter(babynames, n > 1000),
  svTidySE   = filter_(babynames, babynames$n > 1000),
  svTidyNSE  = filter_(babynames, ~n > 1000),
  data.table = babynames_dt[n > 1000],
  check = FALSE
)
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr        12.5ms  12.59ms      78.7    31.4MB     16.6
#> 2 svTidySE     5.04ms   5.69ms     177.     16.7MB     17.1
#> 3 svTidyNSE    5.08ms   5.61ms     178.     16.7MB     13.3
#> 4 data.table   8.17ms   8.22ms     121.     16.9MB     13.8