vignettes/Performance.Rmd
Performance.Rmd
One goal of the {svTidy} package is to provide an interface similar to {dplyr} and {tidyr} on top of fast code, possibly using the {data.table} or {collapse} packages under the hood.
In this document, we compare speed and memory use of {svTidy} with {dplyr} and {tidyr}. Note: this document is a work in progress and is lacking many important parts in its present state.
# Benchmark setup: let data.table use 75% of the available threads, then
# print the resulting thread count (the parentheses force the print).
data.table::setDTthreads(percent = 75)
(.nthreads <- data.table::getDTthreads())
#> [1] 3
# Configure {collapse} in a single call:
# - collapse_nthreads reuses the thread count reported by data.table;
# - collapse_na.rm = FALSE and collapse_mask = "all" are collapse global
#   options (NOTE(review): presumably these must be set before collapse is
#   loaded to take effect -- confirm against the collapse documentation).
options(
  collapse_nthreads = .nthreads,
  collapse_na.rm = FALSE,
  collapse_mask = "all"
)
#SciViews::R
#iris |>
# sgroup_by(Species) |>
# ssummarise(n = n(), mean = fmean(Sepal.Length))
arrange_()
# Sorting benchmark on a small data frame (mtcars).
data(mtcars)
#mtcars_ <- svBase::as_dtrm(mtcars)
#mtcars <- tibble::as_tibble(mtcars_)
# Three ways to sort by `cyl`, then by `vs`:
# - dplyr:     arrange() with desc(vs);
# - svTidySE:  arrange_() with column names given as strings;
# - svTidyNSE: arrange_() with one-sided formulas.
# The leading minus in "-vs" / ~ -vs presumably requests descending order,
# mirroring desc(vs) in the dplyr call.
# check = FALSE: the results are not compared for equality across expressions.
bench::mark(
  dplyr     = arrange(mtcars, cyl, desc(vs)),
  svTidySE  = arrange_(mtcars, "cyl", "-vs"),
  svTidyNSE = arrange_(mtcars, ~cyl, ~ -vs),
  check = FALSE
)
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 2ms 2.07ms 480. 1.81MB 8.38
#> 2 svTidySE 45µs 49.14µs 19745. 312.84KB 8.51
#> 3 svTidyNSE 75.6µs 81.63µs 11908. 87.22KB 10.5
# Same sorting benchmark on a larger table: `babynames` from the {babynames}
# package, sorted by `sex`, then by `n` (descending in the dplyr call).
data(babynames, package = "babynames")
# check = FALSE: the results are not compared for equality across expressions.
bench::mark(
  dplyr     = arrange(babynames, sex, desc(n)),
  svTidySE  = arrange_(babynames, "sex", "-n"),
  svTidyNSE = arrange_(babynames, ~sex, ~ -n),
  check = FALSE
)
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 74.7ms 74.7ms 13.4 343.5MB 66.9
#> 2 svTidySE 25.6ms 26.3ms 36.0 73.4MB 43.2
#> 3 svTidyNSE 25.3ms 26.5ms 36.6 73.4MB 48.7
bind_rows_() and bind_cols_()
# Row-binding benchmark on a tiny two-row data frame stacked onto itself.
df1 <- data.frame(x = 1:2, y = letters[1:2])
# Five ways to stack the two copies:
# - dplyr:      bind_rows() with the data frames as separate arguments;
# - svTidy:     bind_rows_() with the data frames as separate arguments;
# - svTidy2:    bind_rows_() with the data frames wrapped in a list;
# - data.table: rbindlist() on a list of data frames;
# - base:       rbind().
# check = FALSE: the results are not compared for equality across expressions.
bench::mark(
  dplyr      = bind_rows(df1, df1),
  svTidy     = bind_rows_(df1, df1),
  svTidy2    = bind_rows_(list(df1, df1)),
  data.table = rbindlist(list(df1, df1)),
  base       = rbind(df1, df1),
  check = FALSE
)
#> # A tibble: 5 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 65µs 71.2µs 13503. 96.35KB 9.20
#> 2 svTidy 143.5µs 153.8µs 6294. 1.94MB 8.50
#> 3 svTidy2 143.2µs 153.2µs 6360. 32.25KB 8.49
#> 4 data.table 81.8µs 86.4µs 11228. 16.12KB 8.56
#> 5 base 56.8µs 61µs 15691. 0B 10.7
# Row-binding benchmark on a larger table: `babynames` stacked onto itself.
data(babynames, package = "babynames")
# Same five contenders as in the small-data row-binding benchmark.
# check = FALSE: the results are not compared for equality across expressions.
bench::mark(
  dplyr      = bind_rows(babynames, babynames),
  svTidy     = bind_rows_(babynames, babynames),
  svTidy2    = bind_rows_(list(babynames, babynames)),
  data.table = rbindlist(list(babynames, babynames)),
  base       = rbind(babynames, babynames),
  check = FALSE
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 5 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 87.62ms 113.19ms 6.93 362MB 5.19
#> 2 svTidy 1.45s 1.45s 0.690 368MB 1.38
#> 3 svTidy2 1.39s 1.39s 0.721 338MB 0.721
#> 4 data.table 42.92ms 68.11ms 15.2 132MB 7.62
#> 5 base 166.14ms 183.71ms 5.53 264MB 3.69
# Column-binding benchmark on two tiny data frames with distinct column names.
df1 <- data.frame(x = 1:2, y = letters[1:2])
df2 <- data.frame(z = 10:11, w = factor(5:6))
# NOTE(review): the `data.table` entry is the same expression as `base`
# (plain cbind() on two data frames), so both rows time the same call --
# confirm whether a data.table-specific expression was intended.
# check = FALSE: the results are not compared for equality across expressions.
bench::mark(
  dplyr      = bind_cols(df1, df2),
  svTidy     = bind_cols_(df1, df2),
  data.table = cbind(df1, df2),
  base       = cbind(df1, df2),
  check = FALSE
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 97.1µs 105.2µs 9294. 32.9KB 2.13
#> 2 svTidy 101µs 110.1µs 8870. 13.9KB 4.39
#> 3 data.table 25.8µs 27.9µs 35058. 17.7KB 3.51
#> 4 base 25.7µs 27.9µs 35096. 0B 3.51
# Column-binding benchmark on a larger table: `babynames` bound to itself
# column-wise (duplicate column names are expected, hence the "New names:"
# messages emitted by the dplyr call).
data(babynames, package = 'babynames')
# Fix: the `data.table` and `base` entries previously called rbind(), which
# stacks rows and therefore timed a different operation than the
# bind_cols()/bind_cols_() calls they are compared with. They now use
# cbind(), consistent with the small-data column-binding benchmark.
# NOTE: the recorded output below for the `data.table` and `base` rows was
# produced with the old rbind() expressions and must be regenerated.
# NOTE(review): as in the small-data benchmark, the `data.table` entry is
# the same expression as `base` -- confirm whether a data.table-specific
# expression was intended.
# check = FALSE: the results are not compared for equality across expressions.
bench::mark(check = FALSE,
  dplyr      = bind_cols(babynames, babynames),
  svTidy     = bind_cols_(babynames, babynames),
  data.table = cbind(babynames, babynames),
  base       = cbind(babynames, babynames))
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> • `year` -> `year...1`
#> • `sex` -> `sex...2`
#> • `name` -> `name...3`
#> • `n` -> `n...4`
#> • `prop` -> `prop...5`
#> • `year` -> `year...6`
#> • `sex` -> `sex...7`
#> • `name` -> `name...8`
#> • `n` -> `n...9`
#> • `prop` -> `prop...10`
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 67.7ms 68ms 13.9 232MB 1.99
#> 2 svTidy 68.1ms 69ms 9.11 356KB 1.82
#> 3 data.table 171.2ms 207ms 4.96 264MB 4.96
#> 4 base 243.6ms 284ms 3.52 264MB 5.28
filter_()
# Row-filtering benchmark on a small data frame (mtcars): keep mpg > 20.
data(mtcars)
# - dplyr:     filter() with a data-masked condition;
# - svTidySE:  filter_() given a precomputed logical vector (mtcars$mpg > 20);
# - svTidyNSE: filter_() given the condition as a one-sided formula.
# check = FALSE: the results are not compared for equality across expressions.
bench::mark(
  dplyr     = filter(mtcars, mpg > 20),
  svTidySE  = filter_(mtcars, mtcars$mpg > 20),
  svTidyNSE = filter_(mtcars, ~mpg > 20),
  check = FALSE
)
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 529.9µs 574.2µs 1730. 208.1KB 2.13
#> 2 svTidySE 32.4µs 35.5µs 27522. 45.5KB 2.75
#> 3 svTidyNSE 36.9µs 40.4µs 24206. 42.7KB 4.84
# Row-filtering benchmark on a larger table (babynames): keep n > 1000.
data(babynames, package = "babynames")
# Convert once, outside the timed expressions, so the data.table entry
# measures only the subsetting and not the conversion.
babynames_dt <- data.table::as.data.table(babynames)
# check = FALSE: the results are not compared for equality across expressions.
bench::mark(
  dplyr      = filter(babynames, n > 1000),
  svTidySE   = filter_(babynames, babynames$n > 1000),
  svTidyNSE  = filter_(babynames, ~n > 1000),
  data.table = babynames_dt[n > 1000],
  check = FALSE
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 12.5ms 12.59ms 78.7 31.4MB 16.6
#> 2 svTidySE 5.04ms 5.69ms 177. 16.7MB 17.1
#> 3 svTidyNSE 5.08ms 5.61ms 178. 16.7MB 13.3
#> 4 data.table 8.17ms 8.22ms 121. 16.9MB 13.8