-
Notifications
You must be signed in to change notification settings - Fork 1k
Closed
Description
When using by with many small groups, runtime can be slow. For some situations like cumsum and frank, we could perform it on the whole vector first and use the last value from the previous group to thus avoiding the use of by. Hence, proposing to add a function that carries the value from previous group to the next group (similar in spirit to rleid and rowid). I suspect it might help with some recursive algo.
R implementation:
#fill group using last value from previous group
#to be placed into a function, similarly like rleid and rowid
#not defensive yet...major improvements required...ideas appreciated
laggrpv <- function(g, v, N) {
ix <- which(diff(g)!=0L) + 1L
f <- replace(rep(NA_integer_, N), ix, v[ix-1L])
k <- nafill(nafill(f, "locf"), fill=0)
}
Timing code for cumsum:
library(data.table) #Win x64 R-3.6.1 data.table_1.12.8
set.seed(0L)
nr <- 1e6L#8
DT <- data.table(ID=rep(1L:(nr/2L), each=4L), VAL=1L:nr)
DT0 <- copy(DT)
DT1 <- copy(DT)
f <- cumsum
mtd0 <- function() {
DT0[, CS := f(VAL), ID]
}
#fill group using last value from previous group
#to be placed into a function, similarly like rleid and rowid
laggrpv <- function(g, v, N) {
ix <- which(diff(g)!=0L) + 1L
f <- replace(rep(NA_integer_, N), ix, v[ix-1L])
k <- nafill(nafill(f, "locf"), fill=0)
}
mtd1 <- function() {
DT1[, CS := {
cs <- f(as.double(VAL))
k <- laggrpv(ID, cs, .N)
cs - k
}]
}
microbenchmark::microbenchmark(times=3L, mtd0(), mtd1())
fsetequal(DT0, DT1)
#[1] TRUE
timings:
Unit: milliseconds
expr min lq mean median uq max neval
mtd0() 176.7456 196.7197 203.8157 216.6937 217.3507 218.0077 3
mtd1() 125.2726 127.1587 131.0547 129.0448 133.9458 138.8468 3
Timing code for frank(..., ties.method="dense")
f <- function(x) frank(x, ties.method="dense")
mtd0 <- function() {
DT0[, RNK := f(VAL), ID]
}
#fill group using last value from previous group
#to be placed into a function, similarly like rleid and rowid
laggrpv <- function(g, v, N) {
ix <- which(diff(g)!=0L) + 1L
f <- replace(rep(NA_integer_, N), ix, v[ix-1L])
k <- nafill(nafill(f, "locf"), fill=0)
}
mtd1 <- function() {
DT1[, RNK := {
rnk <- rleid(ID, VAL)
k <- laggrpv(ID, rnk, .N)
rnk - k
}]
}
microbenchmark::microbenchmark(times=1L, mtd0(), mtd1())
fsetequal(DT0, DT1)
timing:
Unit: milliseconds
expr min lq mean median uq max neval
mtd0() 149920.6706 149920.6706 149920.6706 149920.6706 149920.6706 149920.6706 1
mtd1() 73.5902 73.5902 73.5902 73.5902 73.5902 73.5902 1
Metadata
Metadata
Assignees
Labels
No labels