Skip to content

implement guniqueN #1120

@jangorecki

Description

@jangorecki

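Using the most recent data.table (1.9.5, see the session info below): not always, but quite often, `uniqueN(x)` evaluated by group is slower than `length(unique(x))`. The benchmarks below compare the two (plus `uniqueN(.SD)`) on data sets of different sizes and cardinalities.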

# Benchmark: three ways of counting the distinct values of `x` within each
# group of `y`:
#   1. length(unique(x))               - base R idiom evaluated per group
#   2. uniqueN(x)                      - data.table helper on the column vector
#   3. uniqueN(.SD) with .SDcols = "x" - data.table helper on the one-column subset
# Each expression is evaluated exactly once (times=1L), so min/lq/mean/median/
# uq/max in the pasted output are all the same single measurement.
# NOTE: no set.seed() is called in this script, so the sampled data (and the
# timings) will differ from run to run.
library(data.table)
library(microbenchmark)

# Scenario 1: 1e6 rows; x sampled from 1:1e5, grouping column y from 1:1e2.
# Recorded result: uniqueN(x) is slightly slower than length(unique(x)).
N <- 1e6
DT <- data.table(x = sample(1e5,N,TRUE), y = sample(1e2,N,TRUE))
microbenchmark(times=1L,
               DT[, length(unique(x)),y],
               DT[, uniqueN(x),y],
               DT[, uniqueN(.SD), by="y", .SDcols="x"])
# Unit: milliseconds
#                                         expr      min       lq     mean   median       uq      max neval
#                   DT[, length(unique(x)), y] 85.58602 85.58602 85.58602 85.58602 85.58602 85.58602     1
#                          DT[, uniqueN(x), y] 92.71877 92.71877 92.71877 92.71877 92.71877 92.71877     1
#  DT[, uniqueN(.SD), by = "y", .SDcols = "x"] 97.51024 97.51024 97.51024 97.51024 97.51024 97.51024     1

# Scenario 2: 1e7 rows; x sampled from 1:1e5, grouping column y from 1:1e2.
# Recorded result: both uniqueN variants are roughly twice as fast.
N <- 1e7
DT <- data.table(x = sample(1e5,N,TRUE), y = sample(1e2,N,TRUE))
microbenchmark(times=1L,
               DT[, length(unique(x)),y],
               DT[, uniqueN(x),y],
               DT[, uniqueN(.SD), by="y", .SDcols="x"])
# Unit: milliseconds
#                                         expr       min        lq      mean    median        uq       max neval
#                   DT[, length(unique(x)), y] 1642.5212 1642.5212 1642.5212 1642.5212 1642.5212 1642.5212     1
#                          DT[, uniqueN(x), y]  843.0670  843.0670  843.0670  843.0670  843.0670  843.0670     1
#  DT[, uniqueN(.SD), by = "y", .SDcols = "x"]  804.7881  804.7881  804.7881  804.7881  804.7881  804.7881     1

# Scenario 3: 1e7 rows; x sampled from 1:1e6, grouping column y from 1:1e5
# (many groups). Recorded result: both uniqueN variants are slower.
N <- 1e7
DT <- data.table(x = sample(1e6,N,TRUE), y = sample(1e5,N,TRUE))
microbenchmark(times=1L,
               DT[, length(unique(x)),y],
               DT[, uniqueN(x),y],
               DT[, uniqueN(.SD), by="y", .SDcols="x"])
# Unit: seconds
#                                         expr      min       lq     mean   median       uq      max neval
#                   DT[, length(unique(x)), y] 3.025365 3.025365 3.025365 3.025365 3.025365 3.025365     1
#                          DT[, uniqueN(x), y] 4.734323 4.734323 4.734323 4.734323 4.734323 4.734323     1
#  DT[, uniqueN(.SD), by = "y", .SDcols = "x"] 5.905721 5.905721 5.905721 5.905721 5.905721 5.905721     1

# Scenario 4: 1e7 rows; x sampled from 1:1e3, grouping column y from 1:1e5
# (many groups, few distinct x). Recorded result: both uniqueN variants are slower.
N <- 1e7
DT <- data.table(x = sample(1e3,N,TRUE), y = sample(1e5,N,TRUE))
microbenchmark(times=1L,
               DT[, length(unique(x)),y],
               DT[, uniqueN(x),y],
               DT[, uniqueN(.SD), by="y", .SDcols="x"])
# Unit: seconds
#                                         expr      min       lq     mean   median       uq      max neval
#                   DT[, length(unique(x)), y] 2.906589 2.906589 2.906589 2.906589 2.906589 2.906589     1
#                          DT[, uniqueN(x), y] 4.731925 4.731925 4.731925 4.731925 4.731925 4.731925     1
#  DT[, uniqueN(.SD), by = "y", .SDcols = "x"] 7.084020 7.084020 7.084020 7.084020 7.084020 7.084020     1

# Scenario 5: 1e7 rows; x sampled from 1:1e6, grouping column y from 1:1e2
# (few groups). Recorded result: both uniqueN variants are faster.
N <- 1e7
DT <- data.table(x = sample(1e6,N,TRUE), y = sample(1e2,N,TRUE))
microbenchmark(times=1L,
               DT[, length(unique(x)),y],
               DT[, uniqueN(x),y],
               DT[, uniqueN(.SD), by="y", .SDcols="x"])
# Unit: milliseconds
#                                         expr      min       lq     mean   median       uq      max neval
#                   DT[, length(unique(x)), y] 1331.244 1331.244 1331.244 1331.244 1331.244 1331.244     1
#                          DT[, uniqueN(x), y]  998.040  998.040  998.040  998.040  998.040  998.040     1
#  DT[, uniqueN(.SD), by = "y", .SDcols = "x"] 1096.867 1096.867 1096.867 1096.867 1096.867 1096.867     1

# Scenario 6: 1e7 rows of character data; x sampled from the 26 `letters`,
# grouping column y from the first 10 letters. Recorded result: both uniqueN
# variants are more than twice as fast.
N <- 1e7
DT <- data.table(x = sample(letters,N,TRUE), y = sample(letters[1:10],N,TRUE))
microbenchmark(times=1L,
               DT[, length(unique(x)),y],
               DT[, uniqueN(x),y],
               DT[, uniqueN(.SD), by="y", .SDcols="x"])
# Unit: milliseconds
#                                         expr       min        lq      mean    median        uq       max neval
#                   DT[, length(unique(x)), y] 1304.4865 1304.4865 1304.4865 1304.4865 1304.4865 1304.4865     1
#                          DT[, uniqueN(x), y]  573.8628  573.8628  573.8628  573.8628  573.8628  573.8628     1
#  DT[, uniqueN(.SD), by = "y", .SDcols = "x"]  528.3269  528.3269  528.3269  528.3269  528.3269  528.3269     1

Related Stack Overflow answer: http://stackoverflow.com/a/29684533/2490497

R version 3.1.3 (2015-03-09)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 14.04.2 LTS

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_DK.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] data.table_1.9.5     microbenchmark_1.4-2

loaded via a namespace (and not attached):
 [1] bitops_1.0-6     chron_2.3-45     colorspace_1.2-4 devtools_1.7.0   digest_0.6.8     evaluate_0.5.5   formatR_1.0      ggplot2_1.0.0    grid_3.1.3      
[10] gtable_0.1.2     httr_0.6.1       knitr_1.8        MASS_7.3-37      munsell_0.4.2    plyr_1.8.1       proto_0.3-10     Rcpp_0.11.4      RCurl_1.95-4.5  
[19] reshape2_1.4.1   scales_0.2.4     stringr_0.6.2    tools_3.1.3    

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions