> install.packages("data.table") # install
> library(data.table) # load
> help("data.table") # read
> update.packages() # keep up to date
Please feel free to add and edit!
set.seed(1) n=10000 DT = data.table(x=rnorm(10*n),y=rnorm(10*n)) DT[,z:=x*y] #vanilla update note the := operator head(DT,2L) # x y z #1: -0.6265 0.7914 -0.49580 #2: 0.1836 0.3922 0.07202 DT[,`:=`(u=z+1, v=z-1)] #update several columns at once head(DT,2L) # x y z u v #1: -0.6265 0.7914 -0.49580 0.5042 -1.496 #2: 0.1836 0.3922 0.07202 1.0720 -0.928 DT[, c("z","u","v"):=NULL] #remove several columns at once head(DT,2L) # x y #1: -0.6265 0.7914 #2: 0.1836 0.3922 DT[, c('u','v') := list(tmp <- x*y, sqrt(abs(tmp)))] #this allows to update by reference and reuse a column just after it is defined. head(DT,2L) # x y u v #1: -0.6265 0.7914 -0.49580 0.7041 #2: 0.1836 0.3922 0.07202 0.2684
NB : Timings updated with v1.8.2 (improves 3rd case dramatically)
n=10000 DT = data.table(grp=1:n,name=as.character(as.hexmode(1:n)),x=rnorm(10*n),y=rnorm(10*n)) setkey(DT,grp) system.time(ans1<-DT[,lapply(.SD[,list(x,y)],sum),by=grp]) # user system elapsed # 31.130 0.088 31.288 # bad system.time(ans2<-DT[,lapply(list(x,y),sum),by=grp]) # user system elapsed # 0.284 0.004 0.291 # good setnames(ans2,names(ans1)) identical(ans1,ans2) # [1] TRUE system.time(ans3<-DT[,lapply(.SD,sum),by=grp,.SDcols=c("x","y")]) # user system elapsed # 0.080 0.004 0.085 # even better (prior to v1.8.2 was slower and not recommended, no longer) identical(ans1,ans3) # [1] TRUE tables() # NAME NROW MB COLS KEY # [1,] ans1 10,000 1 grp,x,y grp # [2,] ans2 10,000 1 grp,x,y grp # [3,] ans3 10,000 1 grp,x,y grp # [4,] DT 100,000 3 grp,name,x,y grp # Total: 6MB
NB: The motivation for lapply through a subset of .SD is when there are non-group column(s) in DT (such as name above). Otherwise you could just do DT[,lapply(.SD,sum),by=grp]. Subsetting .SD (as in ans1) is so much slower because a new copy of .SD for those columns is created for each and every group.
As from v1.8.2, := by group has been implemented.
library(data.table) n=10000 DT = data.table(grp=1:n,x=rnorm(10*n),y=rnorm(10*n)) setkey(DT,grp) system.time(ans1 <- DT[,transform(.SD,x2=x/sum(x),y2=y/sum(y)),by=grp]) # user system elapsed # 5.46 0.00 5.48 # slow head(ans1,3) # grp x y x2 y2 # 1: 1 -0.5848814 -0.41560829 0.6241268 0.5695575 # 2: 1 -0.6314059 -0.49076645 0.6737731 0.6725557 # 3: 1 -1.7694071 0.08860505 1.8881340 -0.1214260 system.time(tt <- DT[,list(x2=x/sum(x),y2=y/sum(y)),by=grp]) # user system elapsed # 0.02 0.00 0.02 (274 times faster!!!) head(tt,3) # grp x2 y2 # 1: 1 0.6241268 0.5695575 # 2: 1 0.6737731 0.6725557 # 3: 1 1.8881340 -0.1214260 system.time(ans2 <- cbind(DT,tt[,list(x2,y2)])) # user system elapsed # 0.05 0.00 0.05 # very fast to add afterwards in bulk head(ans2,3) # grp x y x2 y2 # 1: 1 -0.5848814 -0.41560829 0.6241268 0.5695575 # 2: 1 -0.6314059 -0.49076645 0.6737731 0.6725557 # 3: 1 -1.7694071 0.08860505 1.8881340 -0.1214260 setkey(ans2,grp) identical(ans1,ans2) [1] TRUE system.time(DT[, c('x2', 'y2') := list(x / sum(x), y / sum(y)), by = grp]) # user system elapsed # 0.07 0.00 0.07 # equivalent to cbind afterwards approach, but more memory efficient # now DT has been updated identical(ans1, DT) # [1] TRUE # remove new columns to show different approach DT[, c('x2', 'y2') := NULL] system.time(DT[, `:=`(x2=x / sum(x),y2= y / sum(y)), by = grp]) # user system elapsed #0.04 0.00 0.05 # this is slightly faster identical(ans1, DT) # [1] TRUE
When called repeatedly, it may be surprising to learn that mean() is inefficient compared to sum(). mean() is S3 generic which means it takes time to dispatch to one of methods(mean), usually mean.default(). Also, mean.default() contains a few lines of R before finally calling .Internal(mean(x)). sum() on the other hand is a primitive function.
For this reason ““data.table”” will optimize calls to ““mean()”” to call .External(Cfastmean, x, FALSE).
From the news
mean() is now automatically optimized, #1231. This can speed up grouping
by 20 times when there are a large number of groups. See wiki point 3, which
is no longer needed to know. Turn off optimization by setting
options(datatable.optimize=0).
When comparing data.table to other packages, please be aware of this. For examples see this answer on StackOverflow, and this blog post and this post on r-help.
library(data.table) n<-100000 DT<-data.table(grp1=sample(1:750, n, replace=TRUE), grp2=sample(1:750, n, replace=TRUE), x=rnorm(n), y=rnorm(n)) system.time(ans1<-DT[,list(mean(x),mean(y)),by=list(grp1,grp2), verbose = TRUE]) # Detected that j uses these columns: x,y # Optimized j from 'list(mean(x), mean(y))' to 'list(.External(Cfastmean, x, FALSE), .External(Cfastmean, y, FALSE))' # Starting dogroups ... done dogroups in 0.2 secs # user system elapsed # 0.25 0.00 0.25 # without optimization options(datatable.optimize = 0) system.time(ans2<-DT[,list(mean(x),mean(y)),by=list(grp1,grp2),verbose = TRUE]) # Finding groups (bysameorder=FALSE) ... done in 0.04secs. bysameorder=FALSE and o__ is length 100000 # Detected that j uses these columns: x,y # Starting dogroups ... done dogroups in 1.61 secs # user system elapsed # 1.64 0.00 1.65 # 6.6 times faster identical(ans1,ans2) # [1] TRUE
median() is another S3 generic with overhead that can be sped up a lot using a similar technique. However, sum(), length(), var() and many other functions don’t suffer the same performance issue :
system.time(DT[,list(sum(x),sum(y)),by=list(grp1,grp2)]) # user system elapsed # 0.19 0.00 0.19 # not much slower than optimized mean
> n=10000 > DT = data.table(grp=1:n,x=rnorm(10*n),y=rnorm(10*n)) > setkey(DT,grp) > system.time(ans1<-DT[,as.list(cbind(sum(x),sum(y))),by=grp]) user system elapsed 1.472 0.016 1.494 # bad > system.time(ans2<-DT[,data.table(sum(x),sum(y)),by=grp]) user system elapsed 18.494 0.308 18.914 # worse > system.time(ans3<-DT[,data.frame(sum(x),sum(y)),by=grp]) user system elapsed 40.502 1.136 41.807 # terrible > colnames(ans3)=colnames(ans1) > system.time(ans4<-DT[,list(sum(x),sum(y)),by=grp]) user system elapsed 0.176 0.004 0.181 # best > identical(ans1,ans2) [1] TRUE > identical(ans1,ans3) [1] TRUE > identical(ans1,ans4) [1] TRUE
> n=10000 > DT = data.table(grp=1:n,x=rnorm(10*n),y=rnorm(10*n)) > setkey(DT,grp) > system.time(ans1<-DT[,as.list(colSums(.SD)),by=grp]) user system elapsed 1.296 0.000 1.298 #bad > system.time(ans2<-DT[,lapply(.SD,sum),by=grp]) user system elapsed 0.020 0.000 0.021 #best > identical(ans1,ans2) [1] TRUE > system.time(ans3<-DT[,list(x=sum(x),y=sum(y)),by=grp]) user system elapsed 0.020 0.000 0.022 #similar but longer to write > identical(ans1,ans3) [1] TRUE