Wiki for the data.table package in R

> install.packages("data.table")   # install
> library(data.table)              # load
> help("data.table")               # read
> update.packages()                # keep up to date

Please feel free to add and edit!

Tips and Tricks

1. How to update/remove columns of a data.table

set.seed(1)
n=10000
DT = data.table(x=rnorm(10*n),y=rnorm(10*n))
 
DT[,z:=x*y] # vanilla update; note the := operator
head(DT,2L)
#         x      y        z
#1: -0.6265 0.7914 -0.49580
#2:  0.1836 0.3922  0.07202
 
DT[,`:=`(u=z+1, v=z-1)] #update several columns at once
head(DT,2L)
#         x      y        z      u      v
#1: -0.6265 0.7914 -0.49580 0.5042 -1.496
#2:  0.1836 0.3922  0.07202 1.0720 -0.928
 
DT[, c("z","u","v"):=NULL] #remove several columns at once
head(DT,2L)
#         x      y
#1: -0.6265 0.7914
#2:  0.1836 0.3922
 
DT[, c('u','v') := list(tmp <- x*y, sqrt(abs(tmp)))] # update by reference, reusing a value just after it is defined
head(DT,2L)
#         x      y        u      v
#1: -0.6265 0.7914 -0.49580 0.7041
#2:  0.1836 0.3922  0.07202 0.2684
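
Note that := updates the table by reference, so assigning a data.table to a new name does not make a copy; use copy() when an independent table is needed. A minimal sketch (DT2 and DT3 are names introduced here for illustration):

DT2 = DT          # not a copy: DT2 and DT point to the same table
DT2[, w := 1]     # adds column w to DT as well
DT3 = copy(DT)    # a true (deep) copy
DT3[, w := NULL]  # removes w from DT3 only; DT still has w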

Do's and Don'ts

1. Don't subset .SD, use the columns directly or use .SDcols

NB: Timings updated with v1.8.2 (which improves the 3rd case dramatically)

n=10000
DT = data.table(grp=1:n,name=as.character(as.hexmode(1:n)),x=rnorm(10*n),y=rnorm(10*n))
setkey(DT,grp)
system.time(ans1<-DT[,lapply(.SD[,list(x,y)],sum),by=grp])   
#   user  system elapsed 
# 31.130   0.088  31.288    # bad
system.time(ans2<-DT[,lapply(list(x,y),sum),by=grp])         
#   user  system elapsed 
#  0.284   0.004   0.291    # good
setnames(ans2,names(ans1))
identical(ans1,ans2)
# [1] TRUE
system.time(ans3<-DT[,lapply(.SD,sum),by=grp,.SDcols=c("x","y")])
#   user  system elapsed
#  0.080   0.004   0.085    # even better (prior to v1.8.2 was slower and not recommended, no longer)
identical(ans1,ans3)
# [1] TRUE
tables()
#      NAME    NROW MB COLS         KEY
# [1,] ans1  10,000 1  grp,x,y      grp
# [2,] ans2  10,000 1  grp,x,y      grp
# [3,] ans3  10,000 1  grp,x,y      grp
# [4,] DT   100,000 3  grp,name,x,y grp
# Total: 6MB

NB: The motivation for lapply over a subset of .SD is when there are non-group column(s) in DT (such as name above); otherwise you could just do DT[,lapply(.SD,sum),by=grp]. Subsetting .SD (as in ans1) is much slower because a new copy of .SD restricted to those columns is created for each and every group.
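
For example, once the non-group column is dropped, plain lapply over .SD gives the same answer with no .SDcols needed. A small sketch (DT2 and ans4 are names introduced here):

DT2 = copy(DT)[, name := NULL]          # drop the non-group column on a copy
ans4 = DT2[, lapply(.SD, sum), by=grp]  # .SD now holds just x and y
identical(ans3, ans4)                   # should be TRUE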

2. For speed use := by group, don't transform() by group or cbind() afterwards

As of v1.8.2, := by group is implemented.

library(data.table)
n=10000
DT = data.table(grp=1:n,x=rnorm(10*n),y=rnorm(10*n))
setkey(DT,grp)
system.time(ans1 <- DT[,transform(.SD,x2=x/sum(x),y2=y/sum(y)),by=grp])
# user  system elapsed 
# 5.46    0.00    5.48  # slow
head(ans1,3)
#    grp          x           y        x2         y2
# 1:   1 -0.5848814 -0.41560829 0.6241268  0.5695575
# 2:   1 -0.6314059 -0.49076645 0.6737731  0.6725557
# 3:   1 -1.7694071  0.08860505 1.8881340 -0.1214260
system.time(tt <- DT[,list(x2=x/sum(x),y2=y/sum(y)),by=grp])
# user  system elapsed 
# 0.02    0.00    0.02 (274 times faster!!!)
 
head(tt,3)
#    grp        x2         y2
# 1:   1 0.6241268  0.5695575
# 2:   1 0.6737731  0.6725557
# 3:   1 1.8881340 -0.1214260
system.time(ans2 <- cbind(DT,tt[,list(x2,y2)]))
# user  system elapsed 
# 0.05    0.00    0.05    # very fast to add afterwards in bulk
head(ans2,3)
#     grp          x           y        x2         y2
# 1:   1 -0.5848814 -0.41560829 0.6241268  0.5695575
# 2:   1 -0.6314059 -0.49076645 0.6737731  0.6725557
# 3:   1 -1.7694071  0.08860505 1.8881340 -0.1214260
setkey(ans2,grp)
identical(ans1,ans2)
# [1] TRUE
system.time(DT[, c('x2', 'y2') := list(x / sum(x), y / sum(y)), by = grp])
# user  system elapsed 
# 0.07    0.00    0.07 # comparable to the cbind-afterwards approach, but more memory efficient
 
# now DT has been updated 
identical(ans1, DT)
# [1] TRUE
# remove new columns to show different approach
DT[, c('x2', 'y2') := NULL]
 
system.time(DT[, `:=`(x2=x / sum(x),y2= y / sum(y)), by = grp])
# user  system elapsed 
# 0.04    0.00    0.05 # slightly faster still
identical(ans1, DT)
# [1] TRUE
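
That := really updates in place (rather than copying the whole table) can be checked with base R's tracemem(), which reports whenever an object is duplicated. A small sketch, assuming an R build with memory profiling enabled (the default for CRAN binaries):

tracemem(DT)        # ask R to report any copy of DT
DT[, z := x + y]    # no copy reported: column added by reference
untracemem(DT)
DT[, z := NULL]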

3. Method dispatch takes time

When called repeatedly, it may be surprising to learn that mean() is inefficient compared to sum(). mean() is an S3 generic, which means it takes time to dispatch to one of methods(mean), usually mean.default(). Also, mean.default() contains a few lines of R before finally calling .Internal(mean(x)). sum(), on the other hand, is a primitive function.
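
The per-call overhead is easy to see outside data.table; a quick sketch (timings are illustrative and machine dependent):

x = rnorm(100)
system.time(for (i in 1:1e5) mean(x))             # S3 dispatch plus R-level checks on every call
system.time(for (i in 1:1e5) .Internal(mean(x)))  # what mean.default() eventually calls
system.time(for (i in 1:1e5) sum(x))              # primitive: no dispatch overhead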

For this reason data.table will optimize calls to mean() to call .External(Cfastmean, x, FALSE).

From the NEWS file:

   mean() is now automatically optimized, #1231. This can speed up grouping
      by 20 times when there are a large number of groups. See wiki point 3, which
      is no longer needed to know. Turn off optimization by setting
      options(datatable.optimize=0).

When comparing data.table to other packages, please be aware of this. For examples, see this answer on StackOverflow, this blog post and this post on r-help.

library(data.table)
n<-100000
DT<-data.table(grp1=sample(1:750, n, replace=TRUE),
                 grp2=sample(1:750, n, replace=TRUE),
                 x=rnorm(n),
                 y=rnorm(n))
system.time(ans1<-DT[,list(mean(x),mean(y)),by=list(grp1,grp2), verbose = TRUE])
# Detected that j uses these columns: x,y 
# Optimized j from 'list(mean(x), mean(y))' to 'list(.External(Cfastmean, x, FALSE), .External(Cfastmean, y, FALSE))'
# Starting dogroups ... done dogroups in 0.2 secs
# user  system elapsed 
# 0.25    0.00    0.25 
 
# without optimization
options(datatable.optimize = 0)
system.time(ans2<-DT[,list(mean(x),mean(y)),by=list(grp1,grp2),verbose = TRUE])
# Finding groups (bysameorder=FALSE) ... done in 0.04secs. bysameorder=FALSE and o__ is length 100000
# Detected that j uses these columns: x,y 
# Starting dogroups ... done dogroups in 1.61 secs
# user  system elapsed 
# 1.64    0.00    1.65    # 6.6 times slower than the optimized version
identical(ans1,ans2)
# [1] TRUE

median() is another S3 generic with overhead that can be sped up a lot using a similar technique (a sketch follows the next timing). However, sum(), length(), var() and many other functions don't suffer the same performance issue:

system.time(DT[,list(sum(x),sum(y)),by=list(grp1,grp2)])
#   user  system elapsed 
#   0.19    0.00    0.19    # not much slower than optimized mean
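
For median() the same trick can be applied by hand: look the method up once and call it directly inside j, skipping the S3 dispatch in every group. A minimal sketch (fmed is a name introduced here; reuses DT from above):

fmed = getS3method("median", "default")                        # bind the method once
system.time(DT[, list(fmed(x), fmed(y)), by=list(grp1,grp2)])  # no per-group dispatch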

4. Don't coerce j to list, use list() directly

 > n=10000
 > DT = data.table(grp=1:n,x=rnorm(10*n),y=rnorm(10*n))
 > setkey(DT,grp)
 > system.time(ans1<-DT[,as.list(cbind(sum(x),sum(y))),by=grp])
   user  system elapsed 
  1.472   0.016   1.494    # bad
 > system.time(ans2<-DT[,data.table(sum(x),sum(y)),by=grp])
   user  system elapsed 
 18.494   0.308  18.914    # worse
 > system.time(ans3<-DT[,data.frame(sum(x),sum(y)),by=grp])
   user  system elapsed 
 40.502   1.136  41.807    # terrible
 > setnames(ans3,names(ans1))
 > system.time(ans4<-DT[,list(sum(x),sum(y)),by=grp])
   user  system elapsed 
  0.176   0.004   0.181    # best
 > identical(ans1,ans2)
[1] TRUE
 > identical(ans1,ans3)
[1] TRUE
 > identical(ans1,ans4)
[1] TRUE

5. Don't use colSums, use lapply on .SD

 > n=10000
 > DT = data.table(grp=1:n,x=rnorm(10*n),y=rnorm(10*n))
 > setkey(DT,grp)
 > system.time(ans1<-DT[,as.list(colSums(.SD)),by=grp])
    user  system elapsed 
  1.296   0.000   1.298 #bad
 > system.time(ans2<-DT[,lapply(.SD,sum),by=grp])
  user  system elapsed 
  0.020   0.000   0.021 #best
 > identical(ans1,ans2)
[1] TRUE
 > system.time(ans3<-DT[,list(x=sum(x),y=sum(y)),by=grp])
    user  system elapsed 
  0.020   0.000   0.022  # similar speed, but longer to write
 > identical(ans1,ans3)
[1] TRUE
 