fast R lookup table

To match two data.frames on multiple columns you can use from base merge or match in combination with interaction, paste or use a list. It is also possible to map two integers to one, in a unique and deterministic way. A simple extension is the fastmatch library which can be faster than match from base. Also dplyr or data.table can be an option. Have also a look at: Matching more than 2 conditions, How to join (merge) data frames and Fast single item lookup.

library(fastmatch)
library(dplyr)
library(microbenchmark)
microbenchmark(times = 10L, setup = gc(), check = "equivalent"
 , merge = merge(lookMeUp, lookupTable, all.x=TRUE, sort=FALSE)
 , dplyr = left_join(lookMeUp, lookupTable, by = c("i1", "i2"))
 , inter = cbind(lookMeUp, lookupTable[match(interaction(lookMeUp[c("i1","i2")])
                 , interaction(lookupTable[c("i1","i2")])), 3:4])
 , paste = cbind(lookMeUp, lookupTable[match(paste(lookMeUp$i1, lookMeUp$i2)
                 , paste(lookupTable$i1, lookupTable$i2)), 3:4])
 , int = cbind(lookMeUp, lookupTable[match(lookMeUp$i1 + lookMeUp$i2 * max(lookupTable$i1)
                 , lookupTable$i1 + lookupTable$i2 * max(lookupTable$i1)), 3:4])
 , fInter = cbind(lookMeUp, lookupTable[fmatch(interaction(lookMeUp[c("i1","i2")])
                 , interaction(lookupTable[c("i1","i2")])), 3:4])
 , fPaste = cbind(lookMeUp, lookupTable[fmatch(paste(lookMeUp$i1, lookMeUp$i2)
                  , paste(lookupTable$i1, lookupTable$i2)), 3:4])
 , fint = cbind(lookMeUp, lookupTable[fmatch(lookMeUp$i1 + lookMeUp$i2 * max(lookMeUp$i1)
                 , lookupTable$i1 + lookupTable$i2 * max(lookMeUp$i1)), 3:4])
)
#Unit: milliseconds
#   expr        min         lq       mean     median         uq        max neval
#  merge 2547.72575 2564.72138 2590.03400 2578.14307 2585.01870 2735.23435    10
#  dplyr  690.55046  695.56161  703.01335  703.95085  707.32141  714.00890    10
#  inter  511.86378  514.36418  528.73905  529.14331  535.33359  552.20183    10
#  paste  750.01340  763.84494  942.47309  777.73232 1273.83380 1377.00192    10
#    int   71.56913   72.15233   73.27748   72.92613   73.89630   77.01510    10
# fInter  447.82012  450.00472  459.51196  455.82473  464.85767  491.52366    10
# fPaste  713.68824  719.60794  796.94680  726.70971  788.36997 1316.64071    10
#   fint   59.04541   59.13039   60.95638   60.59758   62.58539   63.65308    10

Instead of creating the unique identifier each time you make a look up, you can store it in the lookup table, what will make the lookup faster but you have an overhead in creating it. You can also sort the lookup table by this identifier which will allow accessing the data line without using match but this method will add not defined rows in case there are some combinations missing, what will be equivalent in creation a matrix or array. You can also use the build in hash for looking up variables in an environment. Also the binary search from findInterval can be used.

system.time({maxLTi1 <- max(lookupTable$i1); lookupTable$id <- lookupTable$i1 + lookupTable$i2 * maxLTi1})
#       User      System verstrichen 
#      0.006       0.000       0.006 

system.time(fmatch(c(lookupTable$id[1], 0), lookupTable$id))  #Create Hash
#       User      System verstrichen 
#      0.056       0.000       0.056 
#system.time(fmatch(lookupTable$id[1], lookupTable$id))  #Create Hash in case you have only matches
#       User      System verstrichen 
#      0.016       0.004       0.020 

system.time({
lookupTableS <- lookupTable[0,]
lookupTableS[lookupTable$id,] <- lookupTable #Sort Table with gaps
})
#       User      System verstrichen 
#      0.080       0.011       0.091 

system.time({
lookupTableS2 <- lookupTable[order(lookupTable$id),] #Sort Table
})
#       User      System verstrichen 
#      0.074       0.000       0.074 

library(Matrix)
system.time({ #Sorted Sparse Vector
  i <- order(lookupTable$id)
  lookupTableS3 <- sparseVector(i, lookupTable$id[i], max(lookupTable$id))})
#       User      System verstrichen 
#      0.057       0.008       0.065 

system.time(lupEnv <- list2env(setNames(as.list(seq_len(nrow(lookupTable))), paste(lookupTable$i1, lookupTable$i2))))
#       User      System verstrichen 
#      4.824       0.056       4.880 

library(data.table);
lookupTableDT <- as.data.table(copy(lookupTable))
lookMeUpDT <- as.data.table(copy(lookMeUp))
system.time(setkey(lookupTableDT, i1, i2))
#       User      System verstrichen 
#      0.094       0.000       0.027 

lookMeUpDT$id <- lookMeUp$i1 + lookMeUp$i2 * max(lookupTable$i1)
lookupTableDTId <- as.data.table(copy(lookupTable))
system.time(setkey(lookupTableDTId, id))
#       User      System verstrichen 
#      0.091       0.000       0.026 

lookMeUpDTId <- copy(lookMeUpDT)
lookMeUpDTId$row <- seq_len(nrow(lookMeUpDTId))
setkey(lookMeUpDTId, id)

microbenchmark(times = 10L, setup = gc(), check = "equivalent"
 , int = cbind(lookMeUp, lookupTable[match(lookMeUp$i1 + lookMeUp$i2 * max(lookupTable$i1)
                 , lookupTable$i1 + lookupTable$i2 * max(lookupTable$i1)), 3:4])
 , fint = cbind(lookMeUp, lookupTable[fmatch(lookMeUp$i1 + lookMeUp$i2 * max(lookMeUp$i1)
                 , lookupTable$i1 + lookupTable$i2 * max(lookMeUp$i1)), 3:4])
 , id = cbind(lookMeUp, lookupTable[match(lookMeUp$i1 + lookMeUp$i2 * maxLTi1
                 , lookupTable$id), 3:4])
 , sparid = {i <- lookMeUp$i1 + lookMeUp$i2 * maxLTi1
   j <- i
   j[i>0] <- as.vector(lookupTableS3[i[i>0]])
   cbind(lookMeUp, lookupTable[ifelse(j>0,j,NA), 3:4])}
 , DT = merge(lookMeUpDT[,1:3], lookupTableDT[,1:4], by=c("i1", "i2"), all.x=TRUE, sort = FALSE)
 , DTid = merge(lookMeUpDT, lookupTableDTId[,-2:-1], by=c("id"), all.x=TRUE, sort = FALSE)[,-1]
 , DiIdKey = merge(lookMeUpDTId, lookupTableDTId[,-2:-1], all.x=TRUE, sort = FALSE)[order(row),][,c(-1,-5)]
 , findInt = {i <- lookMeUp$i1 + lookMeUp$i2 * maxLTi1
    j  <- findInterval(i, lookupTableS2$id)
    j[j==0]  <- NA
    j[i != lookupTableS2$id[j]] <- NA
    cbind(lookMeUp, lookupTableS2[j, 3:4])}
 , envir = cbind(lookMeUp, lookupTable[vapply(paste(lookMeUp$i1, lookMeUp$i2), function(i) {x  <- lupEnv[[i]]; if(is.null(x)) NA else x}, 1), 3:4])
 , fid = cbind(lookMeUp, lookupTable[fmatch(lookMeUp$i1 + lookMeUp$i2 * maxLTi1
                 , lookupTable$id), 3:4])
 , sid = cbind(lookMeUp, lookupTableS[ifelse(lookMeUp$i1 > 0, lookMeUp$i1 + lookMeUp$i2 * maxLTi1, NA), 3:4])
)
#Unit: microseconds
#    expr       min        lq       mean     median        uq       max neval
#     int 75167.977 76446.819 77817.3349 77958.9650 78649.235 80656.715    10
#    fint 63332.436 63948.769 64574.8881 64194.2765 64942.559 66808.193    10
#      id 68198.639 69293.551 70477.6062 70223.0505 71393.354 74951.007    10
#  sparid  9181.928  9217.312  9552.0241  9478.8475  9561.917 10895.649    10
#      DT  4990.075  5000.857  5125.6716  5051.4970  5157.057  5547.220    10
#    DTid  4167.229  4189.703  4250.0804  4232.8955  4289.718  4440.924    10
# DiIdKey  4547.589  4582.915  4626.9514  4597.6790  4634.311  4867.630    10
# findInt  2795.560  2813.100  2854.7069  2815.4890  2857.084  3097.120    10
#   envir   526.971   530.459   537.5767   532.9755   546.402   551.231    10
#     fid   424.790   425.218   433.7295   433.3335   441.673   444.026    10
#     sid   436.135   439.688   445.1770   441.5705   445.331   464.685    10

#In case order and columns need not be like the others
microbenchmark(times = 10L, setup = gc(), unit = "us",
 DiIdKey = merge(lookMeUpDTId, lookupTableDTId, all.x=TRUE, sort = FALSE))
#Unit: microseconds
#    expr      min      lq     mean   median       uq     max neval
# DiIdKey 1692.629 1706.14 1719.556 1717.142 1722.067 1778.88    10

Creating a unique identifier and store it in the lookup table and using fmatch could be recommended. In pure base the lookup table could be sorted by the ID and missing combinations will be filled with NA what allows direct access to the matching rows without using match. Alternatively the lookup can be done in an environment where the build in hash search is used but this has much overhead. Also using findInterval shows good results.

In case the columns are not (positive) integer cast them to factor and use their integer values.

Data:

set.seed(7)
sqrtN  <- 1e3
lookupTable <- data.frame(expand.grid(i1=seq_len(sqrtN), i2=seq_len(sqrtN)), v1=seq_len(sqrtN*sqrtN))[sample(sqrtN*sqrtN),]
lookupTable$v2  <- seq_len(sqrtN*sqrtN)

lookMeUp <- rbind(lookupTable[sample(nrow(lookupTable), 10), 1:2], c(-1, -1))
lookMeUp$vA <- letters[1:nrow(lookMeUp)]

Timings of lookuptable with 5e7 rows:

sqrtN  <- 7.1e3
lookupTable <- data.frame(expand.grid(i1=seq_len(sqrtN), i2=seq_len(sqrtN)), v1=seq_len(sqrtN*sqrtN))[sample(sqrtN*sqrtN),]
lookupTable$v2  <- seq_len(sqrtN*sqrtN)

lookMeUp <- rbind(lookupTable[sample(nrow(lookupTable), 10), 1:2], c(-1, -1))
lookMeUp$vA <- letters[1:nrow(lookMeUp)]

system.time({maxLTi1 <- max(lookupTable$i1); lookupTable$id <- lookupTable$i1 + lookupTable$i2 * maxLTi1})
#       User      System verstrichen 
#      0.312       0.016       0.329 

system.time(lookupTable <- lookupTable[order(lookupTable$id),]) #For findIntervall
#       User      System verstrichen 
#      6.786       0.120       6.905 

system.time({
i <- lookMeUp$i1 + lookMeUp$i2 * maxLTi1
j  <- findInterval(i, lookupTable$id)
j[j==0]  <- NA
j[i != lookupTable$id[j]] <- NA
cbind(lookMeUp, lookupTable[j, 3:4])
})
#       User      System verstrichen 
#      0.099       0.048       0.147 

system.time(fmatch(c(lookupTable$id[1], 0), lookupTable$id)) #Create Hash
#       User      System verstrichen 
#      2.642       0.120       2.762 

system.time(cbind(lookMeUp, lookupTable[fmatch(lookMeUp$i1 + lookMeUp$i2 * maxLTi1, lookupTable$id), 3:4]))
#       User      System verstrichen 
#          0           0           0 

Tags:

R