Can dplyr package be used for conditional mutating?

Use ifelse

df %>%
  mutate(g = ifelse(a == 2 | a == 5 | a == 7 | (a == 1 & b == 4), 2,
               ifelse(a == 0 | a == 1 | a == 4 | a == 3 |  c == 4, 3, NA)))

Added - if_else: Note that in dplyr 0.5 there is an if_else function defined so an alternative would be to replace ifelse with if_else; however, note that since if_else is stricter than ifelse (both legs of the condition must have the same type) so the NA in that case would have to be replaced with NA_real_ .

df %>%
  mutate(g = if_else(a == 2 | a == 5 | a == 7 | (a == 1 & b == 4), 2,
               if_else(a == 0 | a == 1 | a == 4 | a == 3 |  c == 4, 3, NA_real_)))

Added - case_when Since this question was posted dplyr has added case_when so another alternative would be:

df %>% mutate(g = case_when(a == 2 | a == 5 | a == 7 | (a == 1 & b == 4) ~ 2,
                            a == 0 | a == 1 | a == 4 | a == 3 |  c == 4 ~ 3,
                            TRUE ~ NA_real_))

Added - arithmetic/na_if If the values are numeric and the conditions (except for the default value of NA at the end) are mutually exclusive, as is the case in the question, then we can use an arithmetic expression such that each term is multiplied by the desired result using na_if at the end to replace 0 with NA.

df %>%
  mutate(g = 2 * (a == 2 | a == 5 | a == 7 | (a == 1 & b == 4)) +
             3 * (a == 0 | a == 1 | a == 4 | a == 3 |  c == 4),
         g = na_if(g, 0))

Since you ask for other better ways to handle the problem, here's another way using data.table:

require(data.table)
setDT(df)
df[a %in% c(0,1,3,4) | c == 4, g := 3L]
df[a %in% c(2,5,7) | (a==1 & b==4), g := 2L]

Note the order of conditional statements is reversed to get g correctly. There's no copy of g made, even during the second assignment - it's replaced in-place.

On larger data this would have better performance than using nested if-else, as it can evaluate both 'yes' and 'no' cases, and nesting can get harder to read/maintain IMHO.


Here's a benchmark on relatively bigger data:

# NB: benchmark timings are as of R 3.1.0, data.table v1.9.2
require(data.table)
require(dplyr)
DT <- setDT(lapply(1:6, function(x) sample(7, 1e7, TRUE)))
setnames(DT, letters[1:6])
# > dim(DT) 
# [1] 10000000        6
DF <- as.data.frame(DT)

DT_fun <- function(DT) {
    DT[(a %in% c(0,1,3,4) | c == 4), g := 3L]
    DT[a %in% c(2,5,7) | (a==1 & b==4), g := 2L]
}
DPLYR_fun <- function(DF) {
    mutate(DF, g = ifelse(a %in% c(2,5,7) | (a==1 & b==4), 2L, 
           ifelse(a %in% c(0,1,3,4) | c==4, 3L, NA_integer_)))
}

BASE_fun <- function(DF) { # R v3.1.0
    transform(DF, g = ifelse(a %in% c(2,5,7) | (a==1 & b==4), 2L, 
              ifelse(a %in% c(0,1,3,4) | c==4, 3L, NA_integer_)))
}

system.time(ans1 <- DT_fun(DT))
#   user  system elapsed 
#  2.659   0.420   3.107 

system.time(ans2 <- DPLYR_fun(DF))
#   user  system elapsed 
# 11.822   1.075  12.976 

system.time(ans3 <- BASE_fun(DF))
#   user  system elapsed 
# 11.676   1.530  13.319 

identical(as.data.frame(ans1), as.data.frame(ans2))
# [1] TRUE

identical(as.data.frame(ans1), as.data.frame(ans3))
# [1] TRUE

Not sure if this is an alternative you'd asked for, but I hope it helps.