# Fill NAs in R with zero if the next valid data point is more than 2 intervals away

Update -

Here's probably one of the simplest and fastest solutions (Thanks to answer from G. Grothendieck). Simply knowing whether the value is NA on either side of any NA is sufficient information. Therefore, using lead and lag from dplyr package -

na2zero <- function(x) {
x[is.na(lag(x, 1, 0)) & is.na(lead(x, 1, 0)) & is.na(x)] <- 0
x
}

na2zero(x = c(3, 4, NA, NA, NA, 3, 3))
  3  4 NA  0 NA  3  3

na2zero(x = c(3, 4, NA, NA, NA, NA, NA, 3, 3))
  3  4 NA  0  0  0 NA  3  3

na2zero(x = c(3, 4, NA, NA, NA, 3, 3, NA, NA, 1, NA, 0, 0, rep(NA, 4L)))
  3  4 NA  0 NA  3  3 NA NA  1 NA  0  0 NA  0  0 NA


Here's one way using rle and replace from base R. This method turns every NA, that is not an endpoint in the running length, into a 0 -

na2zero <- function(x) {
run_lengths <- rle(is.na(x))\$lengths
replace(x,
sequence(run_lengths) != 1 &
sequence(run_lengths) != rep(run_lengths, run_lengths) &
is.na(x),
0)
}

na2zero(x = c(3, 4, NA, NA, NA, 3, 3))
  3  4 NA  0 NA  3  3

na2zero(x = c(3, 4, NA, NA, NA, NA, NA, 3, 3))
  3  4 NA  0  0  0 NA  3  3


Updated Benchmarks -

set.seed(2)
x <- c(3, 4, NA, NA, NA, 3, 3)
x <- sample(x, 1e5, T)

microbenchmark(
Rui(x),
Shree_old(x), Shree_new(x),
markus(x),
IceCreamT(x),
Uwe1(x), Uwe2(x), Uwe_Reduce(x),
Grothendieck(x),
times = 50
)

all.equal(Shree_dplyr(x), Rui(x)) #  TRUE
all.equal(Shree_dplyr(x), Shree_rle(x)) #  TRUE
all.equal(Shree_dplyr(x), markus(x)) #  TRUE
all.equal(Shree_dplyr(x), Uwe1(x)) #  TRUE
all.equal(Shree_dplyr(x), Uwe2(x)) #  TRUE
all.equal(Shree_dplyr(x), Uwe_Reduce(x)) #  TRUE
all.equal(Shree_dplyr(x), Grothendieck(x)) #  TRUE

Unit: milliseconds
expr        min         lq        mean     median          uq        max neval
Rui(x) 286.026540 307.586604  342.620266 318.404731  363.844258  518.03330    50
Shree_rle(x)  51.556489  62.038875   85.348031  65.012384   81.882141  327.57514    50
Shree_dplyr(x)   3.996918   4.258248   17.210709   6.298946   10.335142  207.14732    50
markus(x) 853.513854 885.419719 1001.450726 919.930389 1018.353847 1642.25435    50
IceCreamT(x)  12.162079  13.773873   22.555446  15.021700   21.271498  199.08993    50
Uwe1(x) 162.536980 183.566490  225.801038 196.882049  269.020395  439.17737    50
Uwe2(x)  83.582360  93.136277  115.608342  99.165997  115.376903  309.67290    50
Uwe_Reduce(x)   1.732195   1.871940    4.215195   2.016815    4.842883   25.91542    50
Grothendieck(x) 620.814291 688.107779  767.749387 746.699435  850.442643  982.49094    50


PS: Do check out TiredSquirell's answer which seems like a base version of Uwe's lead-lag answer but is somewhat faster (not benchmarked above).

Maybe there are simpler solutions but this one works.

na2zero <- function(x){
ave(x, cumsum(abs(c(0, diff(is.na(x))))), FUN = function(y){
if(anyNA(y)){
if(length(y) > 2) y[-c(1, length(y))] <- 0
}
y
})
}

na2zero(x)
#  3  4 NA  0 NA  3  3

X <- list(x, c(x, x), c(3, 4, NA, NA, NA, NA, 3, 3))
lapply(X, na2zero)


Here's a data.table option

library(data.table)

na0_dt <- function(x){
replace(x, rowid(r <- rleid(xna <- is.na(x))) > 1 & rev(rowid(rev(r))) > 1 & xna, 0)
}