Count Trailing and Leading NA for each vector

1) Cumsum - An option would be to create a logical vector with cumsum on the presence of non-NA elements and get the sum (base R - No packages used)

f1 <- function(vec, trail = FALSE) {
  if(trail) {
     vec <- rev(vec)
    }
    sum(!cumsum(!is.na(vec)))
 }

f1(v1)
#[1] 3
f1(v1, TRUE)
#[1] 0

sapply(mget(paste0("v", 1:3)), f1)
#  v1 v2 v3 
# 3  2  0 
sapply(mget(paste0("v", 1:3)), f1, TRUE)  
#  v1 v2 v3 
#  0  2  5 

2 rle - Another base R option is rle (No packages are used)

with(rle(is.na(v2)), lengths[values & seq_along(values) %in% c(1, length(values))])

Wrapper over which.max:

leading.nas <- function(x) {
  if (length(x) == 0) {
   0L 
  }
  else {
    which.min(!is.na(x)) - 1
  }
}

This turns out to be similar to @Bulat's solution

count_nas <- function(x) {
  nas <- is.na(x)

  if (sum(nas) == length(x)) {
    warning('all elements were NA')
    return(c(start_na = NA_integer_, end_na = NA_integer_))
  }

  c(start_na = which.min(nas) - 1,
    end_na = which.min(rev(nas)) - 1)
}

count_nas(v1)
#start_na   end_na 
#       3        0 

sapply(list(v1,v2,v3), count_nas)
#         [,1] [,2] [,3]
#start_na    3    2    0
#end_na      0    2    5

As far as performance, this is the fastest method with @akrun's methods being in the ballpark.

v_test3 <- sample(10000)
v_test3[c(1:3, 9998:10000)] <- NA_integer_

Unit: microseconds
             expr     min       lq      mean   median       uq     max neval
     akrun_cumsum   175.7   182.15   193.580   186.55   200.80   354.7   100
        akrun_rle   168.6   199.25   210.635   209.25   221.00   289.3   100
    g_grothen_zoo  1848.5  1904.45  2008.994  1941.40  2001.35  4799.6   100
 g_grothen_reduce 12467.3 12888.10 14174.157 13445.15 15054.35 28241.6   100
        www_rleid  5357.2  5439.40  5741.471  5517.15  5947.15  8470.4   100
   bulat_and_cole    63.5    66.45    73.681    71.25    75.75    96.9   100

Code for reproducibility:

library(microbenchmark)
library(zoo)
library(data.table)

v_test3 <- sample(10000)
v_test3[c(1:3, 9998:10000)] <- NA_integer_

count_nas <- function(x) {
  nas <- is.na(x)

  if (sum(nas) == length(x)) {
    warning('all elements were NA')
    return(c(start_na = NA_integer_, end_na = NA_integer_))
  }

  c(start_na = which.min(nas) - 1,
    end_na = which.min(rev(nas)) - 1)
}

countNA <- function(x) {
  len <- function(fromLast = FALSE) length(na.locf(x, fromLast = fromLast))
  if (all(is.na(x))) c(left = NA, right = NA)
  else length(x) - c(left = len(), right = len(TRUE))
}

f1 <- function(vec, trail = FALSE) {
  if(trail) {
    vec <- rev(vec)
  }
  sum(!cumsum(!is.na(vec)))
}

count_fun <- function(x){
  y <- rleid(x)
  z <- split(x, y)[c(1, length(unique(y)))]
  ans <- sapply(z, function(x) sum(is.na(x)))
  return(unname(ans))
}

countNA2 <- function(x) {
  f <- function(x) sum(Reduce(all, is.na(x), acc = TRUE))
  if (all(is.na(x))) c(left = NA, right = NA)
  else c(left = f(x), right = f(rev(x)))
}

microbenchmark(
  akrun_cumsum = {
    f1(v_test3, TRUE)
    f1(v_test3, FALSE)
  }
  , 
  akrun_rle = {
    with(rle(is.na(v_test3)), lengths[values & seq_along(values) %in% c(1, length(values))])
  }
  ,
  g_grothen_zoo = {
    countNA(v_test3)
  }
  ,
  g_grothen_reduce = {
    countNA2(v_test3)
  }
  ,
  www_rleid = {
    count_fun(v_test3)
  }
  ,
  bulat_and_cole = {
    count_nas(v_test3)
  }
)

Tags:

R

Vector

Na