# numpy 1D array: mask elements that repeat more than n times

Approach #1 : Here's a vectorized way -

from scipy.ndimage.morphology import binary_dilation

def keep_N_per_group(a, N):
k = np.ones(N,dtype=bool)
m = np.r_[True,a[:-1]!=a[1:]]
return a[binary_dilation(m,k,origin=-(N//2))]


Sample run -

In [42]: a
Out[42]: array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])

In [43]: keep_N_per_group(a, N=3)
Out[43]: array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5])


Approach #2 : A bit more compact version -

def keep_N_per_group_v2(a, N):
k = np.ones(N,dtype=bool)
return a[binary_dilation(np.ediff1d(a,to_begin=a[0])!=0,k,origin=-(N//2))]


Approach #3 : Using the grouped-counts and np.repeat (won't give us the mask though) -

def keep_N_per_group_v3(a, N):
m = np.r_[True,a[:-1]!=a[1:],True]
idx = np.flatnonzero(m)
c = np.diff(idx)
return np.repeat(a[idx[:-1]],np.minimum(c,N))


Approach #4 : With a view-based method -

from skimage.util import view_as_windows

def keep_N_per_group_v4(a, N):
m = np.r_[True,a[:-1]!=a[1:]]
w = view_as_windows(m,N)
idx = np.flatnonzero(m)
v = idx<len(w)
w[idx[v]] = 1
if v.all()==0:
m[idx[v.argmin()]:] = 1
return a[m]


Approach #5 : With a view-based method without indices from flatnonzero -

def keep_N_per_group_v5(a, N):
m = np.r_[True,a[:-1]!=a[1:]]
w = view_as_windows(m,N)
last_idx = len(a)-m[::-1].argmax()-1
w[m[:-N+1]] = 1
m[last_idx:last_idx+N] = 1
return a[m]


I want to present a solution using numba which should be fairly easy to understand. I assume that you want to "mask" consecutive repeating items:

import numpy as np
import numba as nb

@nb.njit

current = arr[0]
count = 0
for idx, item in enumerate(arr):
if item == current:
count += 1
else:
current = item
count = 1


For example:

>>> bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])
array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5])
array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5])


## Performance:

Using simple_benchmark - however I haven't included all approaches. It's a log-log scale:

It seems like the numba solution cannot beat the solution from Paul Panzer which seems to be faster for large arrays by a bit (and doesn't require an additional dependency).

However both seem to outperform the other solutions, but they do return a mask instead of the "filtered" array.

import numpy as np
import numba as nb
from simple_benchmark import BenchmarkBuilder, MultiArgument

b = BenchmarkBuilder()

bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])

@nb.njit

current = arr[0]
count = 0
for idx, item in enumerate(arr):
if item == current:
count += 1
else:
current = item
count = 1

def MSeifert(arr, n):

from scipy.ndimage.morphology import binary_dilation

def Divakar_1(a, N):
k = np.ones(N,dtype=bool)
m = np.r_[True,a[:-1]!=a[1:]]
return a[binary_dilation(m,k,origin=-(N//2))]

def Divakar_2(a, N):
k = np.ones(N,dtype=bool)
return a[binary_dilation(np.ediff1d(a,to_begin=a[0])!=0,k,origin=-(N//2))]

def Divakar_3(a, N):
m = np.r_[True,a[:-1]!=a[1:],True]
idx = np.flatnonzero(m)
c = np.diff(idx)
return np.repeat(a[idx[:-1]],np.minimum(c,N))

from skimage.util import view_as_windows

def Divakar_4(a, N):
m = np.r_[True,a[:-1]!=a[1:]]
w = view_as_windows(m,N)
idx = np.flatnonzero(m)
v = idx<len(w)
w[idx[v]] = 1
if v.all()==0:
m[idx[v.argmin()]:] = 1
return a[m]

def Divakar_5(a, N):
m = np.r_[True,a[:-1]!=a[1:]]
w = view_as_windows(m,N)
last_idx = len(a)-m[::-1].argmax()-1
w[m[:-N+1]] = 1
m[last_idx:last_idx+N] = 1
return a[m]

def PaulPanzer(a,N):

import random

def argument_provider():
for exp in range(2, 20):
size = 2**exp
yield size, MultiArgument([np.array([random.randint(0, 5) for _ in range(size)]), 3])

r = b.run()
import matplotlib.pyplot as plt

plt.figure(figsize=[10, 8])
r.plot()


Disclaimer: this is just a sounder implementation of @FlorianH's idea:

def f(a,N):


For larger arrays this makes a huge difference:

a = np.arange(1000).repeat(np.random.randint(0,10,1000))
N = 3

print(timeit(lambda:f(a,N),number=1000)*1000,"us")
# 5.443050000394578 us

# compare to
print(timeit(lambda:[True for _ in range(N)] + list(bins[:-N] != bins[N:]),number=1000)*1000,"us")
# 76.18969900067896 us