-
-
Notifications
You must be signed in to change notification settings - Fork 11.9k
Open
Description
`np.unique` has an `axis` option that makes it possible, for example, to find the unique rows of a matrix. I noticed, however, that it is quite slow: creating a view of the data in which each row is a single item and calling `unique` on that view is faster by a factor of 3.
MVCE:
import numpy
import perfplot
def unique_axis(data):
    """Return the unique rows of ``data`` using ``numpy.unique``'s ``axis`` option.

    This is the baseline kernel of the benchmark: it simply delegates to
    ``numpy.unique(data, axis=0)``.

    Parameters
    ----------
    data : array_like
        Array whose slices along axis 0 (rows, for a 2-D array) are compared.

    Returns
    -------
    numpy.ndarray
        Whatever ``numpy.unique(data, axis=0)`` returns for ``data``.
    """
    return numpy.unique(data, axis=0)
def unique_row_view(data):
    """Return the unique rows of a 2-D array by viewing each row as one item.

    Each row is reinterpreted as a single opaque ``numpy.void`` element whose
    size is ``itemsize * number_of_columns``. ``numpy.unique`` is then run on
    that 1-item-per-row view, and the result is viewed back as the original
    dtype and reshaped to have the original number of columns.

    Rows are compared through their raw bytes, so the order of the returned
    rows follows that byte-wise comparison rather than numeric order.

    Parameters
    ----------
    data : array_like
        Two-dimensional input. It is converted with
        ``numpy.ascontiguousarray`` first, because the per-row view requires
        C-contiguous memory.

    Returns
    -------
    numpy.ndarray
        Array with the same dtype and column count as the converted input,
        containing each distinct row once.

    Raises
    ------
    ValueError
        If the converted input is not two-dimensional.
    """
    # Convert before touching ``dtype``/``shape`` so array-likes (e.g. nested
    # lists) are accepted, and so the same array is used for sizing and viewing.
    rows = numpy.ascontiguousarray(data)
    if rows.ndim != 2:
        raise ValueError(
            "unique_row_view() only makes sense for 2D arrays, "
            "got %dd" % rows.ndim
        )
    n_cols = rows.shape[1]
    # One void element spans exactly one row's worth of bytes.
    row_dtype = numpy.dtype((numpy.void, rows.dtype.itemsize * n_cols))
    packed = rows.view(row_dtype)
    return numpy.unique(packed).view(rows.dtype).reshape(-1, n_cols)
def unique_scikit(ar):
    """Return the unique rows of a 2-D array via a per-row byte-string view.

    Each row is viewed as a single fixed-length byte string, ``numpy.unique``
    is asked for the index of the first occurrence of every distinct string,
    and those indices are used to pick the corresponding rows out of the
    array.

    Parameters
    ----------
    ar : array_like
        Two-dimensional input. It is converted with
        ``numpy.ascontiguousarray`` before use.

    Returns
    -------
    numpy.ndarray
        The rows of the converted input selected by the unique-row indices.

    Raises
    ------
    ValueError
        If the converted input is not two-dimensional.
    """
    # Convert first: the dimensionality check below needs ``.ndim`` (so plain
    # nested lists would otherwise fail), and the row view further down only
    # works if the array is C-contiguous.
    ar = numpy.ascontiguousarray(ar)
    if ar.ndim != 2:
        raise ValueError("unique_rows() only makes sense for 2D arrays, "
                         "got %dd" % ar.ndim)
    # np.unique() finds identical items in a raveled array. To make it
    # see each row as a single item, we create a view of each row as a
    # byte string of length itemsize times number of columns in `ar`.
    ar_row_view = ar.view('|S%d' % (ar.itemsize * ar.shape[1]))
    _, unique_row_indices = numpy.unique(ar_row_view, return_index=True)
    return ar[unique_row_indices]
# Benchmark the three kernels on random integer arrays of shape (n, 2) with
# values drawn from [0, 100), for n = 2**0 ... 2**19, and save the resulting
# plot to "unique.png".
perfplot.save(
    "unique.png",
    setup=lambda n: numpy.random.randint(0, 100, (n, 2)),
    kernels=[unique_axis, unique_row_view, unique_scikit],
    n_range=[2 ** k for k in range(20)],
)
# NOTE: the scraped page had the text "rubenvereecken, pdemarti and HansBrende"
# fused onto the closing parenthesis above; it is not part of the script.
