Skip to content

Commit 6d278bd

Browse files
committed
BUG: use stable sort for group_index in groupby
1 parent 71bfb00 commit 6d278bd

File tree

1 file changed

+12
-12
lines changed

1 file changed

+12
-12
lines changed

pandas/core/groupby.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3431,13 +3431,7 @@ def _indexer_from_factorized(labels, shape, compress=True):
34313431
comp_ids = group_index
34323432
max_group = com._long_prod(shape)
34333433

3434-
if max_group > 1e6:
3435-
# Use mergesort to avoid memory errors in counting sort
3436-
indexer = comp_ids.argsort(kind='mergesort')
3437-
else:
3438-
indexer, _ = _algos.groupsort_indexer(comp_ids.astype(np.int64),
3439-
max_group)
3440-
3434+
indexer = _get_group_index_sorter(comp_ids.astype(np.int64), max_group)
34413435
return indexer
34423436

34433437

@@ -3560,21 +3554,27 @@ def _get_indices_dict(label_list, keys):
35603554

35613555
def _get_group_index_sorter(group_index, ngroups):
35623556
"""
3563-
_algos.groupsort_indexer is at least O(ngroups), where
3557+
_algos.groupsort_indexer implements `counting sort`, which is at least
3558+
O(ngroups), where
35643559
ngroups = prod(shape)
35653560
shape = map(len, keys)
35663561
that is, linear in the number of combinations (cartesian product) of unique
35673562
values of groupby keys. This can be huge when doing multi-key groupby.
3568-
np.argsort is O(count)^2 when using quicksort (the default) where count is the length
3569-
of the data-frame;
3563+
np.argsort(kind='mergesort') is O(count x log(count)) where count is the
3564+
length of the data-frame;
3565+
Both algorithms are `stable` sorts, which is necessary for correctness of
3566+
groupby operations; e.g., consider:
3567+
df.groupby(key)[col].transform('first')
35703568
"""
35713569
count = len(group_index)
3572-
if ngroups < count * np.log(count): # taking complexities literally
3570+
alpha = 0.0 # taking complexities literally; there may be
3571+
beta = 1.0 # some room for fine-tuning these parameters
3572+
if alpha + beta * ngroups < count * np.log(count):
35733573
sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index),
35743574
ngroups)
35753575
return com._ensure_platform_int(sorter)
35763576
else:
3577-
return group_index.argsort()
3577+
return group_index.argsort(kind='mergesort')
35783578

35793579

35803580
def _compress_group_index(group_index, sort=True):

0 commit comments

Comments
 (0)