@@ -3431,13 +3431,7 @@ def _indexer_from_factorized(labels, shape, compress=True):
3431
3431
comp_ids = group_index
3432
3432
max_group = com ._long_prod (shape )
3433
3433
3434
- if max_group > 1e6 :
3435
- # Use mergesort to avoid memory errors in counting sort
3436
- indexer = comp_ids .argsort (kind = 'mergesort' )
3437
- else :
3438
- indexer , _ = _algos .groupsort_indexer (comp_ids .astype (np .int64 ),
3439
- max_group )
3440
-
3434
+ indexer = _get_group_index_sorter (comp_ids .astype (np .int64 ), max_group )
3441
3435
return indexer
3442
3436
3443
3437
@@ -3560,21 +3554,27 @@ def _get_indices_dict(label_list, keys):
3560
3554
3561
3555
def _get_group_index_sorter (group_index , ngroups ):
3562
3556
"""
3563
- _algos.groupsort_indexer is at least O(ngroups), where
3557
+ _algos.groupsort_indexer implements `counting sort` and it is at least
3558
+ O(ngroups), where
3564
3559
ngroups = prod(shape)
3565
3560
shape = map(len, keys)
3566
3561
that is, linear in the number of combinations (cartesian product) of unique
3567
3562
values of groupby keys. This can be huge when doing multi-key groupby.
3568
- np.argsort is O(count)^2 when using quicksort (the default) where count is the length
3569
- of the data-frame;
3563
+ np.argsort(kind='mergesort') is O(count x log(count)) where count is the
3564
+ length of the data-frame;
3565
+ Both algorithms are `stable` sort and that is necessary for correctness of
3566
+ groupby operations. e.g. consider:
3567
+ df.groupby(key)[col].transform('first')
3570
3568
"""
3571
3569
count = len (group_index )
3572
- if ngroups < count * np .log (count ): # taking complexities literally
3570
+ alpha = 0.0 # taking complexities literally; there may be
3571
+ beta = 1.0 # some room for fine-tuning these parameters
3572
+ if alpha + beta * ngroups < count * np .log (count ):
3573
3573
sorter , _ = _algos .groupsort_indexer (com ._ensure_int64 (group_index ),
3574
3574
ngroups )
3575
3575
return com ._ensure_platform_int (sorter )
3576
3576
else :
3577
- return group_index .argsort ()
3577
+ return group_index .argsort (kind = 'mergesort' )
3578
3578
3579
3579
3580
3580
def _compress_group_index (group_index , sort = True ):
0 commit comments