@@ -484,3 +484,75 @@ def f(g):
484484
# Aggregation with a list of Python builtins (sum, min, max): first grouped
# by the single key 'jim', then by the key pair ['jim', 'joe'].  Both use
# whichever `setup` string is bound at this point in the module.
groupby_agg_builtins1 = Benchmark(
    "df.groupby('jim').agg([sum, min, max])",
    setup
)
groupby_agg_builtins2 = Benchmark(
    "df.groupby(['jim', 'joe']).agg([sum, min, max])",
    setup
)
487+
488+ #----------------------------------------------------------------------
489+ # groupby with a large value for ngroups
490+
# Setup source for the large-ngroups benchmarks, appended to `common_setup`
# and passed as the second argument to Benchmark further down (presumably
# executed before the timed statement -- confirm against Benchmark).
# The snippet seeds NumPy's RNG with 1234 and builds `df` with `size`
# (= ngroups * 10) rows in two columns:
#   'timestamp' -- values taken from arange(ngroups) at random positions
#   'value'     -- random integers drawn from [0, size)
# NOTE: this rebinds the module-level name `setup`; benchmarks constructed
# above this point were already given the previous value.
setup = common_setup + """
np.random.seed(1234)
ngroups = 10000
size = ngroups * 10
rng = np.arange(ngroups)
df = DataFrame(dict(
timestamp=rng.take(np.random.randint(0, ngroups, size=size)),
value=np.random.randint(0, size, size=size)
))
"""
501+
# Names of the groupby methods that are benchmarked with an empty argument
# list.  Written as a whitespace-separated block and split into a list of
# strings; the order below is the order in which benchmarks get registered.
no_arg_func_list = """
    all any count
    cumcount cummax cummin cumprod cumsum
    describe diff
    first head last
    mad max mean median min
    nunique pct_change prod rank
    sem size skew std sum
    tail unique
    var value_counts
""".split()
535+
536+
# Timed statement with one %s slot for the method call that is applied to
# the 'timestamp' column after grouping by 'value'.
_stmt_template = "df.groupby('value')['timestamp'].%s"

# Passed as `start_date` to every Benchmark built from the template.
START_DATE = datetime(year=2011, month=7, day=1)
539+
540+
def make_large_ngroups_bmark(func_name, func_args=''):
    """Build the large-ngroups benchmark for a single groupby method.

    Parameters
    ----------
    func_name : str
        Name of the method called on the grouped 'timestamp' column.
    func_args : str, optional
        Source text placed verbatim between the call's parentheses; the
        default empty string produces a no-argument call.

    Returns
    -------
    Benchmark
        Benchmark for ``_stmt_template`` filled with the call, using the
        module-level ``setup`` and ``START_DATE`` as they are bound when
        this function runs.  Its ``name`` attribute is set to
        ``'groupby_large_ngroups_<func_name>'``.
    """
    name = 'groupby_large_ngroups_%s' % func_name
    call_expr = '%s(%s)' % (func_name, func_args)
    benchmark = Benchmark(_stmt_template % call_expr, setup,
                          start_date=START_DATE)
    # The name MUST be set: inject_bmark_into_globals rejects a benchmark
    # whose name is empty.
    benchmark.name = name
    return benchmark
548+
549+
def inject_bmark_into_globals(bmark):
    """Bind *bmark* as a module-level variable named ``bmark.name``.

    An existing global with the same name is overwritten.

    Raises
    ------
    AssertionError
        If ``bmark.name`` is falsy (for example ``None`` or ``''``).
    """
    if bmark.name:
        # globals() here is this module's namespace, so the benchmark
        # becomes reachable as an ordinary module attribute.
        globals()[bmark.name] = bmark
    else:
        raise AssertionError('benchmark must have a name')
554+
555+
# Register one benchmark per no-argument groupby method under its own
# module-level name ('groupby_large_ngroups_<method>').
#
# The Benchmark is passed straight to the injector rather than being bound
# to a module-level temporary first: a leftover `bmark` global would be a
# second name for the last benchmark created, and a runner that gathers
# Benchmark objects by scanning module globals would then pick that
# benchmark up twice.  NOTE(review): confirm against the suite's collection
# logic.
for func_name in no_arg_func_list:
    inject_bmark_into_globals(make_large_ngroups_bmark(func_name))
0 commit comments