@@ -348,22 +348,6 @@ def _convert(value, T):
348348 raise
349349
350350
351- def _find_lteq (a , x ):
352- 'Locate the leftmost value exactly equal to x'
353- i = bisect_left (a , x )
354- if i != len (a ) and a [i ] == x :
355- return i
356- raise ValueError
357-
358-
359- def _find_rteq (a , l , x ):
360- 'Locate the rightmost value exactly equal to x'
361- i = bisect_right (a , x , lo = l )
362- if i != (len (a ) + 1 ) and a [i - 1 ] == x :
363- return i - 1
364- raise ValueError
365-
366-
367351def _fail_neg (values , errmsg = 'negative value' ):
368352 """Iterate over values, failing if any are less than zero."""
369353 for x in values :
@@ -628,57 +612,75 @@ def median_high(data):
628612
629613
630614def median_grouped (data , interval = 1 ):
631- """Return the 50th percentile (median) of grouped continuous data.
632-
633- >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
634- 3.7
635- >>> median_grouped([52, 52, 53, 54])
636- 52.5
637-
638- This calculates the median as the 50th percentile, and should be
639- used when your data is continuous and grouped. In the above example,
640- the values 1, 2, 3, etc. actually represent the midpoint of classes
641- 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
642- class 3.5-4.5, and interpolation is used to estimate it.
643-
644- Optional argument ``interval`` represents the class interval, and
645- defaults to 1. Changing the class interval naturally will change the
646- interpolated 50th percentile value:
647-
648- >>> median_grouped([1, 3, 3, 5, 7], interval=1)
649- 3.25
650- >>> median_grouped([1, 3, 3, 5, 7], interval=2)
651- 3.5
652-
653- This function does not check whether the data points are at least
654- ``interval`` apart.
615+ """Estimates the median for numeric data binned around the midpoints
616+ of consecutive, fixed-width intervals.
617+
618+ The *data* can be any iterable of numeric data with each value being
619+ exactly the midpoint of a bin. At least one value must be present.
620+
621+ The *interval* is the width of each bin.
622+
623+ For example, demographic information may have been summarized into
624+ consecutive ten-year age groups with each group being represented
625+ by the midpoint of its interval (25, 35, 45, and so on):
626+
627+ >>> demographics = Counter({
628+ ... 25: 172, # 20 to 30 years old
629+ ... 35: 484, # 30 to 40 years old
630+ ... 45: 387, # 40 to 50 years old
631+ ... 55: 22, # 50 to 60 years old
632+ ... 65: 6, # 60 to 70 years old
633+ ... })
634+
635+ The 50th percentile (median) is the 536th person out of the
636+ 1071-member cohort. That person is in the 30 to 40 year old age group.
637+
638+ The regular median() function would assume that everyone in the
639+ tricenarian age group was exactly 35 years old. A more tenable
640+ assumption is that the 484 members of that age group are evenly
641+ distributed between 30 and 40. For that, we use median_grouped().
642+
643+ >>> data = list(demographics.elements())
644+ >>> median(data)
645+ 35
646+ >>> round(median_grouped(data, interval=10), 1)
647+ 37.5
648+
649+ The caller is responsible for making sure the data points are separated
650+ by exact multiples of *interval*. This is essential for getting a
651+ correct result. The function does not check this precondition.
652+
655653 """
656654 data = sorted (data )
657655 n = len (data )
658656 if n == 0 :
659657 raise StatisticsError ("no median for empty data" )
660658 elif n == 1 :
661659 return data [0 ]
660+
662661 # Find the value at the midpoint. Remember this corresponds to the
663- # centre of the class interval.
662+ # midpoint of the class interval.
664663 x = data [n // 2 ]
664+
665+ # Generate a clear error message for non-numeric data
665666 for obj in (x , interval ):
666667 if isinstance (obj , (str , bytes )):
667- raise TypeError ('expected number but got %r' % obj )
668+ raise TypeError (f'expected a number but got { obj !r} ' )
669+
670+ # Using O(log n) bisection, find where all the x values occur in the data.
671+ # All x will lie within data[i:j].
672+ i = bisect_left (data , x )
673+ j = bisect_right (data , x , lo = i )
674+
675+ # Interpolate the median using the formula found at:
676+ # https://www.cuemath.com/data/median-of-grouped-data/
668677 try :
669678 L = x - interval / 2 # The lower limit of the median interval.
670679 except TypeError :
671- # Mixed type. For now we just coerce to float.
680+ # Coerce mixed types to float.
672681 L = float (x ) - float (interval ) / 2
673-
674- # Uses bisection search to search for x in data with log(n) time complexity
675- # Find the position of leftmost occurrence of x in data
676- l1 = _find_lteq (data , x )
677- # Find the position of rightmost occurrence of x in data[l1...len(data)]
678- # Assuming always l1 <= l2
679- l2 = _find_rteq (data , l1 , x )
680- cf = l1
681- f = l2 - l1 + 1
682+ cf = i # Cumulative frequency of the preceding interval
683+ f = j - i # Number of elements in the median interval
682684 return L + interval * (n / 2 - cf ) / f
683685
684686
0 commit comments