73 | 73 | 2.5 |
74 | 74 |
75 | 75 |
| 76 | +Statistics for relations between two inputs |
| 77 | +------------------------------------------- |
| 78 | +
| 79 | +================== ==================================================== |
| 80 | +Function Description |
| 81 | +================== ==================================================== |
| 82 | +covariance Sample covariance for two variables. |
| 83 | +correlation Pearson's correlation coefficient for two variables. |
| 84 | +linear_regression Intercept and slope for simple linear regression. |
| 85 | +================== ==================================================== |
| 86 | +
| 87 | +Calculate covariance, Pearson's correlation, and simple linear regression |
| 88 | +for two inputs: |
| 89 | +
| 90 | +>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] |
| 91 | +>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3] |
| 92 | +>>> covariance(x, y) |
| 93 | +0.75 |
| 94 | +>>> correlation(x, y) #doctest: +ELLIPSIS |
| 95 | +0.31622776601... |
| 96 | +>>> linear_regression(x, y) |
| 97 | +LinearRegression(intercept=1.5, slope=0.1) |
| 98 | +
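As a cross-check of the doctest values above (a side calculation, not part of the patch): covariance(x, y) is 0.75, stdev(x) is sqrt(7.5) and stdev(y) is sqrt(0.75), so r = 0.75 / sqrt(7.5 * 0.75) = 1 / sqrt(10) = 0.31622776601..., while the regression slope is cov / var(x) = 0.75 / 7.5 = 0.1 and the intercept is mean(y) - slope * mean(x) = 2 - 0.5 = 1.5. Using only the standard library:

>>> from math import isclose, sqrt
>>> isclose(0.75 / sqrt(7.5 * 0.75), 1 / sqrt(10))
True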
| 99 | +
76 | 100 | Exceptions |
77 | 101 | ---------- |
78 | 102 |
98 | 122 | 'quantiles', |
99 | 123 | 'stdev', |
100 | 124 | 'variance', |
| 125 | + 'correlation', |
| 126 | + 'covariance', |
| 127 | + 'linear_regression', |
101 | 128 | ] |
102 | 129 |
103 | 130 | import math |
110 | 137 | from bisect import bisect_left, bisect_right |
111 | 138 | from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum |
112 | 139 | from operator import itemgetter |
113 | | -from collections import Counter |
| 140 | +from collections import Counter, namedtuple |
114 | 141 |
115 | 142 | # === Exceptions === |
116 | 143 |
@@ -826,6 +853,113 @@ def pstdev(data, mu=None): |
826 | 853 | return math.sqrt(var) |
827 | 854 |
828 | 855 |
| 856 | +# === Statistics for relations between two inputs === |
| 857 | + |
| 858 | +# See https://en.wikipedia.org/wiki/Covariance |
| 859 | +# https://en.wikipedia.org/wiki/Pearson_correlation_coefficient |
| 860 | +# https://en.wikipedia.org/wiki/Simple_linear_regression |
| 861 | + |
| 862 | + |
| 863 | +def covariance(x, y, /): |
| 864 | + """Covariance |
| 865 | +
| 866 | + Return the sample covariance of two inputs *x* and *y*. Covariance |
| 867 | + is a measure of the joint variability of two inputs. |
| 868 | +
| 869 | + >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] |
| 870 | + >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3] |
| 871 | + >>> covariance(x, y) |
| 872 | + 0.75 |
| 873 | + >>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1] |
| 874 | + >>> covariance(x, z) |
| 875 | + -7.5 |
| 876 | + >>> covariance(z, x) |
| 877 | + -7.5 |
| 878 | +
| 879 | + """ |
| 880 | + n = len(x) |
| 881 | + if len(y) != n: |
| 882 | + raise StatisticsError('covariance requires that both inputs have the same number of data points') |
| 883 | + if n < 2: |
| 884 | + raise StatisticsError('covariance requires at least two data points') |
| 885 | + xbar = mean(x) |
| 886 | + ybar = mean(y) |
| 887 | + total = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) |
| 888 | + return total / (n - 1) |
| 889 | + |
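As a sanity check on the formula (an illustrative sketch, not part of the patch; it assumes an interpreter where these new functions are importable from statistics): the covariance of an input with itself is its sample variance, since both divide the same sum of squared deviations by n - 1.

>>> from math import isclose
>>> from statistics import covariance, variance
>>> data = [1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> isclose(covariance(data, data), variance(data))
True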
| 890 | + |
| 891 | +def correlation(x, y, /): |
| 892 | + """Pearson's correlation coefficient |
| 893 | +
| 894 | + Return Pearson's correlation coefficient for two inputs. Pearson's |
| 895 | + correlation coefficient *r* takes values between -1 and +1. It measures |
| 896 | + the strength and direction of the linear relationship, where +1 means a |
| 897 | + perfect positive linear relationship, -1 a perfect negative linear |
| 898 | + relationship, and 0 no linear relationship. |
| 899 | +
| 900 | + >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9] |
| 901 | + >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1] |
| 902 | + >>> correlation(x, x) |
| 903 | + 1.0 |
| 904 | + >>> correlation(x, y) |
| 905 | + -1.0 |
| 906 | +
| 907 | + """ |
| 908 | + n = len(x) |
| 909 | + if len(y) != n: |
| 910 | + raise StatisticsError('correlation requires that both inputs have the same number of data points') |
| 911 | + if n < 2: |
| 912 | + raise StatisticsError('correlation requires at least two data points') |
| 913 | + cov = covariance(x, y) |
| 914 | + stdx = stdev(x) |
| 915 | + stdy = stdev(y) |
| 916 | + try: |
| 917 | + return cov / (stdx * stdy) |
| 918 | + except ZeroDivisionError: |
| 919 | + raise StatisticsError('at least one of the inputs is constant') |
| 920 | + |
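One consequence of the cov / (stdx * stdy) formula above, shown as a hedged sketch rather than as part of the patch (again assuming the new functions are importable from statistics): r is unchanged by a positive linear rescaling of either input, because the scale factor appears in both the covariance and the standard deviation and cancels.

>>> from math import isclose
>>> from statistics import correlation
>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
>>> isclose(correlation(x, y), correlation(x, [10 * yi + 7 for yi in y]))
True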
| 921 | + |
| 922 | +LinearRegression = namedtuple('LinearRegression', ['intercept', 'slope']) |
| 923 | + |
| 924 | + |
| 925 | +def linear_regression(regressor, dependent_variable, /): |
| 926 | + """Intercept and slope for simple linear regression |
| 927 | +
| 928 | + Return the intercept and slope of a simple linear regression, with |
| 929 | + parameters estimated using ordinary least squares. Simple linear |
| 930 | + regression describes the relationship between *regressor* and |
| 931 | + *dependent_variable* in terms of a linear function:: |
| 932 | +
| 933 | + dependent_variable = intercept + slope * regressor + noise |
| 934 | +
| 935 | + where ``intercept`` and ``slope`` are the regression parameters that are |
| 936 | + estimated, and the noise term is an unobserved random variable accounting |
| 937 | + for the variability of the data that is not explained by the linear |
| 938 | + regression (it is equal to the difference between the predicted and the |
| 939 | + actual values of the dependent variable). |
| 940 | +
| 941 | + The parameters are returned as a named tuple. |
| 942 | +
| 943 | + >>> regressor = [1, 2, 3, 4, 5] |
| 944 | + >>> noise = NormalDist().samples(5, seed=42) |
| 945 | + >>> dependent_variable = [2 + 3 * regressor[i] + noise[i] for i in range(5)] |
| 946 | + >>> linear_regression(regressor, dependent_variable) #doctest: +ELLIPSIS |
| 947 | + LinearRegression(intercept=1.75684970486..., slope=3.09078914170...) |
| 948 | +
| 949 | + """ |
| 950 | + n = len(regressor) |
| 951 | + if len(dependent_variable) != n: |
| 952 | + raise StatisticsError('linear regression requires that both inputs have the same number of data points') |
| 953 | + if n < 2: |
| 954 | + raise StatisticsError('linear regression requires at least two data points') |
| 955 | + try: |
| 956 | + slope = covariance(regressor, dependent_variable) / variance(regressor) |
| 957 | + except ZeroDivisionError: |
| 958 | + raise StatisticsError('regressor is constant') |
| 959 | + intercept = mean(dependent_variable) - slope * mean(regressor) |
| 960 | + return LinearRegression(intercept=intercept, slope=slope) |
| 961 | + |
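A short usage sketch (not part of the patch; it assumes this patch's field order, intercept first, and that the function is importable from statistics): the named tuple unpacks directly, and a prediction for a new regressor value is just intercept + slope * x_new. With the data from the module docstring, where the fit is LinearRegression(intercept=1.5, slope=0.1):

>>> from statistics import linear_regression
>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
>>> intercept, slope = linear_regression(x, y)
>>> intercept + slope * 10  # predicted value at regressor == 10
2.5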
| 962 | + |
829 | 963 | ## Normal Distribution ##################################################### |
830 | 964 |
831 | 965 |