diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py index ba62fbc..12fbe48 100644 --- a/data_hacks/histogram.py +++ b/data_hacks/histogram.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# +# # Copyright 2010 bit.ly # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -99,6 +99,70 @@ def test_median(): assert 4 == median([4,5,2,1,9,10]) # even-sized int list. (4+5)/2 = 4 assert "4.50" == "%.2f" % median([4.0,5,2,1,9,10]) #even-sized float list. (4.0+5)/2 = 4.5 +# +# Inverse normal, taken lovingly from: +# http://home.online.no/~pjacklam/notes/invnorm/impl/field/ltqnorm.txt +# +def ltqnorm( p ): + """ + Modified from the author's original perl code (original comments follow below) + by dfield@yahoo-inc.com. May 3, 2004. + + Lower tail quantile for standard normal distribution function. + + This function returns an approximation of the inverse cumulative + standard normal distribution function. I.e., given P, it returns + an approximation to the X satisfying P = Pr{Z <= X} where Z is a + random variable from the standard normal distribution. + + The algorithm uses a minimax approximation by rational functions + and the result has a relative error whose absolute value is less + than 1.15e-9. + + Author: Peter John Acklam + Time-stamp: 2000-07-19 18:26:14 + E-mail: pjacklam@online.no + WWW URL: http://home.online.no/~pjacklam + """ + + if p <= 0 or p >= 1: + # The original perl code exits here, we'll throw an exception instead + raise ValueError( "Argument to ltqnorm %f must be in open interval (0,1)" % p ) + + # Coefficients in rational approximations. + a = (-3.969683028665376e+01, 2.209460984245205e+02, \ + -2.759285104469687e+02, 1.383577518672690e+02, \ + -3.066479806614716e+01, 2.506628277459239e+00) + b = (-5.447609879822406e+01, 1.615858368580409e+02, \ + -1.556989798598866e+02, 6.680131188771972e+01, \ + -1.328068155288572e+01 ) + c = (-7.784894002430293e-03, -3.223964580411365e-01, \ + -2.400758277161838e+00, -2.549732539343734e+00, \ + 4.374664141464968e+00, 2.938163982698783e+00) + d = ( 7.784695709041462e-03, 3.224671290700398e-01, \ + 2.445134137142996e+00, 3.754408661907416e+00) + + # Define break-points. + plow = 0.02425 + phigh = 1 - plow + + # Rational approximation for lower region: + if p < plow: + q = math.sqrt(-2*math.log(p)) + return (((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5]) / \ + ((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1) + + # Rational approximation for upper region: + if phigh < p: + q = math.sqrt(-2*math.log(1-p)) + return -(((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5]) / \ + ((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1) + + # Rational approximation for central region: + q = p - 0.5 + r = q*q + return (((((a[0]*r+a[1])*r+a[2])*r+a[3])*r+a[4])*r+a[5])*q / \ + (((((b[0]*r+b[1])*r+b[2])*r+b[3])*r+b[4])*r+1) def histogram(stream, options): """ @@ -160,7 +224,19 @@ def histogram(stream, options): if skipped: print "# %d value%s outside of min/max" % (skipped, skipped > 1 and 's' or '') if options.mvsd: - print "# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd.mean(), mvsd.var(), mvsd.sd(), median(accepted_data)) + mean_string = "%f" % mvsd.mean() + if options.confidence and 0 < float(options.confidence) < 1: + # Convert to float and adjust so it is the correct value for a half + # interval. + confidence = (float(options.confidence) + 1.0)/2 + if len(accepted_data) >= 30: + interval_width = ltqnorm(confidence) + interval_width *= mvsd.sd() / math.sqrt(samples) + # TODO(awreece) Students t distribution if <30 samples. + interval_width = Decimal(interval_width) + mean_string = "%f (+/- %f)" % (mvsd.mean(), interval_width) + + print "# Mean = %s; Variance = %f; SD = %f; Median %f" % (mean_string, mvsd.var(), mvsd.sd(), median(accepted_data)) print "# each * represents a count of %d" % bucket_scale bucket_min = min_v bucket_max = min_v @@ -172,7 +248,6 @@ def histogram(stream, options): if bucket_count: star_count = bucket_count / bucket_scale print '%10.4f - %10.4f [%6d]: %s' % (bucket_min, bucket_max, bucket_count, '*' * star_count) - if __name__ == "__main__": parser = OptionParser() @@ -185,6 +260,8 @@ def histogram(stream, options): help="Number of buckets to use for the histogram") parser.add_option("--no-mvsd", dest="mvsd", action="store_false", default=True, help="Dissable the calculation of Mean, Vairance and SD. (improves performance)") + parser.add_option("-c", "--confidence", dest="confidence", + help="Confidence interval width (set to 0 or don't specify for no interval). Requires msvd.") (options, args) = parser.parse_args() if sys.stdin.isatty(): diff --git a/setup.py b/setup.py index ea37d5a..23631e7 100644 --- a/setup.py +++ b/setup.py @@ -20,5 +20,5 @@ 'data_hacks/ninety_five_percent.py', 'data_hacks/run_for.py', 'data_hacks/bar_chart.py', - 'data_hacks/sample.py'] - ) \ No newline at end of file + 'data_hacks/sample.py'], + )