(C++) Chi-squared goodness-of-fit to a normal distribution test
February 24, 2017 · View on GitHub
(C++) Chi-squared goodness-of-fit to a normal distribution test
Chi-squared goodness-of-fit to a normal distribution test is a Math code snippet to perform the same chi-squared goodness-of-fit (to a normal distribution) as [1].
#include <algorithm> #include <cassert> #include <cmath> #include <functional> #include <iterator> #include <iostream> #include <numeric> #include <ostream> #include <vector> #include <boost/math/distributions/chi_squared.hpp> //From http://www.richelbilderbeek.nl/CppGetCumulativeDensityNormal.htm //Adapted from http://www.ma.ic.ac.uk/~mdavis/course_material/MOP/CumNormDist.txt double GetCumulativeDensityNormal(const double x) { const double c0 = 0.2316419; const double c1 = 1.330274429; const double c2 = 1.821255978; const double c3 = 1.781477937; const double c4 = 0.356563782; const double c5 = 0.319381530; const double c6 = 0.398942280401; const double negative = (x < 0 ? 1.0 : 0.0); const double xPos = (x < 0.0 ? -x : x); const double k = 1.0 / ( 1.0 + (c0 * xPos)); const double y1 = (((((((c1*k-c2)*k)+c3)*k)-c4)*k)+c5)*k; const double y2 = 1.0 - (c6*std::exp(-0.5*xPos*xPos)*y1); return ((1.0-negative)*y2) + (negative*(1.0-y2)); } //From http://www.richelbilderbeek.nl/CppGetCumulativeDensityNormal.htm double GetCumulativeDensityNormal( const double x, const double mean, const double stddev) { return GetCumulativeDensityNormal( (x-mean) / stddev); } //From htpp://www.richelbilderbeek.nl/CppGetMean.htm double GetMean(const std::vector<double>& v) { return std::accumulate(v.begin(),v.end(),0.0) / static_cast<double>(v.size()); } template <class T> struct SquareAccumulator : public std::binary_function<T,T,T> { const T operator()(const T& sum, const T& x) const { return sum+(x*x); } }; //From http://www.richelbilderbeek.nl/CppGetStdDev.htm double GetStdDev(const std::vector<double>& v) { assert(v.size() > 1 && "Can only calculate standard deviations from " "data sets with size 2 or larger"); const double sum_x = std::accumulate(v.begin(),v.end(),0.0); const double sum_x_squared = std::accumulate(v.begin(),v.end(), 0.0,SquareAccumulator<double>()); const double sz = static_cast<double>(v.size()); return std::sqrt(((sz*sum_x_squared)-(sum_x*sum_x)) / (sz*(sz-1.0))); } //Data from: // * David Heath. An introduction to experimental // design and statistics for biology. 1995. // ISBN: 1-85728-132-2 PB. Page 149, box 6.9: // Chi-squared goodness-of-fit test to a normal // distribution: body lengths of the // isopod (Sphaeroma Rugicauda) //Isopod sizes in millimetres const std::vector<double> GetIsopodSizes() { std::vector<double> v; for (int i=0; i!= 5; ++i) v.push_back(3.65); for (int i=0; i!= 9; ++i) v.push_back(3.85); for (int i=0; i!=20; ++i) v.push_back(4.05); for (int i=0; i!=30; ++i) v.push_back(4.25); for (int i=0; i!=29; ++i) v.push_back(4.45); for (int i=0; i!=44; ++i) v.push_back(4.65); for (int i=0; i!=44; ++i) v.push_back(4.85); for (int i=0; i!=69; ++i) v.push_back(5.05); for (int i=0; i!=50; ++i) v.push_back(5.25); for (int i=0; i!=50; ++i) v.push_back(5.45); for (int i=0; i!=29; ++i) v.push_back(5.65); for (int i=0; i!=14; ++i) v.push_back(5.85); for (int i=0; i!= 4; ++i) v.push_back(6.05); for (int i=0; i!= 1; ++i) v.push_back(6.25); for (int i=0; i!= 2; ++i) v.push_back(6.45); //For fun, let's shuffle these values std::random_shuffle(v.begin(),v.end()); return v; } const std::vector<std::pair<double,double> > GetRanges( const std::size_t n_categories) { std::vector<std::pair<double,double> > v; double lower_limit = 3.45; double range_bandwidth = 0.2; for (std::size_t i=0; i!=n_categories; ++i) { v.push_back(std::make_pair( lower_limit,lower_limit+range_bandwidth)); lower_limit+=range_bandwidth; } return v; } const std::vector<int> Tally( const std::vector<double>& values, const std::vector<std::pair<double,double> >& ranges) { std::vector<int> tally(ranges.size()); const std::size_t n_values = values.size(); const std::size_t n_ranges = ranges.size(); for (std::size_t value_index = 0; value_index!=n_values; ++value_index) { assert(value_index < values.size()); const double value = values[value_index]; for (std::size_t range_index = 0; range_index!=n_ranges; ++range_index) { const std::pair<double,double> &range = ranges[range_index]; if ( range.first < value && value < range.second) { ++tally[range_index]; break; } } } assert( std::accumulate(tally.begin(),tally.end(),0) == static_cast<int>(values.size()) && "Assume all values can be tallied in existing ranges"); return tally; } const std::vector<double> GetTallyExpected( const std::vector<std::pair<double,double> >& ranges, const double mean, const double stdDev, const std::size_t n) { const std::size_t sz = ranges.size(); std::vector<double> v(sz,0.0); for (std::size_t i = 0; i!=sz; ++i) { const double x_left = ranges[i].first; const double x_right = ranges[i].second; assert(x_left < x_right); const double surface_left = GetCumulativeDensityNormal(x_left ,mean,stdDev) * static_cast<double>(n); const double surface_right = GetCumulativeDensityNormal(x_right,mean,stdDev) * static_cast<double>(n); assert(surface_left < surface_right); v[i] = surface_right - surface_left; } return v; } const std::vector<double> CalculateRelativeError( const std::vector<int>& tally_measured, const std::vector<double>& tally_expected) { assert(tally_measured.size() == tally_expected.size()); const std::size_t sz = tally_measured.size(); std::vector<double> v(sz); for (std::size_t i = 0; i!=sz; ++i) { const double obs = static_cast<double>(tally_measured[i]); const double exp = tally_expected[i]; assert(exp!=0.0); v[i] = ((obs-exp)*(obs-exp))/exp; } return v; } int main() { const std::size_t n_categories = 15; const double degrees_of_freedom = static_cast<double>(n_categories) - 1.0 //We need to calculate the mean ourselves - 1.0 //We need to calculate the standard deviation ourselves - 1.0; //We need to calculate the sample size ourselves const std::vector<double> isopod_sizes = GetIsopodSizes(); const double mean = GetMean(isopod_sizes); const double stdDev = GetStdDev(isopod_sizes); const std::vector<std::pair<double,double> > ranges = GetRanges(n_categories); assert(n_categories == ranges.size()); const std::vector<int> tally_measured = Tally(isopod_sizes,ranges); const std::vector<double> tally_expected = GetTallyExpected(ranges,mean,stdDev,isopod_sizes.size()); const std::vector<double> rel_error = CalculateRelativeError(tally_measured,tally_expected); for (std::size_t i=0; i!=n_categories; ++i) { std::cout << ranges[i].first << "-" << ranges[i].second << ":\t" << tally_measured[i] << "\t" << tally_expected[i] << "\t" << rel_error[i] << "\n"; } const double significance_level = 0.05; const double chi_squared_value = std::accumulate(rel_error.begin(),rel_error.end(),0.0); boost::math::chi_squared_distribution<double> distribution(degrees_of_freedom); const double critical_value = boost::math::quantile(boost::math::complement(distribution, significance_level)); std::cout << "Mean size: " << mean << "\nStdDev size: " << stdDev << "\nSUM observer: " << std::accumulate(tally_measured.begin(),tally_measured.end(), 0) << "\nSUM expected: " << std::accumulate(tally_expected.begin(),tally_expected.end(),0.0) << "\nChi-square value: " << chi_squared_value << "\nSignificance level: " << significance_level << "\nDegrees of freedom: " << degrees_of_freedom << "\nCritical value: " << critical_value << '\n'; if (chi_squared_value < critical_value) { std::cout << "Cannot reject null hypothesis that the measured values " "do follow a normal distribution" << std::endl; } else { std::cout << "Reject null hypothesis that the measured values " "do follow a normal distribution" << std::endl; } }
Screen output
Starting /MyFolder/MyProject... 3.45-3.65: 5 2.08394 4.08045 3.65-3.85: 9 5.04813 3.09368 3.85-4.05: 20 10.6748 8.14617 4.05-4.25: 30 19.7052 5.37843 4.25-4.45: 29 31.7539 0.238841 4.45-4.65: 44 44.6702 0.0100565 4.65-4.85: 44 54.8586 2.14932 4.85-5.05: 69 58.8136 1.76427 5.05-5.25: 50 55.0452 0.462415 5.25-5.45: 50 44.9747 0.561515 5.45-5.65: 29 32.0791 0.295544 5.65-5.85: 14 19.9747 1.78711 5.85-6.05: 4 10.8576 4.33126 6.05-6.25: 1 5.15205 3.34615 6.25-6.45: 2 2.13408 0.00842393 Mean size: 4.9525 StdDev size: 0.539557 SUM observer: 400 SUM expected: 397.826 Chi-square value: 35.6536 Significance level: 0.05 Degrees of freedom: 12 Critical value: 21.0261 Reject null hypothesis that the measured values do follow a normal distribution /MyFolder/MyProject exited with code 0
Note that I draw a different conclusion than [1]. This is probably due that I did not have the original body sizes, but recreated these from a tally.
References
- David Heath. An introduction to experimental design and statistics for biology. 1995. ISBN: 1-85728-132-2 PB. Page 149, box 6.9: Chi-squared goodness-of-fit test to a normal distribution: body lengths of the isopod (Sphaeroma Rugicauda)