(C++) Chi-squared goodness-of-fit to a normal distribution test

February 24, 2017 · View on GitHub

 

 

 

 

 

(C++) Chi-squared goodness-of-fit to a normal distribution test

 

Chi-squared goodness-of-fit to a normal distribution test is a Math code snippet to perform the same chi-squared goodness-of-fit (to a normal distribution) as [1].

 


#include <algorithm> #include <cassert> #include <cmath> #include <functional> #include <iterator> #include <iostream> #include <numeric> #include <ostream> #include <vector> #include <boost/math/distributions/chi_squared.hpp> //From http://www.richelbilderbeek.nl/CppGetCumulativeDensityNormal.htm //Adapted from http://www.ma.ic.ac.uk/~mdavis/course_material/MOP/CumNormDist.txt double GetCumulativeDensityNormal(const double x) {   const double c0 = 0.2316419;   const double c1 = 1.330274429;   const double c2 = 1.821255978;   const double c3 = 1.781477937;   const double c4 = 0.356563782;   const double c5 = 0.319381530;   const double c6 = 0.398942280401;   const double negative = (x < 0 ? 1.0 : 0.0);   const double xPos = (x < 0.0 ? -x : x);   const double k = 1.0 / ( 1.0 + (c0 * xPos));   const double y1 = (((((((c1*k-c2)*k)+c3)*k)-c4)*k)+c5)*k;   const double y2 = 1.0 - (c6*std::exp(-0.5*xPos*xPos)*y1);   return ((1.0-negative)*y2) + (negative*(1.0-y2)); } //From http://www.richelbilderbeek.nl/CppGetCumulativeDensityNormal.htm double GetCumulativeDensityNormal(   const double x,   const double mean,   const double stddev) {   return GetCumulativeDensityNormal(     (x-mean) / stddev); } //From htpp://www.richelbilderbeek.nl/CppGetMean.htm double GetMean(const std::vector<double>& v) {   return std::accumulate(v.begin(),v.end(),0.0) / static_cast<double>(v.size()); } template <class T> struct SquareAccumulator   : public std::binary_function<T,T,T> {   const T operator()(const T& sum, const T& x) const     { return sum+(x*x); } }; //From http://www.richelbilderbeek.nl/CppGetStdDev.htm double GetStdDev(const std::vector<double>& v) {   assert(v.size() > 1     && "Can only calculate standard deviations from "        "data sets with size 2 or larger");   const double sum_x     = std::accumulate(v.begin(),v.end(),0.0);   const double sum_x_squared     = std::accumulate(v.begin(),v.end(),         0.0,SquareAccumulator<double>());   const double sz = static_cast<double>(v.size());   return std::sqrt(((sz*sum_x_squared)-(sum_x*sum_x))     / (sz*(sz-1.0))); } //Data from: // * David Heath. An introduction to experimental // design and statistics for biology. 1995. // ISBN: 1-85728-132-2 PB. Page 149, box 6.9: // Chi-squared goodness-of-fit test to a normal // distribution: body lengths of the // isopod (Sphaeroma Rugicauda) //Isopod sizes in millimetres const std::vector<double> GetIsopodSizes() {   std::vector<double> v;   for (int i=0; i!= 5; ++i) v.push_back(3.65);   for (int i=0; i!= 9; ++i) v.push_back(3.85);   for (int i=0; i!=20; ++i) v.push_back(4.05);   for (int i=0; i!=30; ++i) v.push_back(4.25);   for (int i=0; i!=29; ++i) v.push_back(4.45);   for (int i=0; i!=44; ++i) v.push_back(4.65);   for (int i=0; i!=44; ++i) v.push_back(4.85);   for (int i=0; i!=69; ++i) v.push_back(5.05);   for (int i=0; i!=50; ++i) v.push_back(5.25);   for (int i=0; i!=50; ++i) v.push_back(5.45);   for (int i=0; i!=29; ++i) v.push_back(5.65);   for (int i=0; i!=14; ++i) v.push_back(5.85);   for (int i=0; i!= 4; ++i) v.push_back(6.05);   for (int i=0; i!= 1; ++i) v.push_back(6.25);   for (int i=0; i!= 2; ++i) v.push_back(6.45);   //For fun, let's shuffle these values   std::random_shuffle(v.begin(),v.end());   return v; } const std::vector<std::pair<double,double> > GetRanges(   const std::size_t n_categories) {   std::vector<std::pair<double,double> > v;   double lower_limit = 3.45;   double range_bandwidth = 0.2;   for (std::size_t i=0; i!=n_categories; ++i)   {     v.push_back(std::make_pair(       lower_limit,lower_limit+range_bandwidth));     lower_limit+=range_bandwidth;   }   return v; } const std::vector<int> Tally(   const std::vector<double>& values,   const std::vector<std::pair<double,double> >& ranges) {   std::vector<int> tally(ranges.size());   const std::size_t n_values = values.size();   const std::size_t n_ranges = ranges.size();   for (std::size_t value_index = 0; value_index!=n_values; ++value_index)   {     assert(value_index < values.size());     const double value = values[value_index];     for (std::size_t range_index = 0; range_index!=n_ranges; ++range_index)     {       const std::pair<double,double> &range = ranges[range_index];       if ( range.first < value && value < range.second)       {         ++tally[range_index];         break;       }     }   }   assert( std::accumulate(tally.begin(),tally.end(),0)     == static_cast<int>(values.size())     && "Assume all values can be tallied in existing ranges");   return tally; } const std::vector<double> GetTallyExpected(   const std::vector<std::pair<double,double> >& ranges,   const double mean,   const double stdDev,   const std::size_t n) {   const std::size_t sz = ranges.size();   std::vector<double> v(sz,0.0);   for (std::size_t i = 0; i!=sz; ++i)   {     const double x_left = ranges[i].first;     const double x_right = ranges[i].second;     assert(x_left < x_right);     const double surface_left       = GetCumulativeDensityNormal(x_left ,mean,stdDev)       * static_cast<double>(n);     const double surface_right       = GetCumulativeDensityNormal(x_right,mean,stdDev)       * static_cast<double>(n);     assert(surface_left < surface_right);     v[i] = surface_right - surface_left;   }   return v; } const std::vector<double> CalculateRelativeError(   const std::vector<int>& tally_measured,   const std::vector<double>& tally_expected) {   assert(tally_measured.size() == tally_expected.size());   const std::size_t sz = tally_measured.size();   std::vector<double> v(sz);   for (std::size_t i = 0; i!=sz; ++i)   {     const double obs = static_cast<double>(tally_measured[i]);     const double exp = tally_expected[i];     assert(exp!=0.0);     v[i] = ((obs-exp)*(obs-exp))/exp;   }   return v; } int main() {   const std::size_t n_categories = 15;   const double degrees_of_freedom     = static_cast<double>(n_categories)     - 1.0 //We need to calculate the mean ourselves     - 1.0 //We need to calculate the standard deviation ourselves     - 1.0; //We need to calculate the sample size ourselves   const std::vector<double> isopod_sizes = GetIsopodSizes();   const double mean = GetMean(isopod_sizes);   const double stdDev = GetStdDev(isopod_sizes);   const std::vector<std::pair<double,double> > ranges = GetRanges(n_categories);   assert(n_categories == ranges.size());   const std::vector<int> tally_measured = Tally(isopod_sizes,ranges);   const std::vector<double> tally_expected     = GetTallyExpected(ranges,mean,stdDev,isopod_sizes.size());   const std::vector<double> rel_error     = CalculateRelativeError(tally_measured,tally_expected);   for (std::size_t i=0; i!=n_categories; ++i)   {     std::cout       << ranges[i].first << "-" << ranges[i].second << ":\t"       << tally_measured[i] << "\t"       << tally_expected[i] << "\t"       << rel_error[i] << "\n";   }   const double significance_level = 0.05;   const double chi_squared_value     = std::accumulate(rel_error.begin(),rel_error.end(),0.0);   boost::math::chi_squared_distribution<double> distribution(degrees_of_freedom);   const double critical_value     = boost::math::quantile(boost::math::complement(distribution, significance_level));   std::cout     << "Mean size: " << mean     << "\nStdDev size: " << stdDev     << "\nSUM observer: "       << std::accumulate(tally_measured.begin(),tally_measured.end(), 0)     << "\nSUM expected: "       << std::accumulate(tally_expected.begin(),tally_expected.end(),0.0)     << "\nChi-square value: " << chi_squared_value     << "\nSignificance level: " << significance_level     << "\nDegrees of freedom: " << degrees_of_freedom     << "\nCritical value: " << critical_value << '\n';   if (chi_squared_value < critical_value)   {     std::cout       << "Cannot reject null hypothesis that the measured values "          "do follow a normal distribution" << std::endl;   }   else   {     std::cout       << "Reject null hypothesis that the measured values "          "do follow a normal distribution" << std::endl;   } }

 

Screen output

 


Starting /MyFolder/MyProject... 3.45-3.65: 5 2.08394 4.08045 3.65-3.85: 9 5.04813 3.09368 3.85-4.05: 20 10.6748 8.14617 4.05-4.25: 30 19.7052 5.37843 4.25-4.45: 29 31.7539 0.238841 4.45-4.65: 44 44.6702 0.0100565 4.65-4.85: 44 54.8586 2.14932 4.85-5.05: 69 58.8136 1.76427 5.05-5.25: 50 55.0452 0.462415 5.25-5.45: 50 44.9747 0.561515 5.45-5.65: 29 32.0791 0.295544 5.65-5.85: 14 19.9747 1.78711 5.85-6.05: 4 10.8576 4.33126 6.05-6.25: 1 5.15205 3.34615 6.25-6.45: 2 2.13408 0.00842393 Mean size: 4.9525 StdDev size: 0.539557 SUM observer: 400 SUM expected: 397.826 Chi-square value: 35.6536 Significance level: 0.05 Degrees of freedom: 12 Critical value: 21.0261 Reject null hypothesis that the measured values do follow a normal distribution /MyFolder/MyProject exited with code 0

 

Note that I draw a different conclusion than [1]. This is probably due that I did not have the original body sizes, but recreated these from a tally.

 

 

 

 

 

References

 

  1. David Heath. An introduction to experimental design and statistics for biology. 1995. ISBN: 1-85728-132-2 PB. Page 149, box 6.9: Chi-squared goodness-of-fit test to a normal distribution: body lengths of the isopod (Sphaeroma Rugicauda)