- Looking at all the reading scores and math scores in each table, students tend to do better in reading than in math.
- Looking at the top performing schools, bottom performing schools, and passing rates by school type, students have a much higher passing rate in charter schools (~95%) than in district schools (~74%).
- Looking at the summary of performance by spending ranges per student, it seems that students have a higher passing rate in the lower spending ranges (<$615).
# Import dependencies
import pandas as pd
import numpy as np
# Create paths to csv files
school_path = "raw_data/schools_complete.csv"
student_path = "raw_data/students_complete.csv"
# Read the csv files
schools = pd.read_csv(school_path)
schools = schools.rename(columns={"name": "school"}) # so we can merge later
students = pd.read_csv(student_path)
students = pd.DataFrame(students)
# Calculate total schools, students, and budget of district
total_schools = schools['School ID'].count()
total_students = schools['size'].sum()
total_budget = schools['budget'].sum()
# Calculate average math and reading scores
avg_math = students['math_score'].mean()
avg_read = students['reading_score'].mean()
# Calculate percent passing rates
pass_math = students.loc[students['math_score'] >= 70]
per_math = len(pass_math)/ total_students * 100
pass_read = students.loc[students['reading_score'] >= 70]
per_read = len(pass_read)/ total_students * 100
overall = (per_math + per_read)/2
# Create dataframe for district summary
district_summary = pd.DataFrame({"Total Schools": total_schools,
"Total Students": total_students,
"Total Budget": total_budget,
"Average Math Score": avg_math,
"Average Reading Score": avg_read,
"% Passing Math": per_math,
"% Passing Reading": per_read,
"% Overall Passing Rate": [overall]})
#Reorganize columns with double brackets
district_summary = district_summary[["Total Schools", "Total Students", "Total Budget",
"Average Math Score", "Average Reading Score",
"% Passing Math", "% Passing Reading", "% Overall Passing Rate"]]
# Format to "\$0,000.00"
district_summary['Total Budget'] = district_summary['Total Budget'].map('${:,.2f}'.format)
district_summary
|
Total Schools |
Total Students |
Total Budget |
Average Math Score |
Average Reading Score |
% Passing Math |
% Passing Reading |
% Overall Passing Rate |
| 0 |
15 |
39170 |
\$24,649,428.00 |
78.985371 |
81.87784 |
74.980853 |
85.805463 |
80.393158 |
# Merge schools and students
merge_df = pd.merge(schools, students, on="school")
merge_df = merge_df.drop(['School ID', 'Student ID'], axis=1)
# Set school as index, get type, total students, total budget and per student budget
school_index = schools.set_index('school')
school_type = school_index['type']
school_students = school_index['size']
school_budget = school_index['budget']
psb = school_budget/school_students
# Set school as index and groupby school for merge_df to get average scores by school
index = merge_df.set_index('school')
grouped = index.groupby(['school'])
# Get average math and reading score
school_avg_math = grouped['math_score'].mean()
school_avg_read = grouped['reading_score'].mean()
# Calculate percent that passed math
total_stu = grouped['name'].count()
grouped_math = pass_math.groupby('school')
school_pass_math = grouped_math['name'].count()/total_stu*100
# Calculate percent that passed reading
grouped_reading = pass_read.groupby('school')
school_pass_read = grouped_reading['name'].count()/total_stu*100
# Calculate overall passing rate
overall_pass = (school_pass_math + school_pass_read)/2
# Create dataframe for school summary
school_summary = pd.DataFrame({"School Type": school_type,
"Total Students": school_students,
"Total School Budget": school_budget,
"Per Student Budget": psb,
"Average Math Score": school_avg_math,
"Average Reading Score": school_avg_read,
"% Passing Math": school_pass_math,
"% Passing Reading": school_pass_read,
"Overall Passing Rate": overall_pass})
# Reorganize school summary columns
school_summary = school_summary[["School Type", "Total Students", "Total School Budget",
"Per Student Budget", "Average Math Score", "Average Reading Score",
"% Passing Math", "% Passing Reading", "Overall Passing Rate"]]
# Format to "\$0,000.00"
school_summary['Total School Budget'] = school_summary['Total School Budget'].map('${:,.2f}'.format)
school_summary['Per Student Budget'] = school_summary['Per Student Budget'].map('${:,.2f}'.format)
school_summary.head()
|
School Type |
Total Students |
Total School Budget |
Per Student Budget |
Average Math Score |
Average Reading Score |
% Passing Math |
% Passing Reading |
Overall Passing Rate |
| Bailey High School |
District |
4976 |
\$3,124,928.00 |
\$628.00 |
77.048432 |
81.033963 |
66.680064 |
81.933280 |
74.306672 |
| Cabrera High School |
Charter |
1858 |
\$1,081,356.00 |
\$582.00 |
83.061895 |
83.975780 |
94.133477 |
97.039828 |
95.586652 |
| Figueroa High School |
District |
2949 |
\$1,884,411.00 |
\$639.00 |
76.711767 |
81.158020 |
65.988471 |
80.739234 |
73.363852 |
| Ford High School |
District |
2739 |
\$1,763,916.00 |
\$644.00 |
77.102592 |
80.746258 |
68.309602 |
79.299014 |
73.804308 |
| Griffin High School |
Charter |
1468 |
\$917,500.00 |
\$625.00 |
83.351499 |
83.816757 |
93.392371 |
97.138965 |
95.265668 |
# Create top performing schools summary by passing rate
top_summary = school_summary.loc[school_summary['Overall Passing Rate'] > 90]
top_summary.sort_values(['Overall Passing Rate'], ascending=False).head()
|
School Type |
Total Students |
Total School Budget |
Per Student Budget |
Average Math Score |
Average Reading Score |
% Passing Math |
% Passing Reading |
Overall Passing Rate |
| Cabrera High School |
Charter |
1858 |
\$1,081,356.00 |
\$582.00 |
83.061895 |
83.975780 |
94.133477 |
97.039828 |
95.586652 |
| Thomas High School |
Charter |
1635 |
\$1,043,130.00 |
\$638.00 |
83.418349 |
83.848930 |
93.272171 |
97.308869 |
95.290520 |
| Pena High School |
Charter |
962 |
\$585,858.00 |
\$609.00 |
83.839917 |
84.044699 |
94.594595 |
95.945946 |
95.270270 |
| Griffin High School |
Charter |
1468 |
\$917,500.00 |
\$625.00 |
83.351499 |
83.816757 |
93.392371 |
97.138965 |
95.265668 |
| Wilson High School |
Charter |
2283 |
\$1,319,574.00 |
\$578.00 |
83.274201 |
83.989488 |
93.867718 |
96.539641 |
95.203679 |
# Create bottom performing schools summary by passing rate
bottom_summary = school_summary.loc[school_summary['Overall Passing Rate'] < 75]
bottom_summary.sort_values(['Overall Passing Rate'], ascending=True).head()
|
School Type |
Total Students |
Total School Budget |
Per Student Budget |
Average Math Score |
Average Reading Score |
% Passing Math |
% Passing Reading |
Overall Passing Rate |
| Rodriguez High School |
District |
3999 |
\$2,547,363.00 |
\$637.00 |
76.842711 |
80.744686 |
66.366592 |
80.220055 |
73.293323 |
| Figueroa High School |
District |
2949 |
\$1,884,411.00 |
\$639.00 |
76.711767 |
81.158020 |
65.988471 |
80.739234 |
73.363852 |
| Huang High School |
District |
2917 |
\$1,910,635.00 |
\$655.00 |
76.629414 |
81.182722 |
65.683922 |
81.316421 |
73.500171 |
| Johnson High School |
District |
4761 |
\$3,094,650.00 |
\$650.00 |
77.072464 |
80.966394 |
66.057551 |
81.222432 |
73.639992 |
| Ford High School |
District |
2739 |
\$1,763,916.00 |
\$644.00 |
77.102592 |
80.746258 |
68.309602 |
79.299014 |
73.804308 |
# Math scores by grade
ninth = students.loc[students['grade']=='9th'].groupby("school")
ninth_math = ninth['math_score'].mean()
tenth = students.loc[students['grade']=='10th'].groupby("school")
tenth_math = tenth['math_score'].mean()
eleventh = students.loc[students['grade']=='11th'].groupby("school")
eleventh_math = eleventh['math_score'].mean()
twelfth = students.loc[students['grade']=='12th'].groupby("school")
twelfth_math = twelfth['math_score'].mean()
# Create dataframe for math scores summary
math_summary = pd.DataFrame({"9th": ninth_math,
"10th": tenth_math,
"11th": eleventh_math,
"12th": twelfth_math})
math_summary = math_summary[["9th","10th","11th","12th"]]
del math_summary.index.name
math_summary.head()
|
9th |
10th |
11th |
12th |
| Bailey High School |
77.083676 |
76.996772 |
77.515588 |
76.492218 |
| Cabrera High School |
83.094697 |
83.154506 |
82.765560 |
83.277487 |
| Figueroa High School |
76.403037 |
76.539974 |
76.884344 |
77.151369 |
| Ford High School |
77.361345 |
77.672316 |
76.918058 |
76.179963 |
| Griffin High School |
82.044010 |
84.229064 |
83.842105 |
83.356164 |
# Reading scores by grade
r_ninth = students.loc[students['grade'] == '9th'].groupby("school")
ninth_read = r_ninth['reading_score'].mean()
r_tenth = students.loc[students['grade'] == '10th'].groupby("school")
tenth_read = r_tenth['reading_score'].mean()
r_eleventh = students.loc[students['grade'] == '11th'].groupby("school")
eleventh_read = r_eleventh['reading_score'].mean()
r_twelfth = students.loc[students['grade'] == '12th'].groupby("school")
twelfth_read = r_twelfth['reading_score'].mean()
# Create dataframe for reading scores summary
read_summary = pd.DataFrame({"9th": ninth_read,
"10th": tenth_read,
"11th": eleventh_read,
"12th": twelfth_read})
read_summary = read_summary[["9th","10th","11th","12th"]]
del read_summary.index.name
read_summary.head()
|
9th |
10th |
11th |
12th |
| Bailey High School |
81.303155 |
80.907183 |
80.945643 |
80.912451 |
| Cabrera High School |
83.676136 |
84.253219 |
83.788382 |
84.287958 |
| Figueroa High School |
81.198598 |
81.408912 |
80.640339 |
81.384863 |
| Ford High School |
80.632653 |
81.262712 |
80.403642 |
80.662338 |
| Griffin High School |
83.369193 |
83.706897 |
84.288089 |
84.013699 |
# Create bins
bins = [0,585,615,645,675]
# Create names for the bins
spending_range = ['<\$585','\$585-615','\$615-645','\$645-675']
# Change formatting of per student budget in school_summary from string back to float so it can be binned
school_summary['Per Student Budget'] = school_summary['Per Student Budget'].str.replace('$', '')
school_summary['Per Student Budget'] = school_summary['Per Student Budget'].astype(float)
school_summary["Spending Ranges (Per Student)"] = pd.cut(school_summary["Per Student Budget"],
bins, labels=spending_range)
spend_summary = school_summary.groupby("Spending Ranges (Per Student)")
spend_summary = spend_summary[["Average Math Score", "Average Reading Score", "% Passing Math",
"% Passing Reading", "Overall Passing Rate"]]
spend_summary.mean()
|
Average Math Score |
Average Reading Score |
% Passing Math |
% Passing Reading |
Overall Passing Rate |
| Spending Ranges (Per Student) |
|
|
|
|
|
| <\$585 |
83.455399 |
83.933814 |
93.460096 |
96.610877 |
95.035486 |
| \$585-615 |
83.599686 |
83.885211 |
94.230858 |
95.900287 |
95.065572 |
| \$615-645 |
79.079225 |
81.891436 |
75.668212 |
86.106569 |
80.887391 |
| \$645-675 |
76.997210 |
81.027843 |
66.164813 |
81.133951 |
73.649382 |
# Create bins
bins2 = [0, 1000, 2000, 5000]
# Create names for bins
size_range = ['Small', 'Medium', 'Large']
school_summary["School Size"] = pd.cut(school_summary["Total Students"],
bins2, labels=size_range)
size_summary = school_summary.groupby("School Size")
size_summary = size_summary[["Average Math Score", "Average Reading Score", "% Passing Math",
"% Passing Reading", "Overall Passing Rate"]]
size_summary.mean()
|
Average Math Score |
Average Reading Score |
% Passing Math |
% Passing Reading |
Overall Passing Rate |
| School Size |
|
|
|
|
|
| Small |
83.821598 |
83.929843 |
93.550225 |
96.099437 |
94.824831 |
| Medium |
83.374684 |
83.864438 |
93.599695 |
96.790680 |
95.195187 |
| Large |
77.746417 |
81.344493 |
69.963361 |
82.766634 |
76.364998 |
type_summary = school_summary.groupby("School Type")
type_summary = type_summary[["Average Math Score", "Average Reading Score", "% Passing Math",
"% Passing Reading", "Overall Passing Rate"]]
type_summary.mean()
|
Average Math Score |
Average Reading Score |
% Passing Math |
% Passing Reading |
Overall Passing Rate |
| School Type |
|
|
|
|
|
| Charter |
83.473852 |
83.896421 |
93.620830 |
96.586489 |
95.103660 |
| District |
76.956733 |
80.966636 |
66.548453 |
80.799062 |
73.673757 |