05-plotting-summary-statistics.py 4.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
"""Calculate student grades by combining data from many sources.

Using Pandas, this script combines data from the:

* Roster
* Homework & Exam grades
* Quiz grades

to calculate final grades for a class.
"""
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
15
import scipy.stats
16 17 18 19

HERE = Path(__file__).parent
DATA_FOLDER = HERE / "data"

20 21 22 23
# ----------------------
# 01 - LOADING THE DATA
# ----------------------

24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
roster = pd.read_csv(
    DATA_FOLDER / "roster.csv",
    converters={"NetID": str.lower, "Email Address": str.lower},
    usecols=["Section", "Email Address", "NetID"],
    index_col="NetID",
)

hw_exam_grades = pd.read_csv(
    DATA_FOLDER / "hw_exam_grades.csv",
    converters={"SID": str.lower, "Email Address": str.lower},
    usecols=lambda x: "Submission" not in x,
    index_col="SID",
)

quiz_grades = pd.DataFrame()
39 40
for file_path in DATA_FOLDER.glob("quiz_*_grades.csv"):
    quiz_name = " ".join(file_path.stem.title().split("_")[:2])
41
    quiz = pd.read_csv(
42
        file_path,
43 44 45 46 47 48
        converters={"Email": str.lower},
        index_col=["Email"],
        usecols=["Email", "Grade"],
    ).rename(columns={"Grade": quiz_name})
    quiz_grades = pd.concat([quiz_grades, quiz], axis=1)

49 50 51 52
# ------------------------
# 02 - MERGING DATAFRAMES
# ------------------------

53 54 55 56 57 58 59 60
final_data = pd.merge(
    roster, hw_exam_grades, left_index=True, right_index=True,
)
final_data = pd.merge(
    final_data, quiz_grades, left_on="Email Address", right_index=True
)
final_data = final_data.fillna(0)

61 62 63 64
# ------------------------
# 03 - CALCULATING GRADES
# ------------------------

65 66 67 68 69 70 71 72 73 74 75 76 77 78
n_exams = 3
for n in range(1, n_exams + 1):
    final_data[f"Exam {n} Score"] = (
        final_data[f"Exam {n}"] / final_data[f"Exam {n} - Max Points"]
    )

homework_scores = final_data.filter(regex=r"^Homework \d\d?$", axis=1)
homework_max_points = final_data.filter(regex=r"^Homework \d\d? -", axis=1)

sum_of_hw_scores = homework_scores.sum(axis=1)
sum_of_hw_max = homework_max_points.sum(axis=1)
final_data["Total Homework"] = sum_of_hw_scores / sum_of_hw_max

hw_max_renamed = homework_max_points.set_axis(homework_scores.columns, axis=1)
79 80
average_hw_scores = (homework_scores / hw_max_renamed).sum(axis=1)
final_data["Average Homework"] = average_hw_scores / homework_scores.shape[1]
81 82

final_data["Homework Score"] = final_data[
83
    ["Total Homework", "Average Homework"]
84 85 86 87 88 89 90 91 92 93 94
].max(axis=1)

quiz_scores = final_data.filter(regex=r"^Quiz \d$", axis=1)
quiz_max_points = pd.Series(
    {"Quiz 1": 11, "Quiz 2": 15, "Quiz 3": 17, "Quiz 4": 14, "Quiz 5": 12}
)

sum_of_quiz_scores = quiz_scores.sum(axis=1)
sum_of_quiz_max = quiz_max_points.sum()
final_data["Total Quizzes"] = sum_of_hw_scores / sum_of_hw_max

95 96
average_quiz_scores = (quiz_scores / quiz_max_points).sum(axis=1)
final_data["Average Quizzes"] = average_quiz_scores / quiz_scores.shape[1]
97 98

final_data["Quiz Score"] = final_data[
99
    ["Total Quizzes", "Average Quizzes"]
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
].max(axis=1)

weightings = pd.Series(
    {
        "Exam 1 Score": 0.05,
        "Exam 2 Score": 0.1,
        "Exam 3 Score": 0.15,
        "Quiz Score": 0.30,
        "Homework Score": 0.4,
    }
)

final_data["Final Score"] = (final_data[weightings.index] * weightings).sum(
    axis=1
)
final_data["Ceiling Score"] = np.ceil(final_data["Final Score"] * 100)

grades = {
    90: "A",
    80: "B",
    70: "C",
    60: "D",
    0: "F",
}


def grade_mapping(value):
127
    """Map numerical grade to letter grade."""
128 129 130 131 132 133 134 135 136 137
    for key, letter in grades.items():
        if value >= key:
            return letter


letter_grades = final_data["Ceiling Score"].map(grade_mapping)
final_data["Final Grade"] = pd.Categorical(
    letter_grades, categories=grades.values(), ordered=True
)

138 139 140 141
# -----------------------
# 04 - GROUPING THE DATA
# -----------------------

142
for section, table in final_data.groupby("Section"):
143 144 145 146 147
    section_file = DATA_FOLDER / f"Section {section} Grades.csv"
    num_students = table.shape[0]
    print(
        f"In Section {section} there are {num_students} students saved to "
        f"file {section_file}."
148
    )
149 150 151 152 153
    table.sort_values(by=["Last Name", "First Name"]).to_csv(section_file)

# ---------------------------------
# 05 - PLOTTING SUMMARY STATISTICS
# ---------------------------------
154 155 156 157 158 159 160 161 162 163 164 165 166

grade_counts = final_data["Final Grade"].value_counts().sort_index()
grade_counts.plot.bar()
plt.show()

final_data["Final Score"].plot.hist(bins=20, label="Histogram")
final_data["Final Score"].plot.density(
    linewidth=4, label="Kernel Density Estimate"
)

final_mean = final_data["Final Score"].mean()
final_std = final_data["Final Score"].std()
x = np.linspace(final_mean - 5 * final_std, final_mean + 5 * final_std, 200)
167
normal_dist = scipy.stats.norm.pdf(x, loc=final_mean, scale=final_std)
168 169 170
plt.plot(x, normal_dist, label="Normal Distribution", linewidth=4)
plt.legend()
plt.show()