import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset from the local dataset folder.
csv_path = "./dataset/en_lpor_explorer.csv"
data = pd.read_csv(csv_path)

# Report the (rows, columns) shape, then preview the first five rows.
print(data.shape)
data.head(5)

(649, 31)

# List the column labels of the loaded frame.
data.columns

Index(['School', 'Gender', 'Age', 'Housing_Type', 'Family_Size',
       'Parental_Status', 'Mother_Education', 'Father_Education',
       'Mother_Work', 'Father_Work', 'Reason_School_Choice',
       'Legal_Responsibility', 'Commute_Time', 'Weekly_Study_Time',
       'Extra_Educational_Support', 'Parental_Educational_Support',
       'Private_Tutoring', 'Extracurricular_Activities', 'Attended_Daycare',
       'Desire_Graduate_Education', 'Has_Internet', 'Is_Dating',
       'Good_Family_Relationship', 'Free_Time_After_School',
       'Time_with_Friends', 'Alcohol_Weekdays', 'Alcohol_Weekends',
       'Health_Status', 'School_Absence', 'Grade_1st_Semester',
       'Grade_2nd_Semester'],
      dtype='object')

# Columns that are left out of the rest of the analysis.
unused_columns = [
    'Housing_Type',
    'Family_Size',
    'Father_Education',
    'Mother_Education',
    'Father_Work',
    'Mother_Work',
    'Reason_School_Choice',
    'Commute_Time',
    'Extracurricular_Activities',
    'Attended_Daycare',
    'Desire_Graduate_Education',
    'Free_Time_After_School',
    'Time_with_Friends',
]
data = data.drop(labels=unused_columns, axis=1)

# Summarise the remaining columns: non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   School                        649 non-null    object
 1   Gender                        649 non-null    object
 2   Age                           649 non-null    int64 
 3   Parental_Status               649 non-null    object
 4   Legal_Responsibility          649 non-null    object
 5   Weekly_Study_Time             649 non-null    object
 6   Extra_Educational_Support     649 non-null    object
 7   Parental_Educational_Support  649 non-null    object
 8   Private_Tutoring              649 non-null    object
 9   Has_Internet                  649 non-null    object
 10  Is_Dating                     649 non-null    object
 11  Good_Family_Relationship      649 non-null    object
 12  Alcohol_Weekdays              649 non-null    object
 13  Alcohol_Weekends              649 non-null    object
 14  Health_Status                 649 non-null    object
 15  School_Absence                649 non-null    int64 
 16  Grade_1st_Semester            649 non-null    int64 
 17  Grade_2nd_Semester            649 non-null    int64 
dtypes: int64(4), object(14)
memory usage: 91.4+ KB

# Count missing values per column.
data.isna().sum()

School                          0
Gender                          0
Age                             0
Parental_Status                 0
Legal_Responsibility            0
Weekly_Study_Time               0
Extra_Educational_Support       0
Parental_Educational_Support    0
Private_Tutoring                0
Has_Internet                    0
Is_Dating                       0
Good_Family_Relationship        0
Alcohol_Weekdays                0
Alcohol_Weekends                0
Health_Status                   0
School_Absence                  0
Grade_1st_Semester              0
Grade_2nd_Semester              0
dtype: int64

# Summary statistics for every column; include="all" also covers the non-numeric columns.
data.describe(include = "all")

# Row-wise mean of the two semester grades.
semester_grades = data[["Grade_1st_Semester", "Grade_2nd_Semester"]]
data["Average_Grade"] = semester_grades.mean(axis="columns")

# Columns that are plotted one panel each; the order here is the panel order.
cat_cols = [
    "School",
    "Gender",
    "Age",
    "Parental_Status",
    "Legal_Responsibility",
    "Weekly_Study_Time",
    "Alcohol_Weekdays",
    "Alcohol_Weekends",
    "Health_Status",
    "Extra_Educational_Support",
    "Parental_Educational_Support",
    "Private_Tutoring",
    "Is_Dating",
    "Has_Internet",
    "Good_Family_Relationship",
]

# Grid of count plots, one panel per column in cat_cols.
fig, axes = plt.subplots(nrows=5, ncols=3, figsize=(18, 16))
fig.suptitle("Distribution of Categorical Variables", fontsize=16)

for ax, col in zip(axes.flat, cat_cols):
    # Bars run from the most frequent value to the least frequent one.
    by_frequency = data[col].value_counts().index
    sns.countplot(data=data, x=col, order=by_frequency, ax=ax)

    # Write the count at the edge of each bar.
    for bars in ax.containers:
        ax.bar_label(bars, fmt="%d", label_type="edge", color="black")

plt.tight_layout()
plt.show()

# Label each row by comparing its average grade with the threshold of 10.
average_grade = data["Average_Grade"]
conditions = [average_grade.lt(10), average_grade.ge(10)]
values = ["Failed", "Passed"]

# Rows matching neither condition (e.g. a missing average) get "Unknown".
data["Pass"] = np.select(conditions, values, default="Unknown")

# Preview the frame, which now carries the derived columns.
data.head()

# Number of rows in each Pass category.
data["Pass"].value_counts()

Pass
Passed    478
Failed    171
Name: count, dtype: int64

# Bar chart of the number of rows in each Pass category.
sns.countplot(x = "Pass", data=data)

<Axes: xlabel='Pass', ylabel='count'>

# Grid of stacked bar charts: for every column in cat_cols, the share of each
# Pass category within each of that column's values.
fig, axes = plt.subplots(5, 3, figsize=(15, 20))

for ax, col in zip(axes.flat, cat_cols):
    # normalize="index" converts each row of the table into proportions.
    pass_share = pd.crosstab(data[col], data['Pass'], normalize="index")
    pass_share.plot(kind="bar", stacked=True, ax=ax)

    # Write the proportion in the middle of every bar segment.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', label_type='center', color='black')

    ax.set_title(f'Pass Rate by {col}')
    ax.set_xlabel(col)
    ax.set_ylabel("Proportion")
    ax.tick_params(axis='x', labelrotation=0)

plt.tight_layout()
# Render the figure once; a repeated show() has nothing left to display.
plt.show()

# Alcohol_Weekdays and Weekly_Study_Time hold text labels, so drawing one
# marker per row stacks every row that shares a pair of labels on the same
# spot, hiding both how many rows there are and what their grades look like.
# Draw one marker per label pair instead: its colour is the mean
# Average_Grade of the group and its area grows with the group's row count.
if all(c in data.columns for c in ['Alcohol_Weekdays', 'Weekly_Study_Time', 'Average_Grade']):
    # sort=False keeps the groups in order of first appearance, so the
    # categorical axes list the labels in the same order as the raw rows would.
    groups = (
        data.groupby(['Alcohol_Weekdays', 'Weekly_Study_Time'], sort=False)['Average_Grade']
        .agg(['mean', 'count'])
        .reset_index()
    )
    # Marker area scales with group size; the offset keeps small groups visible.
    sizes = 60 + 1500 * groups['count'] / groups['count'].max()

    fig, ax = plt.subplots(figsize=(8, 5))
    scatter = ax.scatter(
        groups['Alcohol_Weekdays'], groups['Weekly_Study_Time'],
        c=groups['mean'], s=sizes, cmap='RdYlGn',
        alpha=0.7, edgecolors='white', linewidths=0.3
    )
    # Extra padding so the largest markers are not clipped at the axes border.
    ax.margins(0.15)
    plt.colorbar(scatter, ax=ax, label='Average_Grade')
    ax.set_xlabel('Alcohol Consumption (Weekdays)')
    ax.set_ylabel('Weekly Study Time')
    ax.set_title('Alcohol × Study Time → Academic Grade')
    plt.tight_layout()
    plt.show()

# Annotated heatmap of the mean Average_Grade for every combination of
# Has_Internet and Private_Tutoring. Skipped when a needed column is absent.
required = {'Has_Internet', 'Private_Tutoring', 'Average_Grade'}
if required.issubset(data.columns):
    pivot = data.pivot_table(
        index='Has_Internet',
        columns='Private_Tutoring',
        values='Average_Grade',
        aggfunc='mean',
    )
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.heatmap(pivot, ax=ax, cmap='YlGnBu', annot=True, fmt='.2f', linewidths=0.5)
    ax.set_title('Average Grade\nInternet Access × Private Tutoring')
    plt.tight_layout()
    plt.show()

# Annotated heatmap of the mean Average_Grade for every combination of
# Good_Family_Relationship and Parental_Educational_Support.
# Skipped when a needed column is absent.
if all(c in data.columns for c in ['Good_Family_Relationship', 'Parental_Educational_Support', 'Average_Grade']):
    pivot = data.pivot_table(
        values='Average_Grade',
        index='Good_Family_Relationship',
        columns='Parental_Educational_Support',
        aggfunc='mean'
    )
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.heatmap(pivot, annot=True, fmt='.2f', cmap='YlGnBu', linewidths=0.5, ax=ax)
    # The cells hold mean average grades, so the title names that quantity.
    ax.set_title('Average Grade\nFamily Relationship × Parental Support')
    plt.tight_layout()
    plt.show()

	School	Gender	Age	Housing_Type	Family_Size	Parental_Status	Mother_Education	Father_Education	Mother_Work	Father_Work	...	Is_Dating	Good_Family_Relationship	Free_Time_After_School	Time_with_Friends	Alcohol_Weekdays	Alcohol_Weekends	Health_Status	School_Absence	Grade_1st_Semester	Grade_2nd_Semester
0	Gabriel Pereira	Female	18	Urban	Above 3	Separated	Higher Education	Higher Education	Homemaker	Teacher	...	No	Good	Moderate	High	Very Low	Very Low	Fair	4	0	11
1	Gabriel Pereira	Female	17	Urban	Above 3	Living Together	Primary School	Primary School	Homemaker	other	...	No	Excellent	Moderate	Moderate	Very Low	Very Low	Fair	2	9	11
2	Gabriel Pereira	Female	15	Urban	Up to 3	Living Together	Primary School	Primary School	Homemaker	other	...	No	Good	Moderate	Low	Low	Moderate	Fair	6	12	13
3	Gabriel Pereira	Female	15	Urban	Above 3	Living Together	Higher Education	Lower Secondary School	Health	Services	...	Yes	Fair	Low	Low	Very Low	Very Low	Very Good	0	14	14
4	Gabriel Pereira	Female	16	Urban	Above 3	Living Together	High School	High School	other	other	...	No	Good	Moderate	Low	Very Low	Low	Very Good	0	11	13

	School	Gender	Age	Parental_Status	Legal_Responsibility	Weekly_Study_Time	Extra_Educational_Support	Parental_Educational_Support	Private_Tutoring	Has_Internet	Is_Dating	Good_Family_Relationship	Alcohol_Weekdays	Alcohol_Weekends	Health_Status	School_Absence	Grade_1st_Semester	Grade_2nd_Semester
count	649	649	649.000000	649	649	649	649	649	649	649	649	649	649	649	649	649.000000	649.000000	649.000000
unique	2	2	NaN	2	3	4	2	2	2	2	2	5	5	5	5	NaN	NaN	NaN
top	Gabriel Pereira	Female	NaN	Living Together	Mother	2 to 5h	No	Yes	No	Yes	No	Good	Very Low	Very Low	Very Good	NaN	NaN	NaN
freq	423	383	NaN	569	455	305	581	398	610	498	410	317	451	247	249	NaN	NaN	NaN
mean	NaN	NaN	16.744222	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3.659476	11.399076	11.570108
std	NaN	NaN	1.218138	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	4.640759	2.745265	2.913639
min	NaN	NaN	15.000000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.000000	0.000000	0.000000
25%	NaN	NaN	16.000000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.000000	10.000000	10.000000
50%	NaN	NaN	17.000000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2.000000	11.000000	11.000000
75%	NaN	NaN	18.000000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	6.000000	13.000000	13.000000
max	NaN	NaN	22.000000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	32.000000	19.000000	19.000000

Academic Performance Analysis¶

Loading Data¶

Exploratory Data Analysis¶

Visualization of Categorical Data¶

Calculating Passing Grade¶

Pass Rate by Categorical Values¶

Multivariate Plots¶

Alcohol Consumption × Study Time → Grade¶

Average Grade — Internet Access × Private Tutoring¶

Family Relationship × Parental Support → Grade Trajectory¶

Note:¶