import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from dmba import regressionSummary
from dmba import backward_elimination, stepwise_selection
from dmba import adjusted_r2_score, AIC_score
import statsmodels.formula.api as sm
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier, plot_tree
%matplotlib inline


# Load the placement data and use the serial-number column as the row index
# (the column itself is removed from the frame).
df = pd.read_csv('Placement_Data_Full_Class.csv').set_index('sl_no')

# Rescale the salary column by a fixed factor.
# NOTE(review): 0.013 looks like a currency conversion rate -- confirm the
# intended units and the source of the constant.
df['salary'] = df['salary'].mul(0.013)

# Categorical columns to one-hot encode; drop_first=True leaves out the first
# level of each, so every remaining dummy is relative to that baseline level.
dummies = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation', 'status']

# Final column order: numeric scores, then dummies, then the two outcomes.
columns = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p', 'gender_M',
           'ssc_b_Others', 'hsc_b_Others', 'hsc_s_Commerce', 'hsc_s_Science',
           'degree_t_Others', 'degree_t_Sci&Tech', 'workex_Yes',
           'specialisation_Mkt&HR', 'status_Placed', 'salary']

df_clean = pd.get_dummies(df, columns=dummies, drop_first=True).reindex(columns=columns)

# Complete-case versions of both frames: any row containing a NaN is dropped.
df_nonans = df.dropna()
df_clean_nonans = df_clean.dropna()


# Split the complete-case rows on the gender dummy (1 -> male, 0 -> female).
gender_flag = df_clean_nonans['gender_M']
male_df = df_clean_nonans.loc[gender_flag == 1]
female_df = df_clean_nonans.loc[gender_flag == 0]

# Independent two-sample t-test on mean salary between the two groups.
male_salary = male_df['salary']
female_salary = female_df['salary']
print('Mean of salary for male: ', male_salary.mean())
print('Mean of salary for female: ', female_salary.mean())
print('Hypothesis test for difference between means: ')
print(stats.ttest_ind(male_salary, female_salary))

Mean of salary for male:  3885.83
Mean of salary for female:  3474.7916666666665
Hypothesis test for difference between means: 
Ttest_indResult(statistic=1.9448521615505516, pvalue=0.053714668806405554)


# Split the complete-case rows on the secondary-school board dummy
# (1 -> Others, 0 -> Central).
ssc_board_flag = df_clean_nonans['ssc_b_Others']
ssc_b_Others_df = df_clean_nonans.loc[ssc_board_flag == 1]
ssc_b_Central_df = df_clean_nonans.loc[ssc_board_flag == 0]

# Independent two-sample t-test on mean salary between the two boards.
ssc_others_salary = ssc_b_Others_df['salary']
ssc_central_salary = ssc_b_Central_df['salary']
print('Mean of salary for Others: ', ssc_others_salary.mean())
print('Mean of salary for Central: ', ssc_central_salary.mean())
print('Hypothesis test for difference between means: ')
print(stats.ttest_ind(ssc_others_salary, ssc_central_salary))

Mean of salary for Others:  3759.6
Mean of salary for Central:  3746.1666666666665
Hypothesis test for difference between means: 
Ttest_indResult(statistic=0.06692931610519544, pvalue=0.9467295407874518)


# Split the complete-case rows on the higher-secondary board dummy
# (1 -> Others, 0 -> Central).
hsc_board_flag = df_clean_nonans['hsc_b_Others']
hsc_b_Others_df = df_clean_nonans.loc[hsc_board_flag == 1]
hsc_b_Central_df = df_clean_nonans.loc[hsc_board_flag == 0]

# Independent two-sample t-test on mean salary between the two boards.
hsc_others_salary = hsc_b_Others_df['salary']
hsc_central_salary = hsc_b_Central_df['salary']
print('Mean of salary for Others: ', hsc_others_salary.mean())
print('Mean of salary for Central: ', hsc_central_salary.mean())
print('Hypothesis test for difference between means: ')
print(stats.ttest_ind(hsc_others_salary, hsc_central_salary))

Mean of salary for Others:  3745.285714285714
Mean of salary for Central:  3764.0701754385964
Hypothesis test for difference between means: 
Ttest_indResult(statistic=-0.09122184544225741, pvalue=0.9274413548271774)


# Partition the complete-case rows by higher-secondary stream.
hsc_stream = df_nonans['hsc_s']
hsc_s_Com_df = df_nonans.loc[hsc_stream == 'Commerce']
hsc_s_Sci_df = df_nonans.loc[hsc_stream == 'Science']
hsc_s_Art_df = df_nonans.loc[hsc_stream == 'Arts']

# Pairwise two-sample t-tests on mean salary, reported in this order:
# Commerce vs Science, Commerce vs Arts, Science vs Arts.
hsc_s_comparisons = [
    ('Commerce', hsc_s_Com_df, 'Science', hsc_s_Sci_df),
    ('Commerce', hsc_s_Com_df, 'Arts', hsc_s_Art_df),
    ('Science', hsc_s_Sci_df, 'Arts', hsc_s_Art_df),
]
for position, (first_name, first_df, second_name, second_df) in enumerate(hsc_s_comparisons):
    if position:
        print()  # blank separator between consecutive comparisons
    print(f'Mean of salary for {first_name}: ', first_df['salary'].mean())
    print(f'Mean of salary for {second_name}: ', second_df['salary'].mean())
    print('Hypothesis test for difference between means: ')
    print(stats.ttest_ind(first_df['salary'], second_df['salary']))

Mean of salary for Commerce:  3736.4303797468356
Mean of salary for Science:  3822.2063492063494
Hypothesis test for difference between means: 
Ttest_indResult(statistic=-0.4102884448701634, pvalue=0.68222156865267)

Mean of salary for Commerce:  3736.4303797468356
Mean of salary for Arts:  3232.6666666666665
Hypothesis test for difference between means: 
Ttest_indResult(statistic=0.9417798090868945, pvalue=0.3490384779982475)

Mean of salary for Science:  3822.2063492063494
Mean of salary for Arts:  3232.6666666666665
Hypothesis test for difference between means: 
Ttest_indResult(statistic=1.2384203806946261, pvalue=0.21988183557319033)


# Partition the complete-case rows by undergraduate degree type.
degree_type = df_nonans['degree_t']
degree_t_CM_df = df_nonans.loc[degree_type == 'Comm&Mgmt']
degree_t_O_df = df_nonans.loc[degree_type == 'Others']
degree_t_ST_df = df_nonans.loc[degree_type == 'Sci&Tech']

# Pairwise two-sample t-tests on mean salary, reported in this order:
# Comm&Mgmt vs Others, Comm&Mgmt vs Sci&Tech, Sci&Tech vs Others.
degree_t_comparisons = [
    ('Comm&Mgmt', degree_t_CM_df, 'Others', degree_t_O_df),
    ('Comm&Mgmt', degree_t_CM_df, 'Sci&Tech', degree_t_ST_df),
    ('Sci&Tech', degree_t_ST_df, 'Others', degree_t_O_df),
]
for position, (first_name, first_df, second_name, second_df) in enumerate(degree_t_comparisons):
    if position:
        print()  # blank separator between consecutive comparisons
    print(f'Mean of salary for {first_name}: ', first_df['salary'].mean())
    print(f'Mean of salary for {second_name}: ', second_df['salary'].mean())
    print('Hypothesis test for difference between means: ')
    print(stats.ttest_ind(first_df['salary'], second_df['salary']))

Mean of salary for Comm&Mgmt:  3622.156862745098
Mean of salary for Others:  3645.2
Hypothesis test for difference between means: 
Ttest_indResult(statistic=-0.04313075692071769, pvalue=0.9656792154031896)

Mean of salary for Comm&Mgmt:  3622.156862745098
Mean of salary for Sci&Tech:  4089.9268292682927
Hypothesis test for difference between means: 
Ttest_indResult(statistic=-2.0786256077393412, pvalue=0.03946403084269494)

Mean of salary for Sci&Tech:  4089.9268292682927
Mean of salary for Others:  3645.2
Hypothesis test for difference between means: 
Ttest_indResult(statistic=0.7476534084682858, pvalue=0.45864660838868276)


# Split the complete-case rows on the work-experience dummy (1 -> Yes, 0 -> No).
workex_flag = df_clean_nonans['workex_Yes']
workex_Y_df = df_clean_nonans.loc[workex_flag == 1]
workex_N_df = df_clean_nonans.loc[workex_flag == 0]

# Independent two-sample t-test on mean salary: experience vs no experience.
workex_yes_salary = workex_Y_df['salary']
workex_no_salary = workex_N_df['salary']
print('Mean of salary for Work Experience: ', workex_yes_salary.mean())
print('Mean of salary for No Work Experience: ', workex_no_salary.mean())
print('Hypothesis test for difference between means: ')
print(stats.ttest_ind(workex_yes_salary, workex_no_salary))

Mean of salary for Work Experience:  3942.453125
Mean of salary for No Work Experience:  3607.809523809524
Hypothesis test for difference between means: 
Ttest_indResult(statistic=1.6701337722667704, pvalue=0.09703543621755682)


# Split the complete-case rows on the MBA specialisation dummy
# (1 -> Mkt&HR, 0 -> Mkt&Fin).
specialisation_flag = df_clean_nonans['specialisation_Mkt&HR']
MKHR_df = df_clean_nonans.loc[specialisation_flag == 1]
MKFN_df = df_clean_nonans.loc[specialisation_flag == 0]

# Independent two-sample t-test on mean salary between the specialisations.
mkhr_salary = MKHR_df['salary']
mkfn_salary = MKFN_df['salary']
print('Mean of salary for Mkt&HR: ', mkhr_salary.mean())
print('Mean of salary for Mkt&Fin: ', mkfn_salary.mean())
print('Hypothesis test for difference between means: ')
print(stats.ttest_ind(mkhr_salary, mkfn_salary))

Mean of salary for Mkt&HR:  3514.9056603773583
Mean of salary for Mkt&Fin:  3885.084210526316
Hypothesis test for difference between means: 
Ttest_indResult(statistic=-1.790427661842333, pvalue=0.07545741069325944)


# functions needed for backward elimination

def train_model(variables):
    """Fit a LinearRegression on the chosen columns of the training data.

    Reads the module-level ``X_train`` and ``y_train`` at call time, so it
    uses whichever split those names are currently bound to.

    Args:
        variables: Column labels of ``X_train`` to use as predictors.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    predictors = X_train[variables]
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return LinearRegression().fit(predictors, y_train)

def score_model(model, variables):
    """Score a fitted model on the training data with ``AIC_score``.

    Reads the module-level ``X_train`` and ``y_train`` at call time.

    Args:
        model: Fitted estimator exposing ``predict``.
        variables: Column labels of ``X_train`` the model was trained on.

    Returns:
        The value returned by ``AIC_score`` for the training predictions.
    """
    training_predictions = model.predict(X_train[variables])
    return AIC_score(y_train, training_predictions, model)


# Candidate predictors for the salary regression: every encoded column
# except the two outcome columns (status_Placed and salary).
features = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p', 'gender_M',
            'ssc_b_Others', 'hsc_b_Others', 'hsc_s_Commerce', 'hsc_s_Science',
            'degree_t_Others', 'degree_t_Sci&Tech', 'workex_Yes',
            'specialisation_Mkt&HR']

# The regression uses the NaN-free frame, so every row has a salary value.
X, y = df_clean_nonans[features], df_clean_nonans['salary']

# Hold out 30% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=717
)
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(103, 14)
(45, 14)
(103,)
(45,)


# run backward elimination
# Starts from all training columns and lets dmba's backward_elimination drop
# variables, using train_model / score_model to fit and score each candidate.
best_model, best_variables = backward_elimination(X_train.columns, train_model, score_model, verbose=True)

print(best_variables)
# regressionSummary prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" after the report.
regressionSummary(y_test, best_model.predict(X_test[best_variables]))

Variables: ssc_p, hsc_p, degree_p, etest_p, mba_p, gender_M, ssc_b_Others, hsc_b_Others, hsc_s_Commerce, hsc_s_Science, degree_t_Others, degree_t_Sci&Tech, workex_Yes, specialisation_Mkt&HR
Start: score=1791.67
Step: score=1789.70, remove etest_p
Step: score=1787.76, remove specialisation_Mkt&HR
Step: score=1785.95, remove ssc_b_Others
Step: score=1784.16, remove degree_t_Others
Step: score=1782.40, remove hsc_s_Science
Step: score=1780.58, remove hsc_b_Others
Step: score=1779.57, remove degree_p
Step: score=1778.57, remove hsc_p
Step: score=1778.34, remove ssc_p
Step: score=1778.28, remove workex_Yes
Step: score=1778.28, remove None
['mba_p', 'gender_M', 'hsc_s_Commerce', 'degree_t_Sci&Tech']

Regression statistics

                      Mean Error (ME) : -349.6807
       Root Mean Squared Error (RMSE) : 832.5384
            Mean Absolute Error (MAE) : 684.3140
          Mean Percentage Error (MPE) : -13.0372
Mean Absolute Percentage Error (MAPE) : 20.2176
None


# show full model results with statsmodels

# Rebuild the training frame with the target column attached.
train_df = X_train.join(y_train)

# '&' cannot appear in a bare formula term, so strip it from every column
# name (not only the one that happened to be selected) and apply the same
# clean-up to the variables chosen by backward elimination.
train_df.columns = train_df.columns.str.replace('&', '', regex=False)
best_variables = [name.replace('&', '') for name in best_variables]
formula = 'salary ~ ' + ' + '.join(best_variables)

lm = sm.ols(formula=formula, data=train_df).fit()
# Actually display the fitted model; an assignment alone shows nothing.
print(lm.summary())


# One salary-vs-predictor regression plot per variable kept by backward
# elimination: (column in train_df, x-axis label, plot title).
salary_plots = [
    ('mba_p',
     'MBA Percentage',
     'MBA Percentage Linear Regression on Salary'),
    ('gender_M',
     'Male Gender',
     'Male Gender Linear Regression on Salary'),
    ('hsc_s_Commerce',
     'Higher Secondary School Commerce Specialization',
     'Higher Secondary School Commerce Specialization Linear Regression on Salary'),
    ('degree_t_SciTech',
     'Undergrad Degree in Sci&Tech',
     'Undergrad Degree in Sci&Tech Linear Regression on Salary'),
]

for predictor, x_label, plot_title in salary_plots:
    sns.lmplot(x=predictor, y='salary', data=train_df)
    plt.xlabel(x_label)
    plt.ylabel('Salary')
    plt.title(plot_title)
    plt.show()


# Predictors for the placement classifier (same columns as the regression).
features = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p', 'gender_M',
            'ssc_b_Others', 'hsc_b_Others', 'hsc_s_Commerce', 'hsc_s_Science',
            'degree_t_Others', 'degree_t_Sci&Tech', 'workex_Yes',
            'specialisation_Mkt&HR']

# Classification uses the full encoded frame (df_clean, not the NaN-free
# copy); the target is the placement-status dummy.
X, y = df_clean[features], df_clean['status_Placed']

# NOTE: this rebinds X_train / y_train, which train_model and score_model read
# as globals -- from here on they would operate on the classification split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=717
)


# Decision tree: shallow Gini tree (depth <= 4, at least 5 samples per leaf),
# fitted on the training split. fit() returns the estimator itself.
tree = DecisionTreeClassifier(
    criterion='gini',
    random_state=717,
    max_depth=4,
    min_samples_leaf=5,
).fit(X_train, y_train)

# Predict placement status for the held-out rows.
y_pred = tree.predict(X_test)

# Importance of every feature (all of them, not a top-N), largest first.
feature_imp = pd.Series(tree.feature_importances_, index=features).sort_values(ascending=False)

# Evaluation report: each section is a heading line followed by its content.
report_sections = [
    ("Confusion Matrix", confusion_matrix(y_test, y_pred)),
    ("Accuracy", accuracy_score(y_test, y_pred) * 100),  # as a percentage
    ("Report", classification_report(y_test, y_pred)),
    ('Feature Importance', feature_imp),
]
for heading, content in report_sections:
    print(heading)
    print(content)

Confusion Matrix
[[13  4]
 [ 8 40]]
Accuracy
81.53846153846153
Report
              precision    recall  f1-score   support

           0       0.62      0.76      0.68        17
           1       0.91      0.83      0.87        48

    accuracy                           0.82        65
   macro avg       0.76      0.80      0.78        65
weighted avg       0.83      0.82      0.82        65

Feature Importance
ssc_p                    0.536245
hsc_p                    0.197759
mba_p                    0.180012
degree_p                 0.085984
etest_p                  0.000000
gender_M                 0.000000
ssc_b_Others             0.000000
hsc_b_Others             0.000000
hsc_s_Commerce           0.000000
hsc_s_Science            0.000000
degree_t_Others          0.000000
degree_t_Sci&Tech        0.000000
workex_Yes               0.000000
specialisation_Mkt&HR    0.000000
dtype: float64


# draw decision tree
# Note: plt.style.use changes the global matplotlib style for later plots too.
plt.style.use('classic')
label_names = {1: 'Placed', 0: 'Not Placed'}
fig = plt.figure(figsize=(20, 10))
# plot_tree documents class_names as a sequence of names in ascending class
# order; passing the dict only worked by accident of integer-key lookup, so
# build the ordered list from the mapping instead.
plot_tree(
    tree,
    feature_names=features,
    class_names=[label_names[code] for code in sorted(label_names)],
)
plt.show()

Statistical Analysis

Gender

Lower Secondary Education

Higher Secondary Education

Specialisation

Undergraduate Education

Work Experience and No Work Experience

Specialisation MKT and HR

Linear Regression on salary variable

Backward Elimination

Linear Regression on Salary

Male Gender Linear Regression on Salary

Higher Secondary School Commerce Specialisation Linear Regression on Salary

Undergraduate Degree in Science & Tech Linear Regression on Salary

Classification of 'status_Placed' Variable

Decision Tree