import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from dmba import regressionSummary
from dmba import backward_elimination, stepwise_selection
from dmba import adjusted_r2_score, AIC_score
import statsmodels.formula.api as sm
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier, plot_tree
%matplotlib inline
# Load the placement data set and use its serial-number column as the index.
df = pd.read_csv('Placement_Data_Full_Class.csv').set_index('sl_no')

# Rescale the salary column by a fixed factor.
# NOTE(review): 0.013 looks like a currency conversion rate -- confirm the
# currencies involved and where the rate came from.
SALARY_SCALE = 0.013
df['salary'] = df['salary'] * SALARY_SCALE

# One-hot encode the categorical columns; drop_first=True drops one level of
# each variable so the remaining dummies are not perfectly collinear.
dummies = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation', 'status']
df_clean = pd.get_dummies(df, columns=dummies, drop_first=True)

# Fixed column order used by the rest of the analysis.
columns = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p', 'gender_M',
           'ssc_b_Others', 'hsc_b_Others', 'hsc_s_Commerce', 'hsc_s_Science',
           'degree_t_Others', 'degree_t_Sci&Tech', 'workex_Yes',
           'specialisation_Mkt&HR', 'status_Placed', 'salary']
# Select with [] instead of reindex(): reindex() silently fabricates an
# all-NaN column for any name that is missing, and the dropna() below would
# then discard every row.  [] raises KeyError on a missing column instead.
df_clean = df_clean[columns]

# Variants with every row containing a missing value removed: one encoded,
# one with the original categorical columns.
df_clean_nonans = df_clean.dropna()
df_nonans = df.dropna()
# Split the complete-case rows by the gender dummy (1 = male, 0 = female).
gender_flag = df_clean_nonans['gender_M']
male_df = df_clean_nonans[gender_flag == 1]
female_df = df_clean_nonans[gender_flag == 0]

# Compare mean salary of the two groups with an independent two-sample t-test.
male_salary = male_df['salary']
female_salary = female_df['salary']
print('Mean of salary for male: ', male_salary.mean())
print('Mean of salary for female: ', female_salary.mean())
print('Hypothesis test for difference between means: ')
print(stats.ttest_ind(male_salary, female_salary))
Mean of salary for male: 3885.83 Mean of salary for female: 3474.7916666666665 Hypothesis test for difference between means: Ttest_indResult(statistic=1.9448521615505516, pvalue=0.053714668806405554)
# Split the complete-case rows by secondary-school board dummy
# (1 = Others, 0 = Central).
ssc_board_flag = df_clean_nonans['ssc_b_Others']
ssc_b_Others_df = df_clean_nonans[ssc_board_flag == 1]
ssc_b_Central_df = df_clean_nonans[ssc_board_flag == 0]

# Independent two-sample t-test on mean salary, Others vs. Central.
ssc_others_salary = ssc_b_Others_df['salary']
ssc_central_salary = ssc_b_Central_df['salary']
print('Mean of salary for Others: ', ssc_others_salary.mean())
print('Mean of salary for Central: ', ssc_central_salary.mean())
print('Hypothesis test for difference between means: ')
print(stats.ttest_ind(ssc_others_salary, ssc_central_salary))
Mean of salary for Others: 3759.6 Mean of salary for Central: 3746.1666666666665 Hypothesis test for difference between means: Ttest_indResult(statistic=0.06692931610519544, pvalue=0.9467295407874518)
# Split the complete-case rows by higher-secondary board dummy
# (1 = Others, 0 = Central).
hsc_b_Others_df = df_clean_nonans.loc[df_clean_nonans['hsc_b_Others'] == 1]
hsc_b_Central_df = df_clean_nonans.loc[df_clean_nonans['hsc_b_Others'] == 0]

# Report both group means, then the independent two-sample t-test result.
hsc_others_mean = hsc_b_Others_df['salary'].mean()
hsc_central_mean = hsc_b_Central_df['salary'].mean()
hsc_board_test = stats.ttest_ind(hsc_b_Others_df['salary'], hsc_b_Central_df['salary'])
print('Mean of salary for Others: ', hsc_others_mean)
print('Mean of salary for Central: ', hsc_central_mean)
print('Hypothesis test for difference between means: ')
print(hsc_board_test)
Mean of salary for Others: 3745.285714285714 Mean of salary for Central: 3764.0701754385964 Hypothesis test for difference between means: Ttest_indResult(statistic=-0.09122184544225741, pvalue=0.9274413548271774)
# One sub-frame per higher-secondary stream, taken from the un-encoded data.
hsc_s_Com_df = df_nonans[df_nonans['hsc_s'] == 'Commerce']
hsc_s_Sci_df = df_nonans[df_nonans['hsc_s'] == 'Science']
hsc_s_Art_df = df_nonans[df_nonans['hsc_s'] == 'Arts']

# Pairwise independent t-tests on mean salary, in the order
# Commerce/Science, Commerce/Arts, Science/Arts.
hsc_s_pairs = [
    ('Commerce', hsc_s_Com_df, 'Science', hsc_s_Sci_df),
    ('Commerce', hsc_s_Com_df, 'Arts', hsc_s_Art_df),
    ('Science', hsc_s_Sci_df, 'Arts', hsc_s_Art_df),
]
for pair_no, (name_a, frame_a, name_b, frame_b) in enumerate(hsc_s_pairs):
    if pair_no:
        # blank line between consecutive comparisons (none after the last)
        print()
    print(f'Mean of salary for {name_a}: ', frame_a['salary'].mean())
    print(f'Mean of salary for {name_b}: ', frame_b['salary'].mean())
    print('Hypothesis test for difference between means: ')
    print(stats.ttest_ind(frame_a['salary'], frame_b['salary']))
Mean of salary for Commerce: 3736.4303797468356 Mean of salary for Science: 3822.2063492063494 Hypothesis test for difference between means: Ttest_indResult(statistic=-0.4102884448701634, pvalue=0.68222156865267) Mean of salary for Commerce: 3736.4303797468356 Mean of salary for Arts: 3232.6666666666665 Hypothesis test for difference between means: Ttest_indResult(statistic=0.9417798090868945, pvalue=0.3490384779982475) Mean of salary for Science: 3822.2063492063494 Mean of salary for Arts: 3232.6666666666665 Hypothesis test for difference between means: Ttest_indResult(statistic=1.2384203806946261, pvalue=0.21988183557319033)
# One sub-frame per undergraduate degree type, taken from the un-encoded data.
degree_t_CM_df = df_nonans[df_nonans['degree_t'] == 'Comm&Mgmt']
degree_t_O_df = df_nonans[df_nonans['degree_t'] == 'Others']
degree_t_ST_df = df_nonans[df_nonans['degree_t'] == 'Sci&Tech']

# Pairwise independent t-tests on mean salary, in the order
# Comm&Mgmt/Others, Comm&Mgmt/Sci&Tech, Sci&Tech/Others.
degree_t_pairs = [
    ('Comm&Mgmt', degree_t_CM_df, 'Others', degree_t_O_df),
    ('Comm&Mgmt', degree_t_CM_df, 'Sci&Tech', degree_t_ST_df),
    ('Sci&Tech', degree_t_ST_df, 'Others', degree_t_O_df),
]
for pair_no, (name_a, frame_a, name_b, frame_b) in enumerate(degree_t_pairs):
    if pair_no:
        # blank line between consecutive comparisons (none after the last)
        print()
    print(f'Mean of salary for {name_a}: ', frame_a['salary'].mean())
    print(f'Mean of salary for {name_b}: ', frame_b['salary'].mean())
    print('Hypothesis test for difference between means: ')
    print(stats.ttest_ind(frame_a['salary'], frame_b['salary']))
Mean of salary for Comm&Mgmt: 3622.156862745098 Mean of salary for Others: 3645.2 Hypothesis test for difference between means: Ttest_indResult(statistic=-0.04313075692071769, pvalue=0.9656792154031896) Mean of salary for Comm&Mgmt: 3622.156862745098 Mean of salary for Sci&Tech: 4089.9268292682927 Hypothesis test for difference between means: Ttest_indResult(statistic=-2.0786256077393412, pvalue=0.03946403084269494) Mean of salary for Sci&Tech: 4089.9268292682927 Mean of salary for Others: 3645.2 Hypothesis test for difference between means: Ttest_indResult(statistic=0.7476534084682858, pvalue=0.45864660838868276)
# Split the complete-case rows by the work-experience dummy (1 = yes, 0 = no).
has_workex = df_clean_nonans['workex_Yes'] == 1
no_workex = df_clean_nonans['workex_Yes'] == 0
workex_Y_df = df_clean_nonans[has_workex]
workex_N_df = df_clean_nonans[no_workex]

# Independent two-sample t-test on mean salary between the two groups.
workex_salary = workex_Y_df['salary']
no_workex_salary = workex_N_df['salary']
print('Mean of salary for Work Experience: ', workex_salary.mean())
print('Mean of salary for No Work Experience: ', no_workex_salary.mean())
print('Hypothesis test for difference between means: ')
print(stats.ttest_ind(workex_salary, no_workex_salary))
Mean of salary for Work Experience: 3942.453125 Mean of salary for No Work Experience: 3607.809523809524 Hypothesis test for difference between means: Ttest_indResult(statistic=1.6701337722667704, pvalue=0.09703543621755682)
# Split the complete-case rows by the MBA specialisation dummy
# (1 = Mkt&HR, 0 = Mkt&Fin).
spec_flag = df_clean_nonans['specialisation_Mkt&HR']
MKHR_df = df_clean_nonans.loc[spec_flag == 1]
MKFN_df = df_clean_nonans.loc[spec_flag == 0]

# Independent two-sample t-test on mean salary, Mkt&HR vs. Mkt&Fin.
mkhr_salary, mkfn_salary = MKHR_df['salary'], MKFN_df['salary']
print('Mean of salary for Mkt&HR: ', mkhr_salary.mean())
print('Mean of salary for Mkt&Fin: ', mkfn_salary.mean())
print('Hypothesis test for difference between means: ')
spec_test = stats.ttest_ind(mkhr_salary, mkfn_salary)
print(spec_test)
Mean of salary for Mkt&HR: 3514.9056603773583 Mean of salary for Mkt&Fin: 3885.084210526316 Hypothesis test for difference between means: Ttest_indResult(statistic=-1.790427661842333, pvalue=0.07545741069325944)
# functions needed for backward elimination
def train_model(variables):
    """Fit a linear regression on the given subset of training columns.

    Reads the module-level ``X_train`` and ``y_train`` at call time, so it
    uses whichever split those names are bound to when it is invoked.

    Args:
        variables: column labels of ``X_train`` to use as predictors.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    # fit() returns the estimator itself, so this yields the fitted model
    return LinearRegression().fit(X_train[variables], y_train)
def score_model(model, variables):
    """Return the AIC score of ``model`` on the training data.

    Reads the module-level ``X_train`` and ``y_train`` at call time.

    Args:
        model: a fitted estimator exposing ``predict``.
        variables: column labels of ``X_train`` the model was fitted on.

    Returns:
        The value of ``AIC_score`` for the training predictions.
    """
    train_predictions = model.predict(X_train[variables])
    return AIC_score(y_train, train_predictions, model)
# Predictor columns for the salary regression (everything except the
# placement status and the salary target itself).
features = [
    'ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p',
    'gender_M', 'ssc_b_Others', 'hsc_b_Others',
    'hsc_s_Commerce', 'hsc_s_Science',
    'degree_t_Others', 'degree_t_Sci&Tech',
    'workex_Yes', 'specialisation_Mkt&HR',
]
X, y = df_clean_nonans[features], df_clean_nonans['salary']

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=717
)

# Show the shape of each piece of the split.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)
(103, 14) (45, 14) (103,) (45,)
# run backward elimination
# Starts from all training columns and uses train_model / score_model (AIC on
# the training data) to decide which variables to drop.
best_model, best_variables = backward_elimination(
    X_train.columns, train_model, score_model, verbose=True
)
print(best_variables)

# Evaluate the selected model on the held-out test split.
# regressionSummary() prints its own report and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
regressionSummary(y_test, best_model.predict(X_test[best_variables]))
Variables: ssc_p, hsc_p, degree_p, etest_p, mba_p, gender_M, ssc_b_Others, hsc_b_Others, hsc_s_Commerce, hsc_s_Science, degree_t_Others, degree_t_Sci&Tech, workex_Yes, specialisation_Mkt&HR Start: score=1791.67 Step: score=1789.70, remove etest_p Step: score=1787.76, remove specialisation_Mkt&HR Step: score=1785.95, remove ssc_b_Others Step: score=1784.16, remove degree_t_Others Step: score=1782.40, remove hsc_s_Science Step: score=1780.58, remove hsc_b_Others Step: score=1779.57, remove degree_p Step: score=1778.57, remove hsc_p Step: score=1778.34, remove ssc_p Step: score=1778.28, remove workex_Yes Step: score=1778.28, remove None ['mba_p', 'gender_M', 'hsc_s_Commerce', 'degree_t_Sci&Tech'] Regression statistics Mean Error (ME) : -349.6807 Root Mean Squared Error (RMSE) : 832.5384 Mean Absolute Error (MAE) : 684.3140 Mean Percentage Error (MPE) : -13.0372 Mean Absolute Percentage Error (MAPE) : 20.2176 None
# show full model results with statsmodels
train_df = X_train.join(y_train)
# The formula string is parsed, so a column name containing '&' presumably
# cannot be used as a term as-is; rename it to a plain identifier first.
train_df.rename(columns={"degree_t_Sci&Tech": "degree_t_SciTech"}, inplace=True)
# Variables kept by the backward elimination run (with the renamed column).
best_variables = ['mba_p', 'gender_M', 'hsc_s_Commerce', 'degree_t_SciTech']
formula = 'salary ~ ' + ' + '.join(best_variables)
lm = sm.ols(formula=formula, data=train_df).fit()
# The fitted model was previously never displayed; print the full summary
# (coefficients, standard errors, p-values, fit statistics).
print(lm.summary())

# One single-predictor regression plot of salary per selected variable.
# Each entry is (column in train_df, human-readable axis label).
plot_labels = [
    ('mba_p', 'MBA Percentage'),
    ('gender_M', 'Male Gender'),
    ('hsc_s_Commerce', 'Higher Secondary School Commerce Specialization'),
    ('degree_t_SciTech', 'Undergrad Degree in Sci&Tech'),
]
for plot_column, plot_label in plot_labels:
    sns.lmplot(x=plot_column, y='salary', data=train_df)
    plt.xlabel(plot_label)
    plt.ylabel('Salary')
    plt.title(plot_label + ' Linear Regression on Salary')
    plt.show()
# Predictor columns for the placement classifier.
features = [
    'ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p',
    'gender_M', 'ssc_b_Others', 'hsc_b_Others',
    'hsc_s_Commerce', 'hsc_s_Science',
    'degree_t_Others', 'degree_t_Sci&Tech',
    'workex_Yes', 'specialisation_Mkt&HR',
]
# Uses df_clean (not the NaN-dropped frame); the target is the placement dummy.
X, y = df_clean[features], df_clean['status_Placed']

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=717
)
# Decision tree: Gini criterion, depth capped at 4, at least 5 samples per leaf.
tree = DecisionTreeClassifier(
    criterion='gini',
    random_state=717,
    max_depth=4,
    min_samples_leaf=5,
).fit(X_train, y_train)  # fit() returns the estimator itself

# Predict placement on the held-out split.
y_pred = tree.predict(X_test)

# Evaluate: confusion matrix, accuracy in percent, per-class report.
evaluation = (
    ("Confusion Matrix", confusion_matrix(y_test, y_pred)),
    ("Accuracy", accuracy_score(y_test, y_pred) * 100),
    ("Report", classification_report(y_test, y_pred)),
)
for heading, result in evaluation:
    print(heading)
    print(result)

# Feature importances for all predictors, largest first.
feature_imp = pd.Series(tree.feature_importances_, index=features)
feature_imp = feature_imp.sort_values(ascending=False)
print('Feature Importance')
print(feature_imp)
Confusion Matrix [[13 4] [ 8 40]] Accuracy 81.53846153846153 Report precision recall f1-score support 0 0.62 0.76 0.68 17 1 0.91 0.83 0.87 48 accuracy 0.82 65 macro avg 0.76 0.80 0.78 65 weighted avg 0.83 0.82 0.82 65 Feature Importance ssc_p 0.536245 hsc_p 0.197759 mba_p 0.180012 degree_p 0.085984 etest_p 0.000000 gender_M 0.000000 ssc_b_Others 0.000000 hsc_b_Others 0.000000 hsc_s_Commerce 0.000000 hsc_s_Science 0.000000 degree_t_Others 0.000000 degree_t_Sci&Tech 0.000000 workex_Yes 0.000000 specialisation_Mkt&HR 0.000000 dtype: float64
# draw decision tree
plt.style.use('classic')
# Display name for each value of the status_Placed target.
label_names = {1: 'Placed', 0: 'Not Placed'}
fig = plt.figure(figsize=(20, 10))
# plot_tree expects class_names as a sequence of strings in the same order as
# the fitted classes, not a dict.  Build that list from tree.classes_; int()
# makes the lookup work whether the target is stored as 0/1 or False/True.
class_labels = [label_names[int(cls)] for cls in tree.classes_]
plot_tree(tree, feature_names=features, class_names=class_labels)
plt.show()