import pandas as pd
# Load the dataset from the uploaded file
file_path = '/content/employee_attrition_data.csv'
employee_attrition_data = pd.read_csv(file_path)
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(employee_attrition_data.head())
# Display summary information about the dataset
print("\nSummary Information of the dataset:")
employee_attrition_data.info()  # info() prints its report directly and returns None, so no print() wrapper
# Calculate basic statistics for numerical columns
print("\nBasic Statistics of the dataset:")
print(employee_attrition_data.describe())
First few rows of the dataset:
   Employee_ID  Age  Gender   Department Job_Title  Years_at_Company  Satisfaction_Level  Average_Monthly_Hours  Promotion_Last_5Years  Salary  Attrition
0            0   27    Male    Marketing   Manager                 9            0.586251                    151                      0   60132          0
1            1   53  Female        Sales  Engineer                10            0.261161                    221                      1   79947          0
2            2   59  Female    Marketing   Analyst                 8            0.304382                    184                      0   46958          1
3            3   42  Female  Engineering   Manager                 1            0.480779                    242                      0   40662          0
4            4   44  Female        Sales  Engineer                10            0.636244                    229                      1   74307          0
Summary Information of the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Employee_ID            1000 non-null   int64
 1   Age                    1000 non-null   int64
 2   Gender                 1000 non-null   object
 3   Department             1000 non-null   object
 4   Job_Title              1000 non-null   object
 5   Years_at_Company       1000 non-null   int64
 6   Satisfaction_Level     1000 non-null   float64
 7   Average_Monthly_Hours  1000 non-null   int64
 8   Promotion_Last_5Years  1000 non-null   int64
 9   Salary                 1000 non-null   int64
 10  Attrition              1000 non-null   int64
dtypes: float64(1), int64(7), object(3)
memory usage: 86.1+ KB
Basic Statistics of the dataset:
       Employee_ID          Age  Years_at_Company  Satisfaction_Level  Average_Monthly_Hours  Promotion_Last_5Years        Salary    Attrition
count  1000.000000  1000.000000       1000.000000         1000.000000            1000.000000            1000.000000   1000.000000  1000.000000
mean    499.500000    42.205000          5.605000            0.505995             199.493000               0.486000  64624.980000     0.495000
std     288.819436    10.016452          2.822223            0.289797              29.631908               0.500054  20262.984333     0.500225
min       0.000000    25.000000          1.000000            0.001376             150.000000               0.000000  30099.000000     0.000000
25%     249.750000    33.000000          3.000000            0.258866             173.000000               0.000000  47613.500000     0.000000
50%     499.500000    43.000000          6.000000            0.505675             201.000000               0.000000  64525.000000     0.000000
75%     749.250000    51.000000          8.000000            0.761135             225.000000               1.000000  81921.000000     1.000000
max     999.000000    59.000000         10.000000            0.999979             249.000000               1.000000  99991.000000     1.000000
import pandas as pd
file_path = '/content/employee_attrition_data.csv'
employee_attrition_data = pd.read_csv(file_path)
# Check for missing values
missing_values = employee_attrition_data.isnull().sum()
print("Missing values in each column:")
print(missing_values)
# One-hot encode categorical variables
encoded_data = pd.get_dummies(employee_attrition_data,
columns=['Gender', 'Department', 'Job_Title'])
# Display the first few rows of the encoded dataset
print("First few rows of the encoded dataset:")
print(encoded_data.head())
Missing values in each column:
Employee_ID 0
Age 0
Gender 0
Department 0
Job_Title 0
Years_at_Company 0
Satisfaction_Level 0
Average_Monthly_Hours 0
Promotion_Last_5Years 0
Salary 0
Attrition 0
dtype: int64
First few rows of the encoded dataset:
   Employee_ID  Age  Years_at_Company  Satisfaction_Level  Average_Monthly_Hours  \
0            0   27                 9            0.586251                    151
1            1   53                10            0.261161                    221
2            2   59                 8            0.304382                    184
3            3   42                 1            0.480779                    242
4            4   44                10            0.636244                    229

   Promotion_Last_5Years  Salary  Attrition  Gender_Female  Gender_Male  \
0                      0   60132          0          False         True
1                      1   79947          0           True        False
2                      0   46958          1           True        False
3                      0   40662          0           True        False
4                      1   74307          0           True        False

   Department_Engineering  Department_Finance  Department_HR  Department_Marketing  \
0                   False               False          False                  True
1                   False               False          False                 False
2                   False               False          False                  True
3                    True               False          False                 False
4                   False               False          False                 False

   Department_Sales  Job_Title_Accountant  Job_Title_Analyst  Job_Title_Engineer  \
0             False                 False              False               False
1              True                 False              False                True
2             False                 False               True               False
3             False                 False              False               False
4              True                 False              False                True

   Job_Title_HR Specialist  Job_Title_Manager
0                    False               True
1                    False              False
2                    False              False
3                    False               True
4                    False              False
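# Aside: recent pandas versions emit boolean dummy columns, as the head()
# output above shows. A hedged alternative (not what the cell above ran) is to
# request integer dummies and drop one level per category, which avoids dtype
# surprises downstream and perfectly collinear columns.
encoded_data_alt = pd.get_dummies(employee_attrition_data,
                                  columns=['Gender', 'Department', 'Job_Title'],
                                  drop_first=True, dtype=int)
print(encoded_data_alt.dtypes)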
import matplotlib.pyplot as plt
import seaborn as sns
# Generate summary statistics for all variables
summary_statistics = encoded_data.describe()
print("Summary Statistics:")
print(summary_statistics)
# Histograms for numerical variables
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
sns.histplot(encoded_data['Age'], kde=True, ax=axes[0])
axes[0].set_title('Age Distribution')
sns.histplot(encoded_data['Satisfaction_Level'], kde=True, ax=axes[1])
axes[1].set_title('Satisfaction Level Distribution')
sns.histplot(encoded_data['Salary'], kde=True, ax=axes[2])
axes[2].set_title('Salary Distribution')
plt.show()
# Count plots for original categorical variables
fig, axes = plt.subplots(1, 2, figsize=(18, 5))
sns.countplot(data=employee_attrition_data, x='Department', ax=axes[0])
axes[0].set_title('Department Count')
sns.countplot(data=employee_attrition_data, x='Job_Title', ax=axes[1])
axes[1].set_title('Job Title Count')
plt.show()
# Generate a correlation matrix
correlation_matrix = encoded_data.corr()
# Plot the correlation matrix
plt.figure(figsize=(16, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm',
fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()
Summary Statistics:
       Employee_ID          Age  Years_at_Company  Satisfaction_Level  Average_Monthly_Hours  Promotion_Last_5Years        Salary    Attrition
count  1000.000000  1000.000000       1000.000000         1000.000000            1000.000000            1000.000000   1000.000000  1000.000000
mean    499.500000    42.205000          5.605000            0.505995             199.493000               0.486000  64624.980000     0.495000
std     288.819436    10.016452          2.822223            0.289797              29.631908               0.500054  20262.984333     0.500225
min       0.000000    25.000000          1.000000            0.001376             150.000000               0.000000  30099.000000     0.000000
25%     249.750000    33.000000          3.000000            0.258866             173.000000               0.000000  47613.500000     0.000000
50%     499.500000    43.000000          6.000000            0.505675             201.000000               0.000000  64525.000000     0.000000
75%     749.250000    51.000000          8.000000            0.761135             225.000000               1.000000  81921.000000     1.000000
max     999.000000    59.000000         10.000000            0.999979             249.000000               1.000000  99991.000000     1.000000
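# With roughly twenty encoded columns, the annotated heatmap above is dense;
# ranking features by absolute correlation with the target is often a more
# direct view. A small sketch reusing the correlation_matrix computed above:
attrition_corr = correlation_matrix['Attrition'].drop('Attrition')
print(attrition_corr.reindex(attrition_corr.abs().sort_values(ascending=False).index))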
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
# Select features for clustering, excluding the target 'Attrition' and the identifier 'Employee_ID'
features = encoded_data.drop(columns=['Employee_ID', 'Attrition'])
# Apply K-means clustering
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)  # n_init set explicitly to avoid sklearn's FutureWarning
encoded_data['Cluster'] = kmeans.fit_predict(features)
# Visualize the clusters
plt.figure(figsize=(12, 6))
sns.scatterplot(data=encoded_data, x='Satisfaction_Level',
y='Average_Monthly_Hours', hue='Cluster', palette='viridis')
plt.title('K-means Clustering of Employees')
plt.show()
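# K-means is distance-based, so the unscaled Salary column (tens of thousands)
# will dominate Satisfaction_Level (0 to 1) and the boolean dummies. A hedged
# variant, not what the cell above ran: standardize the features first and
# sweep k with the inertia (elbow) heuristic.
from sklearn.preprocessing import StandardScaler
scaled_features = StandardScaler().fit_transform(features)
for k in range(2, 7):
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(scaled_features)
    print(f"k={k}: inertia={km.inertia_:.1f}")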
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
# Select features and target
X = encoded_data.drop(columns=['Employee_ID', 'Attrition', 'Cluster'])
y = encoded_data['Attrition']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2, random_state=42)
# Apply logistic regression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
# Predict on the test set
y_pred = logreg.predict(X_test)
# Evaluate the model
classification_report_logreg = classification_report(y_test, y_pred)
confusion_matrix_logreg = confusion_matrix(y_test, y_pred)
print("Classification Report:")
print(classification_report_logreg)
print("\nConfusion Matrix:")
print(confusion_matrix_logreg)
Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.59      0.55       102
           1       0.49      0.41      0.44        98

    accuracy                           0.50       200
   macro avg       0.50      0.50      0.49       200
weighted avg       0.50      0.50      0.50       200

Confusion Matrix:
[[60 42]
 [58 40]]
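# The report above is essentially chance level (accuracy 0.50 on a roughly
# balanced target), which points to weak linear signal in these features rather
# than a coding error. A hedged follow-up sketch: scale inside a pipeline and
# score with 5-fold cross-validation for a less noisy estimate.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
scores = cross_val_score(pipe, X, y, cv=5, scoring='accuracy')
print(f"CV accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")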