Sample_Discovery
August 6, 2024
[1]: import pandas as pd
# Load the dataset
df = pd.read_csv('messy_employee_data.csv')
# Descriptive Statistics
descriptive_stats = df.describe()
print(descriptive_stats)
Employee_ID Phone_Number Total_Work_Hour_per_Month Salary_per_Month
count 100.000000 9.000000e+01 79.000000 74.000000
mean 5874.450000 5.014917e+09 2631.392405 24305.405405
std 2698.235527 2.946591e+09 4317.578213 37333.773655
min 1139.000000 2.058916e+08 -10.000000 -5000.000000
25% 3639.250000 2.417400e+09 160.000000 8000.000000
50% 6168.000000 4.735941e+09 180.000000 10800.000000
75% 8183.250000 7.711621e+09 5099.500000 12600.000000
max 9957.000000 9.871025e+09 9999.000000 100000.000000
[2]: # Completeness Metric
completeness = df.notnull().mean() * 100
print("Completeness Metric:\n", completeness)
Completeness Metric:
Employee_Name 90.0
Employee_ID 100.0
Job_Role 91.0
Phone_Number 90.0
Email_ID 90.0
Total_Work_Hour_per_Month 79.0
Salary_per_Month 74.0
dtype: float64
[3]: # Accuracy Metric for Salary_per_Month (ensure non-negative values)
accuracy_salary = (df['Salary_per_Month'] >= 0).mean() * 100
print(f"Accuracy for Salary per Month: {accuracy_salary}%")
Accuracy for Salary per Month: 59.0%
1
[4]: # Consistency Metric for Job_Role
valid_job_roles = ['Engineer', 'Data Scientist', 'Manager', 'Analyst',␣
,→'Developer']
consistency_job_role = df['Job_Role'].isin(valid_job_roles).mean() * 100
print(f"Consistency for Job Role: {consistency_job_role}%")
Consistency for Job Role: 91.0%
[5]: # Correlation between Salary and Total Work Hours
correlation = df[['Total_Work_Hour_per_Month', 'Salary_per_Month']].corr()
print("Correlation:\n", correlation)
Correlation:
Total_Work_Hour_per_Month Salary_per_Month
Total_Work_Hour_per_Month 1.000000 0.600796
Salary_per_Month 0.600796 1.000000
[6]: import seaborn as sns
import matplotlib.pyplot as plt
# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix Heatmap')
plt.show()
2
[7]: # Missing Data Analysis
missing_values_count = df.isnull().sum()
#missing_data_pattern = df.isnull()
print("Missing Values Count:\n", missing_values_count)
#print("Missing Data Pattern:\n", missing_data_pattern)
Missing Values Count:
Employee_Name 10
Employee_ID 0
Job_Role 9
Phone_Number 10
Email_ID 10
Total_Work_Hour_per_Month 21
Salary_per_Month 26
dtype: int64
3
[14]: # Fill missing values
df['Employee_Name'].fillna('Unknown', inplace=True)
df['Employee_ID'].fillna('Unknown', inplace=True)
df['Phone_Number'].replace('unknown', None, inplace=True)
df['Phone_Number'].fillna('Unknown', inplace=True)
df['Total_Work_Hour_per_Month'].fillna(df['Total_Work_Hour_per_Month'].mean(),␣
,→inplace=True)
df['Salary_per_Month'].fillna(df['Salary_per_Month'].mean(), inplace=True)
# Correct data types
df['Employee_ID'] = df['Employee_ID'].astype(str)
df['Total_Work_Hour_per_Month'] = df['Total_Work_Hour_per_Month'].astype(float)
df['Salary_per_Month'] = df['Salary_per_Month'].astype(float)
#df['Phone_Number']= df['Phone_Number'].astype(str)
# Normalize Email IDs
df['Email_ID'] = df['Email_ID'].apply(lambda x: x if pd.isna(x) or '@' in x else␣
,→x + '@example.com')
# Ensure consistent job role naming
df['Job_Role'] = df['Job_Role'].str.title()
# Format phone numbers (dummy formatting for demonstration)
df['Phone_Number'] = df['Phone_Number'].apply(lambda x: x if pd.isna(x) or x ==␣
,→'Unknown' else x.replace('-', ''))
df['Phone_Number'] = df['Phone_Number'].astype(str)
# Ensure Employee Names are title case
df['Employee_Name'] = df['Employee_Name'].str.title()
# Remove duplicate rows (if any)
df.drop_duplicates(inplace=True)
print("Structured and Formatted Dataset:\n", df)
Structured and Formatted Dataset:
Employee_Name Employee_ID Job_Role Phone_Number Email_ID
\
0 Ospjpqptpe 9516 Manager 4733377351.0 ohvoj@sample.org
1 Arwarmgzmo 8444 Manager Unknown zsphp@example.com
2 Qwxbncqkag 2420 Manager Unknown tqhfs@example.com
3 Akmmthjndy 3445 Developer 4901793467.0 ljnkm@sample.org
4 Croxaopkbi 9378 Manager 8679802795.0 aqrpm@test.net
.. ... ... ... ... ...
95 Nsgdvhcolz 3727 Analyst 3957223288.0 hgnyu@sample.org
96 Unknown 3857 Data Scientist 7050739609.0 sttlw@sample.org
97 Vbowqqbmye 2467 Data Scientist 8921949055.0 vdszq@example.com
98 Kukowpctzv 3553 Developer 9197584574.0 NaN
4
99 Pvvicpbxnk 9638 Analyst 3176133724.0 ghjrh@example.com
Total_Work_Hour_per_Month Salary_per_Month
0 -10.000000 24305.405405
1 9999.000000 -5000.000000
2 180.000000 10800.000000
3 200.000000 12000.000000
4 180.000000 10800.000000
.. ... ...
95 200.000000 10000.000000
96 160.000000 11200.000000
97 160.000000 8000.000000
98 2631.392405 12000.000000
99 180.000000 10800.000000
[100 rows x 7 columns]
[17]: df['Employee_Name'].dtype
[17]: dtype('O')
[ ]: