[go: up one dir, main page]

0% found this document useful (0 votes)
15 views5 pages

Sample Discovery

asdfghjkl;'
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
15 views5 pages

Sample Discovery

asdfghjkl;'
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 5

Sample_Discovery

August 6, 2024

[1]: import pandas as pd

# Load the dataset


df = pd.read_csv('messy_employee_data.csv')

# Descriptive Statistics
descriptive_stats = df.describe()
print(descriptive_stats)

Employee_ID Phone_Number Total_Work_Hour_per_Month Salary_per_Month


count 100.000000 9.000000e+01 79.000000 74.000000
mean 5874.450000 5.014917e+09 2631.392405 24305.405405
std 2698.235527 2.946591e+09 4317.578213 37333.773655
min 1139.000000 2.058916e+08 -10.000000 -5000.000000
25% 3639.250000 2.417400e+09 160.000000 8000.000000
50% 6168.000000 4.735941e+09 180.000000 10800.000000
75% 8183.250000 7.711621e+09 5099.500000 12600.000000
max 9957.000000 9.871025e+09 9999.000000 100000.000000

[2]: # Completeness Metric


completeness = df.notnull().mean() * 100
print("Completeness Metric:\n", completeness)

Completeness Metric:
Employee_Name 90.0
Employee_ID 100.0
Job_Role 91.0
Phone_Number 90.0
Email_ID 90.0
Total_Work_Hour_per_Month 79.0
Salary_per_Month 74.0
dtype: float64

[3]: # Accuracy Metric for Salary_per_Month (ensure non-negative values)


accuracy_salary = (df['Salary_per_Month'] >= 0).mean() * 100
print(f"Accuracy for Salary per Month: {accuracy_salary}%")

Accuracy for Salary per Month: 59.0%

1
[4]: # Consistency Metric for Job_Role
valid_job_roles = ['Engineer', 'Data Scientist', 'Manager', 'Analyst',␣
,→'Developer']

consistency_job_role = df['Job_Role'].isin(valid_job_roles).mean() * 100


print(f"Consistency for Job Role: {consistency_job_role}%")

Consistency for Job Role: 91.0%

[5]: # Correlation between Salary and Total Work Hours


correlation = df[['Total_Work_Hour_per_Month', 'Salary_per_Month']].corr()
print("Correlation:\n", correlation)

Correlation:
Total_Work_Hour_per_Month Salary_per_Month
Total_Work_Hour_per_Month 1.000000 0.600796
Salary_per_Month 0.600796 1.000000

[6]: import seaborn as sns


import matplotlib.pyplot as plt

# Plot the heatmap


plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix Heatmap')
plt.show()

2
[7]: # Missing Data Analysis
missing_values_count = df.isnull().sum()
#missing_data_pattern = df.isnull()
print("Missing Values Count:\n", missing_values_count)
#print("Missing Data Pattern:\n", missing_data_pattern)

Missing Values Count:


Employee_Name 10
Employee_ID 0
Job_Role 9
Phone_Number 10
Email_ID 10
Total_Work_Hour_per_Month 21
Salary_per_Month 26
dtype: int64

3
[14]: # Fill missing values
df['Employee_Name'].fillna('Unknown', inplace=True)
df['Employee_ID'].fillna('Unknown', inplace=True)
df['Phone_Number'].replace('unknown', None, inplace=True)
df['Phone_Number'].fillna('Unknown', inplace=True)
df['Total_Work_Hour_per_Month'].fillna(df['Total_Work_Hour_per_Month'].mean(),␣
,→inplace=True)

df['Salary_per_Month'].fillna(df['Salary_per_Month'].mean(), inplace=True)

# Correct data types


df['Employee_ID'] = df['Employee_ID'].astype(str)
df['Total_Work_Hour_per_Month'] = df['Total_Work_Hour_per_Month'].astype(float)
df['Salary_per_Month'] = df['Salary_per_Month'].astype(float)
#df['Phone_Number']= df['Phone_Number'].astype(str)
# Normalize Email IDs
df['Email_ID'] = df['Email_ID'].apply(lambda x: x if pd.isna(x) or '@' in x else␣
,→x + '@example.com')

# Ensure consistent job role naming


df['Job_Role'] = df['Job_Role'].str.title()

# Format phone numbers (dummy formatting for demonstration)


df['Phone_Number'] = df['Phone_Number'].apply(lambda x: x if pd.isna(x) or x ==␣
,→'Unknown' else x.replace('-', ''))

df['Phone_Number'] = df['Phone_Number'].astype(str)
# Ensure Employee Names are title case
df['Employee_Name'] = df['Employee_Name'].str.title()

# Remove duplicate rows (if any)


df.drop_duplicates(inplace=True)

print("Structured and Formatted Dataset:\n", df)

Structured and Formatted Dataset:


Employee_Name Employee_ID Job_Role Phone_Number Email_ID
\
0 Ospjpqptpe 9516 Manager 4733377351.0 ohvoj@sample.org
1 Arwarmgzmo 8444 Manager Unknown zsphp@example.com
2 Qwxbncqkag 2420 Manager Unknown tqhfs@example.com
3 Akmmthjndy 3445 Developer 4901793467.0 ljnkm@sample.org
4 Croxaopkbi 9378 Manager 8679802795.0 aqrpm@test.net
.. ... ... ... ... ...
95 Nsgdvhcolz 3727 Analyst 3957223288.0 hgnyu@sample.org
96 Unknown 3857 Data Scientist 7050739609.0 sttlw@sample.org
97 Vbowqqbmye 2467 Data Scientist 8921949055.0 vdszq@example.com
98 Kukowpctzv 3553 Developer 9197584574.0 NaN

4
99 Pvvicpbxnk 9638 Analyst 3176133724.0 ghjrh@example.com

Total_Work_Hour_per_Month Salary_per_Month
0 -10.000000 24305.405405
1 9999.000000 -5000.000000
2 180.000000 10800.000000
3 200.000000 12000.000000
4 180.000000 10800.000000
.. ... ...
95 200.000000 10000.000000
96 160.000000 11200.000000
97 160.000000 8000.000000
98 2631.392405 12000.000000
99 180.000000 10800.000000

[100 rows x 7 columns]

[17]: df['Employee_Name'].dtype

[17]: dtype('O')

[ ]:

You might also like