8/15/23, 11:02 PM Walmart.
ipynb - Colaboratory
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Source CSV for the Walmart purchase data analysed in this notebook.
DATA_URL = "https://d2beiqkhq929f0.cloudfront.net/public_assets/assets/000/001/293/original/walmart_data.csv?1641285094"

# Read the CSV straight from the URL into a DataFrame.
data = pd.read_csv(DATA_URL)

# Preview the first five rows (head() defaults to n=5).
data.head()
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Produc
0-
0 1000001 P00069042 F 10 A 2 0
17
0-
1 1000001 P00248942 F 10 A 2 0
17
0-
2 1000001 P00087842 F 10 A 2 0
17
0-
3 1000001 P00085442 F 10 A 2 0
# Show the column labels of the loaded DataFrame.
data.columns
Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category',
'Purchase'],
dtype='object')
# Show the DataFrame dimensions as (rows, columns).
data.shape
(550068, 10)
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()
User_ID Occupation Marital_Status Product_Category Purchase
count 5.500680e+05 550068.000000 550068.000000 550068.000000 550068.000000
mean 1.003029e+06 8.076707 0.409653 5.404270 9263.968713
std 1.727592e+03 6.522660 0.491770 3.936211 5023.065394
min 1.000001e+06 0.000000 0.000000 1.000000 12.000000
25% 1.001516e+06 2.000000 0.000000 1.000000 5823.000000
50% 1.003077e+06 7.000000 0.000000 5.000000 8047.000000
75% 1.004478e+06 14.000000 1.000000 8.000000 12054.000000
max 1.006040e+06 20.000000 1.000000 20.000000 23961.000000
# Compare the distribution of purchase amounts between genders.
sns.boxplot(data=data, x="Gender", y="Purchase")
https://colab.research.google.com/drive/18DgebZPGy-tMWk9OnkIC91-iKYDk6ymX#scrollTo=d6vbi-rQOC15&printMode=true 1/6
8/15/23, 11:02 PM Walmart.ipynb - Colaboratory
<Axes: xlabel='Gender', ylabel='Purchase'>
# Count how many rows fall under each Gender value.
data["Gender"].value_counts()
M 414259
F 135809
Name: Gender, dtype: int64
# Count how many rows fall under each Marital_Status value.
data["Marital_Status"].value_counts()
0 324731
1 225337
Name: Marital_Status, dtype: int64
# Cross-tabulate Gender against Purchase, with row/column totals ("All").
# NOTE(review): Purchase is used as raw values, so the table gets one column
# per distinct purchase amount; binning it first (e.g. with pd.cut) would
# give a far more readable comparison.
pd.crosstab(data["Gender"],data["Purchase"],margins=True)
Purchase 12 13 14 24 25 26 36 37 38 48 ... 23952 23953 23954 23955 23956 23958 23959 239
Gender
F 27 25 30 28 30 27 36 31 34 33 ... 0 0 0 1 0 0 1
M 74 81 65 90 83 85 71 79 80 75 ... 1 2 2 2 1 4 1
All 101 106 95 118 113 112 107 110 114 108 ... 1 2 2 3 1 4 2
3 rows × 18106 columns
## Women are not spending more than men
## Univariate Analysis
# Histogram of purchase amounts with a KDE overlay, on an explicit 10x6 figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=data, x='Purchase', kde=True, ax=ax)
plt.show()

# Horizontal boxplot of the same column to expose spread and outliers.
sns.boxplot(data=data, x='Purchase', orient='h')
plt.show()
https://colab.research.google.com/drive/18DgebZPGy-tMWk9OnkIC91-iKYDk6ymX#scrollTo=d6vbi-rQOC15&printMode=true 2/6
8/15/23, 11:02 PM Walmart.ipynb - Colaboratory
# Total purchase amount per (User_ID, Gender) pair, flattened back into
# ordinary columns so later cells can filter on 'Gender'.
amt_df = (
    data.groupby(['User_ID', 'Gender'])[['Purchase']]
    .sum()
    .reset_index()
)
amt_df
User_ID Gender Purchase
0 1000001 F 334093
1 1000002 M 810472
2 1000003 M 341635
3 1000004 M 206468
4 1000005 M 821001
... ... ... ...
5886 1006036 F 4116058
5887 1006037 F 1119538
5888 1006038 F 90034
5889 1006039 F 590319
5890 1006040 M 1653299
5891 rows × 3 columns
# One histogram of per-customer total spend for each gender, male first.
for gender in ('M', 'F'):
    amt_df.loc[amt_df['Gender'] == gender, 'Purchase'].hist(bins=35)
    plt.show()
https://colab.research.google.com/drive/18DgebZPGy-tMWk9OnkIC91-iKYDk6ymX#scrollTo=d6vbi-rQOC15&printMode=true 3/6
8/15/23, 11:02 PM Walmart.ipynb - Colaboratory
# Mean of the per-customer total spend, computed separately for each gender.
is_male = amt_df['Gender'] == 'M'
is_female = amt_df['Gender'] == 'F'

male_avg = amt_df.loc[is_male, 'Purchase'].mean()
female_avg = amt_df.loc[is_female, 'Purchase'].mean()

print(f"Average amount spend by Male customers: {male_avg:.2f}")
print(f"Average amount spend by Female customers: {female_avg:.2f}")
Average amount spend by Male customers: 925344.40
Average amount spend by Female customers: 712024.39
# Bootstrap the sampling distribution of the mean per-customer spend for each
# gender: repeatedly draw customers with replacement and record the sample mean.
male_df = amt_df[amt_df['Gender']=='M']
female_df = amt_df[amt_df['Gender']=='F']

genders = ["M", "F"]

male_sample_size = 3000
female_sample_size = 1500
num_repitions = 1000
male_means = []
female_means = []

# Only the 'Purchase' column is needed for the resampled means, so select it
# once instead of resampling whole DataFrame rows on every repetition.
male_purchases = male_df['Purchase']
female_purchases = female_df['Purchase']

for _ in range(num_repitions):
    male_mean = male_purchases.sample(male_sample_size, replace=True).mean()
    female_mean = female_purchases.sample(female_sample_size, replace=True).mean()

    male_means.append(male_mean)
    female_means.append(female_mean)

# Side-by-side histograms of the bootstrapped means.
fig, axis = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))

axis[0].hist(male_means, bins=35)
axis[1].hist(female_means, bins=35)
# Titles are built from the sample-size variables so they cannot drift from
# the values actually used above.
axis[0].set_title("Male - Distribution of means, Sample size: {}".format(male_sample_size))
axis[1].set_title("Female - Distribution of means, Sample size: {}".format(female_sample_size))

plt.show()

# Mean of the bootstrapped means, followed by the mean and standard deviation
# of the full per-gender data for comparison.
print("Population mean - Mean of sample means of amount spend for Male: {:.2f}".format(np.mean(male_means)))
print("Population mean - Mean of sample means of amount spend for Female: {:.2f}".format(np.mean(female_means)))

print("\nMale - Sample mean: {:.2f} Sample std: {:.2f}".format(male_df['Purchase'].mean(), male_df['Purchase'].std()))
print("Female - Sample mean: {:.2f} Sample std: {:.2f}".format(female_df['Purchase'].mean(), female_df['Purchase'].std()))
Population mean - Mean of sample means of amount spend for Male: 925203.96
Population mean - Mean of sample means of amount spend for Female: 712059.99
Male - Sample mean: 925344.40 Sample std: 985830.10
Female - Sample mean: 712024.39 Sample std: 807370.73
https://colab.research.google.com/drive/18DgebZPGy-tMWk9OnkIC91-iKYDk6ymX#scrollTo=d6vbi-rQOC15&printMode=true 4/6
8/15/23, 11:02 PM Walmart.ipynb - Colaboratory
# Normal-approximation confidence interval for each gender's mean spend:
# mean +/- 1.96 * std / sqrt(n), where 1.96 is the z-value of a 95% two-sided interval.
male_sample_mean = male_df['Purchase'].mean()
male_margin_of_error_clt = 1.96 * male_df['Purchase'].std() / np.sqrt(len(male_df))
male_lower_lim, male_upper_lim = (
    male_sample_mean - male_margin_of_error_clt,
    male_sample_mean + male_margin_of_error_clt,
)

female_sample_mean = female_df['Purchase'].mean()
female_margin_of_error_clt = 1.96 * female_df['Purchase'].std() / np.sqrt(len(female_df))
female_lower_lim, female_upper_lim = (
    female_sample_mean - female_margin_of_error_clt,
    female_sample_mean + female_margin_of_error_clt,
)

print(f"Male confidence interval of means: ({male_lower_lim:.2f}, {male_upper_lim:.2f})")
print(f"Female confidence interval of means: ({female_lower_lim:.2f}, {female_upper_lim:.2f})")
Male confidence interval of means: (895617.83, 955070.97)
Female confidence interval of means: (673254.77, 750794.02)
# Total purchase amount per (User_ID, Age) pair, flattened back into ordinary
# columns. This rebinds amt_df, replacing the earlier per-gender table.
amt_df = (
    data.groupby(['User_ID', 'Age'])[['Purchase']]
    .sum()
    .reset_index()
)
amt_df
User_ID Age Purchase
0 1000001 0-17 334093
1 1000002 55+ 810472
2 1000003 26-35 341635
3 1000004 46-50 206468
4 1000005 26-35 821001
... ... ... ...
5886 1006036 26-35 4116058
5887 1006037 46-50 1119538
5888 1006038 55+ 90034
5889 1006039 46-50 590319
5890 1006040 26-35 1653299
5891 rows × 3 columns
# Count how many rows of the per-user table fall in each Age bracket.
amt_df['Age'].value_counts()
26-35 2053
36-45 1167
18-25 1069
46-50 531
51-55 481
55+ 372
0-17 218
Name: Age, dtype: int64
# Bootstrap the mean per-customer spend for every age bracket, then print a
# normal-approximation confidence interval for each bracket's mean.
sample_size = 200
num_repitions = 1000

age_intervals = ['26-35', '36-45', '18-25', '46-50', '51-55', '55+', '0-17']

# One list of bootstrapped means per age bracket.
all_means = {age_interval: [] for age_interval in age_intervals}

for age_interval in age_intervals:
    # Filter once per bracket; the selection does not change between repetitions.
    age_purchases = amt_df.loc[amt_df['Age'] == age_interval, 'Purchase']
    for _ in range(num_repitions):
        mean = age_purchases.sample(sample_size, replace=True).mean()
        all_means[age_interval].append(mean)

for val in age_intervals:
    new_df = amt_df[amt_df['Age']==val]

    # mean +/- 1.96 * std / sqrt(n); 1.96 is the z-value of a 95% two-sided interval.
    margin_of_error_clt = 1.96*new_df['Purchase'].std()/np.sqrt(len(new_df))
    sample_mean = new_df['Purchase'].mean()
    lower_lim = sample_mean - margin_of_error_clt
    upper_lim = sample_mean + margin_of_error_clt

    print("For age {} --> confidence interval of means: ({:.2f}, {:.2f})".format(val, lower_lim, upper_lim))
https://colab.research.google.com/drive/18DgebZPGy-tMWk9OnkIC91-iKYDk6ymX#scrollTo=d6vbi-rQOC15&printMode=true 5/6
8/15/23, 11:02 PM Walmart.ipynb - Colaboratory
For age 26-35 --> confidence interval of means: (945034.42, 1034284.21)
For age 36-45 --> confidence interval of means: (823347.80, 935983.62)
For age 18-25 --> confidence interval of means: (801632.78, 908093.46)
For age 46-50 --> confidence interval of means: (713505.63, 871591.93)
For age 51-55 --> confidence interval of means: (692392.43, 834009.42)
For age 55+ --> confidence interval of means: (476948.26, 602446.23)
For age 0-17 --> confidence interval of means: (527662.46, 710073.17)
check 0s completed at 11:00 PM
https://colab.research.google.com/drive/18DgebZPGy-tMWk9OnkIC91-iKYDk6ymX#scrollTo=d6vbi-rQOC15&printMode=true 6/6