week 1
2025-08-12
library(tidyverse)
library(magrittr) #pipping operator %>%
library(tinytex) #knittopdf
library(nycflights13) #datasetsforflightsandplanes
library(readxl) #readexcelfiles
library(Hmisc) #impute()
library(skimr) #skim()
library(kableExtra)
df <- read.csv("superstore.csv")
head(df)
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 1 1 CA-2017-114412 15-04-17 20-04-17 Standard Class AA-10480
## 2 2 US-2017-156909 16-07-17 18-07-17 Second Class SF-20065
## 3 3 CA-2017-107727 19-10-17 23-10-17 Second Class MA-17560
## 4 4 CA-2017-120999 10-09-17 15-09-17 Standard Class LC-16930
## 5 5 CA-2017-139619 19-09-17 23-09-17 Standard Class ES-14080
## 6 6 CA-2017-114440 14-09-17 17-09-17 Second Class TB-21520
## Customer.Name State Postal.Code Region Product.ID
## 1 Andrew Allen North Carolina 28027 South OFF-PA-10002365
## 2 Sandra Flanagan Pennsylvania 19140 East FUR-CH-10002774
## 3 Matt Abelman Texas 77095 Central OFF-PA-10000249
## 4 Linda Cazamias Illinois 60540 Central TEC-PH-10004093
## 5 Erin Smith Florida 32935 South OFF-ST-10003282
## 6 Tracy Blumstein Michigan 49201 Central OFF-PA-10004675
## Category Sub.Category
## 1 Office Supplies Paper
## 2 Furniture Chairs
## 3 Office Supplies Paper
## 4 Technology Phones
## 5 Office Supplies Storage
## 6 Office Supplies Paper
## Product.Name
## 1 Xerox 1967
## 2 Global Deluxe Stacking Chair, Gray
## 3 Easy-staple paper
## 4 Panasonic Kx-TS550
## 5 Advantus 10-Drawer Portable Organizer, Chrome Metal Frame, Smoke Drawers
## 6 Telephone Message Books with Fax/Mobile Section, 5 1/2" x 3 3/16"
## Sales Quantity Discount Profit
## 1 15.552 3 0.2 5.4432
## 2 71.372 2 0.3 -1.0196
## 3 29.472 3 0.2 9.9468
1
## 4 147.168 4 0.2 16.5564
## 5 95.616 2 0.2 9.5616
## 6 19.050 3 0.0 8.7630
df.subset <- df %>% select(Category, Region, Sales, Quantity, Profit) #subset columns
head(df.subset)
## Category Region Sales Quantity Profit
## 1 Office Supplies South 15.552 3 5.4432
## 2 Furniture East 71.372 2 -1.0196
## 3 Office Supplies Central 29.472 3 9.9468
## 4 Technology Central 147.168 4 16.5564
## 5 Office Supplies South 95.616 2 9.5616
## 6 Office Supplies Central 19.050 3 8.7630
df.central <- df %>% filter(Region == "Central") #subset rows
str(df.central)
## ’data.frame’: 761 obs. of 18 variables:
## $ Row.ID : int 3 4 6 7 8 9 10 12 13 29 ...
## $ Order.ID : chr "CA-2017-107727" "CA-2017-120999" "CA-2017-114440" "US-2017-118038" ...
## $ Order.Date : chr "19-10-17" "10-09-17" "14-09-17" "09-12-17" ...
## $ Ship.Date : chr "23-10-17" "15-09-17" "17-09-17" "11-12-17" ...
## $ Ship.Mode : chr "Second Class" "Standard Class" "Second Class" "First Class" ...
## $ Customer.ID : chr "MA-17560" "LC-16930" "TB-21520" "KB-16600" ...
## $ Customer.Name: chr "Matt Abelman" "Linda Cazamias" "Tracy Blumstein" "Ken Brennan" ...
## $ State : chr "Texas" "Illinois" "Michigan" "Texas" ...
## $ Postal.Code : int 77095 60540 49201 77041 77041 77041 60623 55901 55901 64055 ...
## $ Region : chr "Central" "Central" "Central" "Central" ...
## $ Product.ID : chr "OFF-PA-10000249" "TEC-PH-10004093" "OFF-PA-10004675" "OFF-BI-10004182" ...
## $ Category : chr "Office Supplies" "Technology" "Office Supplies" "Office Supplies" ...
## $ Sub.Category : chr "Paper" "Phones" "Paper" "Binders" ...
## $ Product.Name : chr "Easy-staple paper" "Panasonic Kx-TS550" "Telephone Message Books with Fax/Mob
## $ Sales : num 29.47 147.17 19.05 1.25 9.71 ...
## $ Quantity : int 3 4 3 3 3 3 3 1 2 3 ...
## $ Discount : num 0.2 0.2 0 0.8 0.6 0.2 0.2 0 0 0 ...
## $ Profit : num 9.95 16.56 8.76 -1.93 -5.82 ...
df.cost <- df %>% mutate(Cost = Sales - Profit)
head(df.cost)
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 1 1 CA-2017-114412 15-04-17 20-04-17 Standard Class AA-10480
## 2 2 US-2017-156909 16-07-17 18-07-17 Second Class SF-20065
## 3 3 CA-2017-107727 19-10-17 23-10-17 Second Class MA-17560
## 4 4 CA-2017-120999 10-09-17 15-09-17 Standard Class LC-16930
## 5 5 CA-2017-139619 19-09-17 23-09-17 Standard Class ES-14080
## 6 6 CA-2017-114440 14-09-17 17-09-17 Second Class TB-21520
## Customer.Name State Postal.Code Region Product.ID
## 1 Andrew Allen North Carolina 28027 South OFF-PA-10002365
## 2 Sandra Flanagan Pennsylvania 19140 East FUR-CH-10002774
## 3 Matt Abelman Texas 77095 Central OFF-PA-10000249
2
## 4 Linda Cazamias Illinois 60540 Central TEC-PH-10004093
## 5 Erin Smith Florida 32935 South OFF-ST-10003282
## 6 Tracy Blumstein Michigan 49201 Central OFF-PA-10004675
## Category Sub.Category
## 1 Office Supplies Paper
## 2 Furniture Chairs
## 3 Office Supplies Paper
## 4 Technology Phones
## 5 Office Supplies Storage
## 6 Office Supplies Paper
## Product.Name
## 1 Xerox 1967
## 2 Global Deluxe Stacking Chair, Gray
## 3 Easy-staple paper
## 4 Panasonic Kx-TS550
## 5 Advantus 10-Drawer Portable Organizer, Chrome Metal Frame, Smoke Drawers
## 6 Telephone Message Books with Fax/Mobile Section, 5 1/2" x 3 3/16"
## Sales Quantity Discount Profit Cost
## 1 15.552 3 0.2 5.4432 10.1088
## 2 71.372 2 0.3 -1.0196 72.3916
## 3 29.472 3 0.2 9.9468 19.5252
## 4 147.168 4 0.2 16.5564 130.6116
## 5 95.616 2 0.2 9.5616 86.0544
## 6 19.050 3 0.0 8.7630 10.2870
df.summarise <- df %>% group_by(Ship.Mode) %>% summarise(AverageProfit = mean(Profit))
head(df.summarise)
## # A tibble: 4 x 2
## Ship.Mode AverageProfit
## <chr> <dbl>
## 1 First Class 35.7
## 2 Same Day 18.1
## 3 Second Class 30.5
## 4 Standard Class 26.3
Quiz 1
df1 <- read.csv("hr.csv")
head(df1)
## name empid maritalstatusid performancescore salary
## 1 Adinolfi, Wilson K 10026 0 4 62506
## 2 Ait Sidi, Karthikeyan 10084 1 3 104437
## 3 Akinkuolie, Sarah 10196 1 3 64955
## 4 Alagbe,Trina 10088 1 3 64991
## 5 Anderson, Carol 10069 2 3 50825
## 6 Anderson, Linda 10002 0 4 57568
## position dob gender dateofhire dateoftermination
## 1 Production Technician I 07-10-83 M 07-05-11
## 2 Sr. DBA 05-05-75 M 3/30/2015 6/16/2016
## 3 Production Technician II 09/19/88 F 07-05-11 9/24/2012
## 4 Production Technician I 09/27/88 F 01-07-08
3
## 5 Production Technician I 09-08-89 F 07-11-11 09-06-16
## 6 Production Technician I 05/22/77 F 01-09-12
## terminationreason employmentstatus department managername
## 1 N/A-StillEmployed Active Production Michael Albert
## 2 career change Voluntarily Terminated IT/IS Simon Roup
## 3 hours Voluntarily Terminated Production Kissy Sullivan
## 4 N/A-StillEmployed Active Production Elijiah Gray
## 5 return to school Voluntarily Terminated Production Webster Butler
## 6 N/A-StillEmployed Active Production Amy Dunn
## jobsatisfactionscore absences
## 1 4.60 1
## 2 4.96 17
## 3 3.02 3
## 4 4.84 15
## 5 5.00 2
## 6 5.00 15
terminated <- df1 %>% filter(employmentstatus == "Terminated for Cause") %>% summarise(mean_salary = mea
terminated
## mean_salary
## 1 75264.88