[go: up one dir, main page]

0% found this document useful (0 votes)
106 views24 pages

Output

The document loads data on bike rentals from a CSV file and performs exploratory data analysis. It cleans the data by recoding variables, calculates summary statistics, and creates visualizations to analyze trends in bike rentals by season, weather conditions, and other factors. Key findings include that total rentals are highest in summer and lowest in spring, and that warmer temperatures and dry weather conditions correspond to more bike rentals on average.

Uploaded by

Karan Khanna
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
106 views24 pages

Output

The document loads data on bike rentals from a CSV file and performs exploratory data analysis. It cleans the data by recoding variables, calculates summary statistics, and creates visualizations to analyze trends in bike rentals by season, weather conditions, and other factors. Key findings include that total rentals are highest in summer and lowest in spring, and that warmer temperatures and dry weather conditions correspond to more bike rentals on average.

Uploaded by

Karan Khanna
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 24

> setwd("G:\\Management-IMT\\Analytics\\code")

>
> #loading the required libraries
> library(rpart)
> library(rattle)
> library(rpart.plot)
> library(beanplot)
> library(dplyr)
> library(RColorBrewer)
> library(randomForest)
> library(magrittr)
>
> # reading the data files
> bike=read.csv("Bike Rental Data.csv", header = TRUE)
>
> # Get the feel about the data
> str(bike)
'data.frame': 17379 obs. of 17 variables:
$ instant : int 1 2 3 4 5 6 7 8 9 10 ...
$ dteday : Factor w/ 731 levels "2011-01-01","2011-01-02",..: 1 1 1 1
1 1 1 1 1 1 ...
$ season : int 1 1 1 1 1 1 1 1 1 1 ...
$ yr : int 0 0 0 0 0 0 0 0 0 0 ...
$ mnth : int 1 1 1 1 1 1 1 1 1 1 ...
$ hr : int 0 1 2 3 4 5 6 7 8 9 ...
$ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
$ weekday : int 6 6 6 6 6 6 6 6 6 6 ...
$ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
$ weathersit: int 1 1 1 1 1 2 1 1 1 1 ...
$ temp : num 0.24 0.22 0.22 0.24 0.24 0.24 0.22 0.2 0.24 0.32 ...
$ atemp : num 0.288 0.273 0.273 0.288 0.288 ...
$ hum : num 0.81 0.8 0.8 0.75 0.75 0.75 0.8 0.86 0.75 0.76 ...
$ windspeed : num 0 0 0 0 0 0.0896 0 0 0 0 ...
$ casual : int 3 8 5 3 0 0 2 1 1 8 ...
$ registered: int 13 32 27 10 1 1 0 2 7 6 ...
$ cnt : int 16 40 32 13 1 1 2 3 8 14 ...
> summary(bike)
instant dteday season yr
mnth
Min. : 1 2011-01-01: 24 Min. :1.000 Min. :0.0000 Min.
: 1.000
1st Qu.: 4346 2011-01-08: 24 1st Qu.:2.000 1st Qu.:0.0000 1st
Qu.: 4.000
Median : 8690 2011-01-09: 24 Median :3.000 Median :1.0000
Median : 7.000
Mean : 8690 2011-01-10: 24 Mean :2.502 Mean :0.5026 Mean
: 6.538
3rd Qu.:13034 2011-01-13: 24 3rd Qu.:3.000 3rd Qu.:1.0000 3rd
Qu.:10.000
Max. :17379 2011-01-15: 24 Max. :4.000 Max. :1.0000 Max.
:12.000
(Other) :17235
hr holiday weekday workingday
weathersit
Min. : 0.00 Min. :0.00000 Min. :0.000 Min. :0.0000 Min.
:1.000
1st Qu.: 6.00 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st
Qu.:1.000
Median :12.00 Median :0.00000 Median :3.000 Median :1.0000 Median
:1.000
Mean :11.55 Mean :0.02877 Mean :3.004 Mean :0.6827 Mean
:1.425
3rd Qu.:18.00 3rd Qu.:0.00000 3rd Qu.:5.000 3rd Qu.:1.0000 3rd
Qu.:2.000
Max. :23.00 Max. :1.00000 Max. :6.000 Max. :1.0000 Max.
:4.000

temp atemp hum windspeed


casual
Min. :0.020 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min.
: 0.00
1st Qu.:0.340 1st Qu.:0.3333 1st Qu.:0.4800 1st Qu.:0.1045 1st
Qu.: 4.00
Median :0.500 Median :0.4848 Median :0.6300 Median :0.1940 Median
: 17.00
Mean :0.497 Mean :0.4758 Mean :0.6272 Mean :0.1901 Mean
: 35.68
3rd Qu.:0.660 3rd Qu.:0.6212 3rd Qu.:0.7800 3rd Qu.:0.2537 3rd
Qu.: 48.00
Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :0.8507 Max.
:367.00

registered cnt
Min. : 0.0 Min. : 1.0
1st Qu.: 34.0 1st Qu.: 40.0
Median :115.0 Median :142.0
Mean :153.8 Mean :189.5
3rd Qu.:220.0 3rd Qu.:281.0
Max. :886.0 Max. :977.0

> dim(bike)
[1] 17379 17
> names(bike)
[1] "instant" "dteday" "season" "yr" "mnth" "hr"
"holiday"
[8] "weekday" "workingday" "weathersit" "temp" "atemp"
"hum" "windspeed"
[15] "casual" "registered" "cnt"
>
> # are there any missing values in the data?
> table(is.na(bike))

FALSE
295443
>
> # Recode Season values 1-4 to Spring, Summer, Fall, Winter
>
> # Function to recode values
> recodev <- function(original.vector, old.values, new.values) {
+ new.vector <- original.vector
+ for (i in 1:length(old.values)) {
+ change.log <- original.vector == old.values[i] &
+ is.na(original.vector) == F
+ new.vector[change.log] <- new.values[i]
+ }
+ return(new.vector)
+ }
>
> # apply recodev function for recoding season values
> bike$season <- recodev(original.vector = data$season,
+ old.values = c(1:4),
+ new.values = c("Spring","Summer","Fall",
+ "Winter"))
Error in data$season : object of type 'closure' is not subsettable
> table(bike$season)

1 2 3 4
4242 4409 4496 4232
>
> # What is the most popular season for bike rentals
> boxplot(count ~ season,
+ data = bike,
+ xlab = "Season",
+ ylab = "Count",
+ main = "Count by Season",
+ col = "yellow3")
Error in model.frame.default(formula = count ~ season, data = bike) :
object is not a matrix
>
> # check seasons and respective months
> unique(bike$month[bike$season=="Spring"])
NULL
> unique(bike$month[bike$season=="Summer"])
NULL
> unique(bike$month[bike$season=="Fall"])
NULL
> unique(bike$month[bike$season=="Winter"])
NULL
>
> # temp : Normalized temperature in Celsius. The values are derived via
(t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)
> # atemp: Normalized feeling temperature in Celsius. The values are
derived via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly
scale)
> # temp and atemp are in normalized form
> # we need to denormalize it before using
>
> # create a function for denormalise temp, atemp
> tconvert <- function(min, max, vector){
+ result <- vector * (max - min) + min
+ return (result)
+ }
>
> # apply the function and denormalise the temperature values
> bike$temp <- tconvert(-8, 39, bike$temp)
> bike$atemp <- tconvert(-16, 50, bike$atemp)
>
> # calculate mean, std.dev and median for each season
> season_agg <- bike %>%
+ group_by(season) %>%
+ summarise(
+ temp.min = min(temp),
+ temp.max = max(temp),
+ temp.med = median(temp),
+ temp.stdev = sd(temp),
+ temp.mean = mean(temp),
+ count = n())
> season_agg
# A tibble: 4 x 7
season temp.min temp.max temp.med temp.stdev temp.mean count
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 1 -7.06 25.8 5.16 5.58 6.06 4242
2 2 -0.480 36.2 18.3 6.54 17.6 4409
3 3 9.86 39 24.9 4.41 25.2 4496
4 4 -1.42 27.7 11.7 5.74 11.9 4232
>
> # create a boxplot for temperature by season
> boxplot(temp ~ season,
+ data = bike,
+ xlab = "Season",
+ ylab = "Temperature",
+ main = "Temperature by Season",
+ col = "skyblue")
>
> # rename columns yr and mnth.. just to improve readability
> names(bike)[4:5] <- c("year","month")
>
> # recode year values to 2011, 2012
> bike$year <- recodev(original.vector = bike$year,
+ old.values = c(0,1),
+ new.values = c(2011,2012))
> # check column names
> names(bike)
[1] "instant" "dteday" "season" "year" "month" "hr"
"holiday"
[8] "weekday" "workingday" "weathersit" "temp" "atemp"
"hum" "windspeed"
[15] "casual" "registered" "cnt"
>
> # Similarly rename hum and cnt to meaningful names
> names(bike)[names(bike)=="hum"] <- "humidity"
> names(bike)[names(bike)=="cnt"] <- "count"
> names(bike)
[1] "instant" "dteday" "season" "year" "month" "hr"
"holiday"
[8] "weekday" "workingday" "weathersit" "temp" "atemp"
"humidity" "windspeed"
[15] "casual" "registered" "count"
>
> # understand the distribution of numerical variables..
> # setting margins for the graphs
> par(mfrow=c(4,2))
> par(mar = rep(2, 4))
> hist(bike$weather)
> hist(bike$humidity)
> hist(bike$holiday)
> hist(bike$workingday)
> hist(bike$temp)
> hist(bike$atemp)
> hist(bike$windspeed)
>
> # For which weather condition the number of total bike rentals are the
lowest/highest?
>
> # create a beanplot for number of bike rents per each weather condition
> par(mfrow=c(1,1))
> par(mar=c(4,4,4,4))
> bean.cols <- lapply(brewer.pal(6, "Set3"),
+ function(x){return(c(x, "black", "gray", "red"))})
> beanplot(count ~ weathersit,
+ data = bike,
+ main = "Bike Rents by Weather Condition",
+ xlab = "Weather Condition",
+ ylab = "Number of rentals",
+ col = bean.cols,
+ lwd = 1,
+ what = c (1,1,1,0),
+ log = ""
+ )
>
> # What is the mean, median and standard deviation for total number of
rentals
> # (count) per season?
> # create a data frame
> df <- data.frame(Spring = rep(NA, 3),
+ Winter = rep(NA, 3),
+ Summer = rep(NA, 3),
+ Fall = rep(NA, 3))
> row.names(df) <- c("mean", "median", "sd")
>
> # fill the data frame with corresponding mean, median and sd values
> vec <- c ("mean","median","sd")
> for (n in vec){
+ for (i in unique(bike$season)) {
+ my.fun <- get(n)
+ res <- my.fun(bike$count[bike$season == i])
+ df[n,i] <- res
+ }
+ }
> df
Spring Winter Summer Fall
mean 111.1146 208.3441 236.0162 198.8689
median 76.0000 165.0000 199.0000 155.5000
sd 119.2240 188.3625 197.7116 182.9680
>
> # ANOVA of season
> summary(aov(count ~ season, data = bike))
Df Sum Sq Mean Sq F value Pr(>F)
season 1 18127040 18127040 569 <2e-16 ***
Residuals 17377 553634551 31860
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> # pairwise comparison of means for seasons in order to identify any
difference between
> # two means that is greater than the expected standard error
> TukeyHSD(aov(count ~ season, data = bike))
Error in TukeyHSD.aov(aov(count ~ season, data = bike)) :
no factors in the fitted model
In addition: Warning message:
In replications(paste("~", xx), data = mf) : non-factors ignored: season
>
> # correlation test for count~atemp
> t1 <- cor.test(bike$atemp[bike$year == 2011],
+ bike$count[bike$year == 2011])
> t1

Pearson's product-moment correlation

data: bike$atemp[bike$year == 2011] and bike$count[bike$year == 2011]


t = 46.46, df = 8643, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.4300004 0.4637388
sample estimates:
cor
0.4470285

>
> t2 <- cor.test(bike$atemp[bike$year == 2012],
+ bike$count[bike$year == 2012])
> t2

Pearson's product-moment correlation

data: bike$atemp[bike$year == 2012] and bike$count[bike$year == 2012]


t = 40.346, df = 8732, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.3785679 0.4139248
sample estimates:
cor
0.3963933

>
> # plot the results in a scatterplot with regression lines
>
> # blank plot
> plot(x = 1,
+ xlab = "Temperature",
+ ylab = "Number of Rents",
+ xlim = c(-25,50),
+ ylim = c(0,1000),
+ main = "Temperature vs. Count")
>
> # draw points for 2011 year
> points(x = bike$atemp[bike$year == 2011],
+ y = bike$count[bike$year == 2011],
+ pch = 16,
+ col = "red",
+ cex = 0.5
+ )
> # draw points for 2012 year
> points(x = bike$atemp[bike$year == 2012],
+ y = bike$count[bike$year == 2012],
+ pch = 16,
+ col = "darkgreen",
+ cex = 0.5
+ )
>
> # add regression lines for two ears
> abline(lm(count~atemp, bike, subset = year == 2011),
+ col = "darkgreen",
+ lwd = 3)
>
> abline(lm(count~atemp, bike, subset = year == 2012),
+ col = "red",
+ lwd = 3)
>
> # add legend
> legend("topleft",
+ legend = c(2011, 2012),
+ col = c("darkgreen","red"),
+ pch = c(16, 16),
+ bg = "white",
+ cex = 1
+ )
>
> # create histograms for each weather condition to explore distribution
of the bike rentals by
> # weather condition
>
> # create a vector for histograms titles
> vec <- c("Clear Weather", "Cloudy Weather", "Rainy Weather",
"Thunderstorm Weather")
>
> # parameters for plots combining
> par(mfrow = c(1, 1))
>
> # create 4 histograms with a loop
> for (i in c(1:4)){
+ name.i <- vec[i]
+ hist(bike$count[bike$weathersit == i],
+ main = name.i,
+ xlab = "Number of Rents",
+ ylab = "Frequency",
+ breaks = 10,
+ col = "yellow3",
+ border = "black")
+
+ # the line indicating median value
+ abline(v = median(bike$count[bike$weathersit == i]),
+ col = "black",
+ lwd = 3,
+ lty = 2)
+
+ # the line indicating mean value
+ abline(v = mean(bike$count[bike$weathersit == i]),
+ col = "blue",
+ lwd = 3,
+ lty = 2)
+ }
>
> # Bike rents by type of the day
> beanplot(count ~ holiday,
+ data = bike,
+ main = "Bike Rents by Type of a Day",
+ xlab = "Type of Day",
+ ylab = "Number of rents",
+ col = bean.cols,
+ lwd = 1,
+ what = c(1,1,1,0),
+ log = ""
+ )
>
> # Hourly trend of count
> boxplot(bike$count ~ bike$hr, xlab="Hour", ylab="Count of users",
main="Hourly Trend of Count")
>
> # Hourly trend of casual users
> boxplot(bike$casual ~ bike$hr, xlab="Hour", ylab="Casual users",
main="Hourly Trend of Casual users")
>
> # Hourly trend of registered users
> boxplot(bike$registered ~ bike$hr, xlab="Hour", ylab="Registered users",
main="Hourly Trend of Registered users")
>
>
> #Checking dataset
> #Tests if values in a vector are integers
> is.integer(bike)
[1] FALSE
>
> #Tests if values in a vector are NA or NULL
> #is.na(day) we tested it but due to the huge output we deleted it. There
was no "NA".
> is.null(bike)
[1] FALSE
>
> #The Dataset is already recoded and correct.
> #For this question we converted "atemp" because it was devided of 50.
> bike$raw.temp <- (bike$temp*41)
> bike$raw.atemp <-(bike$atemp * 50)
>
> #Create a new column of the mean of raw.temp and raw.atemp.
> bike$raw.mean.temp.atemp <- (bike$raw.temp + bike$raw.atemp)/2
> head(bike)
instant dteday season year month hr holiday weekday workingday
weathersit temp atemp
1 1 2011-01-01 1 2011 1 0 0 6 0
1 3.28 3.0014
2 2 2011-01-01 1 2011 1 1 0 6 0
1 2.34 1.9982
3 3 2011-01-01 1 2011 1 2 0 6 0
1 2.34 1.9982
4 4 2011-01-01 1 2011 1 3 0 6 0
1 3.28 3.0014
5 5 2011-01-01 1 2011 1 4 0 6 0
1 3.28 3.0014
6 6 2011-01-01 1 2011 1 5 0 6 0
2 3.28 1.0016
humidity windspeed casual registered count raw.temp raw.atemp
raw.mean.temp.atemp
1 0.81 0.0000 3 13 16 134.48 150.07
142.275
2 0.80 0.0000 8 32 40 95.94 99.91
97.925
3 0.80 0.0000 5 27 32 95.94 99.91
97.925
4 0.75 0.0000 3 10 13 134.48 150.07
142.275
5 0.75 0.0000 0 1 1 134.48 150.07
142.275
6 0.75 0.0896 0 1 1 134.48 50.08
92.280
>
> Temperature <- bike$raw.temp
>
> #Correlation between raw.temp and the total count of bike rentals.
>
> #cor.temp <- cor.test(x = as.numeric(as.factor(bike$raw.temp)), y =
as.numeric(as.factor(bike$countt)))
>
> #cor.temp
>
>
>
>
> #Correlation between atemp and the total count of bike rentals.
> #cor.atemp <- cor.test(x = bike$raw.atemp,y = bike$cnt)
> #cor.atemp
>
> Feeled.Temperature <- bike$raw.atemp
>
>
> Feeled.Raw.Temperature <- bike$raw.mean.temp.atemp
> Amount.Rentals <- bike$cnt
>
> par(mfrow=c(2,2))
>
> plot(x = Temperature, y = Amount.Rentals, main = "Correlation", col =
"red")
> abline(lm(Amount.Rentals ~ Temperature), col = "blue")
Error in model.frame.default(formula = Amount.Rentals ~ Temperature,
drop.unused.levels = TRUE) :
invalid type (NULL) for variable 'Amount.Rentals'
> legend("topleft", legend = paste("cor = ", round(cor(Temperature,
Amount.Rentals, method="kendall"), 2), sep = ""),lty = 1, col = "blue")
Error in cor(Temperature, Amount.Rentals, method = "kendall") :
supply both 'x' and 'y' or a matrix-like 'x'
>
> plot(x = Feeled.Temperature, y = Amount.Rentals, main = "Correlation",
col = "blue")
> abline(lm(Amount.Rentals ~ Feeled.Temperature), col = "red")
Error in model.frame.default(formula = Amount.Rentals ~
Feeled.Temperature, :
invalid type (NULL) for variable 'Amount.Rentals'
> legend("topleft", legend = paste("cor = ", round(cor(Feeled.Temperature,
Amount.Rentals), 2), sep = ""),lty = 1, col = "red")
Error in cor(Feeled.Temperature, Amount.Rentals) :
supply both 'x' and 'y' or a matrix-like 'x'
>
> plot(x = Feeled.Raw.Temperature, y = Amount.Rentals, main =
"Correlation", col = "green")
> abline(lm(Amount.Rentals ~ Feeled.Raw.Temperature), col = "orange")
Error in model.frame.default(formula = Amount.Rentals ~
Feeled.Raw.Temperature, :
invalid type (NULL) for variable 'Amount.Rentals'
> legend("topleft", legend = paste("cor = ", round(cor(Temperature,
Amount.Rentals), 2), sep = ""),lty = 1, col = "orange")
Error in cor(Temperature, Amount.Rentals) :
supply both 'x' and 'y' or a matrix-like 'x'
>
>
> plot(x = 1, y = 1, xlab = "Temperature", ylab = "Amount of rentals",
xlim = c(0, 40), ylim = c(0, 10000), main = "Three correlations combined")
>
> points(Feeled.Raw.Temperature, Amount.Rentals, pch = 8, col = "green")
> points(Temperature, Amount.Rentals, pch = 8, col = "red")
> points(Feeled.Temperature, Amount.Rentals, pch = 8, col = "blue")
>
>
>
> #A Two-sample-t-test for real temperature and feeled temperature.
>
> hist(bike$raw.temp, yaxt = "n", xaxt = "n", xlab = "",
+ ylab = "", main = "Two Sample t-test", xlim = c(5, 40), col =
rgb(0, 0, 1, alpha = .1))
> text(x = 13, y = 140, paste("Mean real Temp.\n",
round(mean(bike$raw.temp), 2), sep = ""), col = "blue")
> abline(v = mean(bike$raw.temp), lty = 1,
+ col = rgb(0, 0, 1, alpha = 1), lwd = 4)
>
> par(new = T)
> hist(bike$raw.atemp, yaxt = "n", xaxt = "n", xlab = "",
+ ylab = "", main = "", xlim = c(5, 40), col = rgb(1, 0, 0, alpha =
.1))
>
> abline(v = mean(bike$raw.atemp), lty = 1,
+ col = rgb(1, 0, 0, alpha = 1), lwd = 4)
> text(x= 32, y = 131, paste("Mean feeled Temp.\n",
round(mean(bike$raw.atemp), 2), sep = ""), col = "red")
>
> mtext(text = "Alternative Hypothesis is confirmed true difference in
means is not equal to 0", line = 0, side = 3)
>
>
> # two-sample t-test for real temperature and feeled temperature in
spring.
>
> temp.spring <- subset(bike, subset = season == "1")$raw.temp
> atemp.spring <- subset(bike, subset = season == "1")$raw.atemp
> test.result.spring <- t.test(x = temp.spring, y = atemp.spring,
alternative = "two.sided")
> test.result.spring
Welch Two Sample t-test

data: temp.spring and atemp.spring


t = 9.5415, df = 6985.7, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
51.38445 77.95791
sample estimates:
mean of x mean of y
248.4556 183.7844

>
> # two-sample t-test for real temperature and feeled temperature in
summer.
>
> temp.summer <- subset(bike, subset = season == "2")$raw.temp
> atemp.summer <- subset(bike, subset = season == "2")$raw.atemp
> test.result.summer <- t.test(x = temp.summer, y = atemp.summer,
alternative = "two.sided")
> test.result.summer

Welch Two Sample t-test

data: temp.summer and atemp.summer


t = -26.724, df = 7629.1, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-210.6338 -181.8441
sample estimates:
mean of x mean of y
721.5660 917.8049

>
> # two-sample t-test for real temperature and feeled temperature in fall.
>
> temp.fall <- subset(bike, subset = season == "3")$raw.temp
> atemp.fall <- subset(bike, subset = season == "3")$raw.atemp
> test.result.fall <- t.test(x = temp.fall, y = atemp.fall, alternative =
"two.sided")
> test.result.fall

Welch Two Sample t-test

data: temp.fall and atemp.fall


t = -64.182, df = 7452.5, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-341.6860 -321.4325
sample estimates:
mean of x mean of y
1033.252 1364.812

>
> # two-sample t-test for real temperature and feeled temperature in
winter.
>
> temp.winter <- subset(bike, subset = season == "4")$raw.temp
> atemp.winter <- subset(bike, subset = season == "4")$raw.atemp
> test.result.winter <- t.test(x = temp.winter, y = atemp.winter,
alternative = "two.sided")
> test.result.winter

Welch Two Sample t-test

data: temp.winter and atemp.winter


t = -12.767, df = 7280.7, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-97.52962 -71.56560
sample estimates:
mean of x mean of y
487.3869 571.9345

>
>
> # Plotting the association:
> plot(x = 1, y = 1, xlab = "Temperature in Celcius", ylab = "Bike
rentals", type = "n", main = "Association between temperature and bike
rentals",
+ xlim = c(0, 40), ylim = c(0, 90))
>
>
> #Calculating min and max for the x-axis and y-axis:
> min(bike$raw.temp)
[1] -289.46
>
> #Adding points to the plot
> day$raw.temp <- (bike$temp*41)
Error in day$raw.temp <- (bike$temp * 41) : object 'day' not found
> points(bike$raw.temp, bike$casual, pch = 16, col = "red")
> points(bike$raw.temp, bike$registered, pch = 16, col = "skyblue")
>
> # Adding a legend to the plot
> legend("topleft",legend = c("casual", "registered"), col =
c("red","skyblue"), pch = c(16, 16), bg = "white")
>
>
>
> # Calculating the correlation between raw.temp and registered users and
between raw.temp and causal users
> cor.reg <- cor.test(x = bike$raw.temp, y = bike$registered)
> cor.reg

Pearson's product-moment correlation

data: bike$raw.temp and bike$registered


t = 46.925, df = 17377, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.3220992 0.3484909
sample estimates:
cor
0.3353608

>
> cor.cas <- cor.test(x = bike$raw.temp,
+ y = bike$casual)
> cor.cas

Pearson's product-moment correlation

data: bike$raw.temp and bike$casual


t = 68.22, df = 17377, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.4478081 0.4712629
sample estimates:
cor
0.4596156

>
> # Adding Correlation line and the correlation value to the plot
> abline(lm(bike$registered ~ bike$raw.temp), lty = 6, col = "blue")
>
> abline(lm(bike$casual ~ bike$raw.temp), lty = 6, col = "orange")
>
> reg <- paste("cor = ", round(cor(bike$registered, bike$raw.temp), 2),
sep = "")
> cas <- paste("cor = ", round(cor(bike$casual, bike$raw.temp), 2), sep =
"")
>
> legend("left",legend = c(cas, reg) , col = c('orange', 'blue'),pch =
c(16, 16), bg = "white")
>
> #Can the number of total bike rentals be predicted by holiday and
weather?
> lookup <- data.frame("numbers"=c("1","2","3","4"),
+ "weather"=c("nice","cloudy", "wet", "lousy")
+ )
>
> bike <- merge(x= bike,
+ y= lookup,
+ by.x="weathersit",
+ by.y="numbers"
+ )
>
> head(bike)
weathersit instant dteday season year month hr holiday weekday
workingday temp atemp
1 1 1 2011-01-01 1 2011 1 0 0 6
0 3.28 3.0014
2 1 2 2011-01-01 1 2011 1 1 0 6
0 2.34 1.9982
3 1 3 2011-01-01 1 2011 1 2 0 6
0 2.34 1.9982
4 1 4 2011-01-01 1 2011 1 3 0 6
0 3.28 3.0014
5 1 5 2011-01-01 1 2011 1 4 0 6
0 3.28 3.0014
6 1 3286 2011-05-21 2 2011 5 19 0 6
0 23.96 26.0024
humidity windspeed casual registered count raw.temp raw.atemp
raw.mean.temp.atemp weather
1 0.81 0.0000 3 13 16 134.48 150.07
142.275 nice
2 0.80 0.0000 8 32 40 95.94 99.91
97.925 nice
3 0.80 0.0000 5 27 32 95.94 99.91
97.925 nice
4 0.75 0.0000 3 10 13 134.48 150.07
142.275 nice
5 0.75 0.0000 0 1 1 134.48 150.07
142.275 nice
6 0.47 0.1642 117 184 301 982.36 1300.12
1141.240 nice
>
> #Using linear regression
>
> total.rentals.lm <- lm(bike$count ~ holiday + weather, data = bike)
> summary (total.rentals.lm)
Call:
lm(formula = bike$count ~ holiday + weather, data = bike)

Residuals:
Min 1Q Median 3Q Max
-204.97 -141.21 -44.97 89.03 780.72

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 176.276 2.672 65.976 < 2e-16 ***
holiday -36.836 8.141 -4.525 6.08e-06 ***
weatherlousy -101.943 103.578 -0.984 0.325
weathernice 29.694 3.146 9.439 < 2e-16 ***
weatherwet -64.126 5.455 -11.755 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 179.3 on 17374 degrees of freedom


Multiple R-squared: 0.02264, Adjusted R-squared: 0.02241
F-statistic: 100.6 on 4 and 17374 DF, p-value: < 2.2e-16

>
> # Using a anova
> anv.weather <- anova (total.rentals.lm)
> anv.weather
Analysis of Variance Table

Response: bike$count
Df Sum Sq Mean Sq F value Pr(>F)
holiday 1 546889 546889 17.003 3.749e-05 ***
weather 3 12396684 4132228 128.474 < 2.2e-16 ***
Residuals 17374 558818018 32164
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> #The F-values (holiday, weather, residuals) are:
> anv.weather$`F value`
[1] 17.00312 128.47354 NA
>
> #The p-values (holiday, weather, residuals) are:
> anv.weather$`Pr(>F)`
[1] 3.749198e-05 2.588619e-82 NA
>
> #Using Tukey-Test
> weather.aov <- aov(bike$count ~ weather, data = bike)
> summary(weather.aov)
Df Sum Sq Mean Sq F value Pr(>F)
weather 3 12285030 4095010 127.2 <2e-16 ***
Residuals 17375 559476561 32200
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> TukeyHSD(weather.aov)
Tukey multiple comparisons of means
95% family-wise confidence level

Fit: aov(formula = bike$count ~ weather, data = bike)

$`weather`
diff lwr upr p adj
lousy-cloudy -100.83216 -367.10249 165.43817 0.7648878
nice-cloudy 29.70378 21.61661 37.79095 0.0000000
wet-cloudy -63.58621 -77.60667 -49.56576 0.0000000
nice-lousy 130.53594 -135.68152 396.75339 0.5886467
wet-lousy 37.24595 -229.21775 303.70965 0.9841444
wet-nice -93.28999 -106.26764 -80.31234 0.0000000

>
> # Months is coded as 1 to 12
> # Converting month with "merge"
>
> lookup.month<- data.frame("month" = c(1:12),
+ "mnth.name" = c("01Jan", "02Feb", "03March",
"04April", "05May", "06June", "07July", "08Aug", "09Sept", "10Oct",
"11Nov", "12Dec"), stringsAsFactors = FALSE)
>
> bike <- merge(x=bike, y= lookup.month, by = 'month')
>
>
> # Convert the nomalized windspeed and humidity
> bike$raw.windspeed <- (bike$windspeed*67)
> bike$raw.hum <- (bike$hum * 100)
> head(bike)
month weathersit instant dteday season year hr holiday weekday
workingday temp atemp
1 1 1 1 2011-01-01 1 2011 0 0 6
0 3.28 3.0014
2 1 1 2 2011-01-01 1 2011 1 0 6
0 2.34 1.9982
3 1 1 3 2011-01-01 1 2011 2 0 6
0 2.34 1.9982
4 1 1 4 2011-01-01 1 2011 3 0 6
0 3.28 3.0014
5 1 1 5 2011-01-01 1 2011 4 0 6
0 3.28 3.0014
6 1 2 6 2011-01-01 1 2011 5 0 6
0 3.28 1.0016
humidity windspeed casual registered count raw.temp raw.atemp
raw.mean.temp.atemp weather
1 0.81 0.0000 3 13 16 134.48 150.07
142.275 nice
2 0.80 0.0000 8 32 40 95.94 99.91
97.925 nice
3 0.80 0.0000 5 27 32 95.94 99.91
97.925 nice
4 0.75 0.0000 3 10 13 134.48 150.07
142.275 nice
5 0.75 0.0000 0 1 1 134.48 150.07
142.275 nice
6 0.75 0.0896 0 1 1 134.48 50.08
92.280 cloudy
mnth.name raw.windspeed raw.hum
1 01Jan 0.0000 81
2 01Jan 0.0000 80
3 01Jan 0.0000 80
4 01Jan 0.0000 75
5 01Jan 0.0000 75
6 01Jan 6.0032 75
>
> #descreptive statistics with the dplyr-functions:
> require(dplyr)
> month.agg <- bike %>% group_by(mnth.name) %>% summarise(
+ mean.temp = mean(raw.temp),
+ mean.hum = mean(raw.hum),
+ mean.windspeed = mean(raw.windspeed),
+ mean.rentals = mean(bike$count))
>
> month.agg
# A tibble: 12 x 5
mnth.name mean.temp mean.hum mean.windspeed mean.rentals
<chr> <dbl> <dbl> <dbl> <dbl>
1 01Jan 130. 58.1 13.9 189.
2 02Feb 251. 56.7 14.5 189.
3 03March 425. 58.9 14.9 189.
4 04April 578. 58.8 15.7 189.
5 05May 818. 68.9 12.3 189.
6 06June 990. 57.6 12.4 189.
7 07July 1128. 59.8 11.1 189.
8 08Aug 1038. 63.7 11.5 189.
9 09Sept 860. 71.4 11.1 189.
10 10Oct 611. 68.9 11.5 189.
11 11Nov 383. 62.5 12.3 189.
12 12Dec 297. 66.6 11.8 189.
>
> par(mfrow=c(2,2))
> barplot(height = month.agg$mean.rentals,
+ names.arg = month.agg$mnth.name ,col = "red", main = "Mean
rentals" )
>
> barplot(height = month.agg$mean.windspeed,
+ names.arg = month.agg$mnth.name,col = "blue", main = "Mean
Windspeed (km/h)" )
>
> barplot(height = month.agg$mean.hum,
+ names.arg = month.agg$mnth.name,col = "green", main = "Mean
Humidity" )
>
>
> barplot(height = month.agg$mean.temp,
+ names.arg = month.agg$mnth.name,col = "skyblue", main = "Mean
Temperature" )
>
>
>
> # Compute correlations
>
sub=data.frame(bike$registered,bike$casual,bike$count,bike$temp,bike$humid
ity,bike$atemp,bike$windspeed)
> cor(sub)
bike.registered bike.casual bike.count bike.temp
bike.humidity bike.atemp
bike.registered 1.00000000 0.50661770 0.97215073 0.33536085 -
0.27393312 0.33255864
bike.casual 0.50661770 1.00000000 0.69456408 0.45961565 -
0.34702809 0.45408007
bike.count 0.97215073 0.69456408 1.00000000 0.40477228 -
0.32291074 0.40092930
bike.temp 0.33536085 0.45961565 0.40477228 1.00000000 -
0.06988139 0.98767214
bike.humidity -0.27393312 -0.34702809 -0.32291074 -0.06988139
1.00000000 -0.05191770
bike.atemp 0.33255864 0.45408007 0.40092930 0.98767214 -
0.05191770 1.00000000
bike.windspeed 0.08232085 0.09028678 0.09323378 -0.02312526 -
0.29010490 -0.06233604
bike.windspeed
bike.registered 0.08232085
bike.casual 0.09028678
bike.count 0.09323378
bike.temp -0.02312526
bike.humidity -0.29010490
bike.atemp -0.06233604
bike.windspeed 1.00000000
>
> # hour buckets for registered users
> d1 = rpart(registered~hr,data=bike)
> fancyRpartPlot(d1)
>
> # hour buckets for casual users
> d2 = rpart(casual~hr,data=bike)
> fancyRpartPlot(d2)
>
> # create bis for regular and causal hours based on rpart output
> bike$dp_reg=0
> bike$dp_reg[bike$hr<8]=1
> bike$dp_reg[bike$hr>=22]=2
> bike$dp_reg[bike$hr>9 & bike$hr<18]=3
> bike$dp_reg[bike$hr==8]=4
> bike$dp_reg[bike$hr==9]=5
> bike$dp_reg[bike$hr==20 | bike$hr==21]=6
> bike$dp_reg[bike$hr==19 | bike$hr==18]=7
>
> bike$dp_cas=0
> bike$dp_cas[bike$hour<=8]=1
> bike$dp_cas[bike$hour==9]=2
> bike$dp_cas[bike$hour>=10 & bike$hour<=19]=3
> bike$dp_cas[bike$hour>19]=4
>
> # temp buckets for registered users
> f1=rpart(registered~temp,data=bike)
> fancyRpartPlot(f1)
>
> # temp buckets for casual users
> f2=rpart(casual~temp,data=bike)
> fancyRpartPlot(f2)
>
> # create buckets for temp registered users
> bike$temp_reg=0
> bike$temp_reg[bike$temp<13]=1
> bike$temp_reg[bike$temp>=13 & bike$temp<23]=2
> bike$temp_reg[bike$temp>=23 & bike$temp<30]=3
> bike$temp_reg[bike$temp>=30]=4
>
> # create buckets for temp casual users
> bike$temp_cas=0
> bike$temp_cas[bike$temp<15]=1
> bike$temp_cas[bike$temp>=15 & bike$temp<23]=2
> bike$temp_cas[bike$temp>=23 & bike$temp<30]=3
> bike$temp_cas[bike$temp>=30]=4
>
> # create paritions for year
> bike$year_part[bike$year=='2011']=1
> bike$year_part[bike$year=='2011' & bike$month>3]=2
> bike$year_part[bike$year=='2011' & bike$month>6]=3
> bike$year_part[bike$year=='2011' & bike$month>9]=4
> bike$year_part[bike$year=='2012']=5
> bike$year_part[bike$year=='2012' & bike$month>3]=6
> bike$year_part[bike$year=='2012' & bike$month>6]=7
> bike$year_part[bike$year=='2012' & bike$month>9]=8
> table(bike$year_part)

1 2 3 4 5 6 7 8
2067 2183 2192 2203 2176 2182 2208 2168
>
> # creating another variable day_type which may affect our accuracy as
weekends
> # and weekdays are important in deciding rentals
> bike$day_type=0
> bike$day_type[bike$holiday==0 & bike$workingday==0]="weekend"
> bike$day_type[bike$holiday==1]="holiday"
> bike$day_type[bike$holiday==0 & bike$workingday==1]="working day"
>
> str(bike)
'data.frame': 17379 obs. of 30 variables:
$ month : int 1 1 1 1 1 1 1 1 1 1 ...
$ weathersit : int 1 1 1 1 1 2 1 1 1 1 ...
$ instant : int 1 2 3 4 5 6 7 8 9 10 ...
$ dteday : Factor w/ 731 levels "2011-01-01","2011-01-02",..:
1 1 1 1 1 1 1 1 1 1 ...
$ season : int 1 1 1 1 1 1 1 1 1 1 ...
$ year : num 2011 2011 2011 2011 2011 ...
$ hr : int 0 1 2 3 4 5 6 7 8 9 ...
$ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
$ weekday : int 6 6 6 6 6 6 6 6 6 6 ...
$ workingday : int 0 0 0 0 0 0 0 0 0 0 ...
$ temp : num 3.28 2.34 2.34 3.28 3.28 3.28 2.34 1.4 3.28
7.04 ...
$ atemp : num 3 2 2 3 3 ...
$ humidity : num 0.81 0.8 0.8 0.75 0.75 0.75 0.8 0.86 0.75
0.76 ...
$ windspeed : num 0 0 0 0 0 0.0896 0 0 0 0 ...
$ casual : int 3 8 5 3 0 0 2 1 1 8 ...
$ registered : int 13 32 27 10 1 1 0 2 7 6 ...
$ count : int 16 40 32 13 1 1 2 3 8 14 ...
$ raw.temp : num 134.5 95.9 95.9 134.5 134.5 ...
$ raw.atemp : num 150.1 99.9 99.9 150.1 150.1 ...
$ raw.mean.temp.atemp: num 142.3 97.9 97.9 142.3 142.3 ...
$ weather : Factor w/ 4 levels "cloudy","lousy",..: 3 3 3 3 3
1 3 3 3 3 ...
$ mnth.name : chr "01Jan" "01Jan" "01Jan" "01Jan" ...
$ raw.windspeed : num 0 0 0 0 0 ...
$ raw.hum : num 81 80 80 75 75 75 80 86 75 76 ...
$ dp_reg : num 1 1 1 1 1 1 1 1 4 5 ...
$ dp_cas : num 0 0 0 0 0 0 0 0 0 0 ...
$ temp_reg : num 1 1 1 1 1 1 1 1 1 1 ...
$ temp_cas : num 1 1 1 1 1 1 1 1 1 1 ...
$ year_part : num 1 1 1 1 1 1 1 1 1 1 ...
$ day_type : chr "weekend" "weekend" "weekend" "weekend" ...
>
> # dividing total data depending on windspeed to impute/predict the
missing values
> table(bike$windspeed==0)

FALSE TRUE
15199 2180
> k=bike$windspeed==0
> wind_0=subset(bike,k)
> wind_1=subset(bike,!k)
>
> # predicting missing values in windspeed using a random forest model
> # this is a different approach to impute missing values rather than just
using the mean or median or some other statistic for imputation
> set.seed(415)
> str(wind_1)
'data.frame': 15199 obs. of 30 variables:
$ month : int 1 1 1 1 1 1 1 1 1 1 ...
$ weathersit : int 2 1 1 1 1 3 1 1 1 2 ...
$ instant : int 6 11 12 13 8760 9128 8762 8763 8764 632 ...
$ dteday : Factor w/ 731 levels "2011-01-01","2011-01-02",..:
1 1 1 1 370 386 370 370 370 29 ...
$ season : int 1 1 1 1 1 1 1 1 1 1 ...
$ year : num 2011 2011 2011 2011 2012 ...
$ hr : int 5 10 11 12 19 5 21 22 23 14 ...
$ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
$ weekday : int 6 6 6 6 4 6 4 4 4 6 ...
$ workingday : int 0 0 0 0 1 0 1 1 1 0 ...
$ temp : num 3.28 9.86 8.92 11.74 7.98 ...
$ atemp : num 1 10 6 12 7 ...
$ humidity : num 0.75 0.76 0.81 0.77 0.34 0.86 0.6 0.6 0.7 0.6
...
$ windspeed : num 0.0896 0.2537 0.2836 0.2836 0.0896 ...
$ casual : int 0 12 26 29 10 0 9 1 4 10 ...
$ registered : int 1 24 30 55 255 2 88 70 56 89 ...
$ count : int 1 36 56 84 265 2 97 71 60 99 ...
$ raw.temp : num 134 404 366 481 327 ...
$ raw.atemp : num 50.1 499.9 299.9 599.9 350 ...
$ raw.mean.temp.atemp: num 92.3 452.1 332.8 540.6 338.6 ...
$ weather : Factor w/ 4 levels "cloudy","lousy",..: 1 3 3 3 3
4 3 3 3 1 ...
$ mnth.name : chr "01Jan" "01Jan" "01Jan" "01Jan" ...
$ raw.windspeed : num 6 17 19 19 6 ...
$ raw.hum : num 75 76 81 77 34 86 60 60 70 60 ...
$ dp_reg : num 1 3 3 3 7 1 6 2 2 3 ...
$ dp_cas : num 0 0 0 0 0 0 0 0 0 0 ...
$ temp_reg : num 1 1 1 1 1 1 1 1 1 1 ...
$ temp_cas : num 1 1 1 1 1 1 1 1 1 1 ...
$ year_part : num 1 1 1 1 5 5 5 5 5 1 ...
$ day_type : chr "weekend" "weekend" "weekend" "weekend" ...
> wind_1$season <- as.factor(wind_1$season)
> wind_0$season <- as.factor(wind_0$season)
> fit <- randomForest(windspeed ~
season+weathersit+humidity+month+temp+year+atemp,
data=wind_1,importance=TRUE, ntree=250)
> pred=predict(fit,wind_0)
> wind_0$windspeed=pred
>
> bike=rbind(wind_0,wind_1)
> bike$weekend=0
> bike$weekend[bike$day=="Sunday" | bike$day=="Saturday"]=1
> names(bike)
[1] "month" "weathersit" "instant"
"dteday"
[5] "season" "year" "hr"
"holiday"
[9] "weekday" "workingday" "temp"
"atemp"
[13] "humidity" "windspeed" "casual"
"registered"
[17] "count" "raw.temp" "raw.atemp"
"raw.mean.temp.atemp"
[21] "weather" "mnth.name" "raw.windspeed"
"raw.hum"
[25] "dp_reg" "dp_cas" "temp_reg"
"temp_cas"
[29] "year_part" "day_type" "weekend"
> bike <- select(bike, -c(weekend))

You might also like