This cheat sheet provides a comprehensive overview of key concepts and commands in Statistics and Data Science, including basic statistics, data manipulation, model fitting, simulation, and visualizations. It includes syntax for R programming, such as calculating means, creating frequency tables, and generating plots. The document serves as a quick reference for performing statistical analyses and visualizations using R.
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0 ratings · 0% found this document useful (0 votes)
15 views · 3 pages
r-cheatsheet-ABC (1)
This cheat sheet provides a comprehensive overview of key concepts and commands in Statistics and Data Science, including basic statistics, data manipulation, model fitting, simulation, and visualizations. It includes syntax for R programming, such as calculating means, creating frequency tables, and generating plots. The document serves as a quick reference for performing statistical analyses and visualizations using R.
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 3
Statistics and Data Science I (ABC) CHEAT SHEET
Word Equations Summary Tables Simple Statistics
outcome = explanatory + other stuff # compute five-number summary mean(data_set$Y) Y = X + other stuff favstats(~ Y, data = data_set) var(data_set$Y) sd(data_set$Y) # create frequency table tally(data_set$Y) cohensD(Y ~ X, data = data_set) Basics tally(~ Y, data = data_set) cor(Y ~ X, data = data_set) print("Hello world!") # tally by condition b1(Y ~ X, data = data_set) # assign value to object tally(~ Y < 1900, data = data_set) b1(one_model) myNumber <- 5 # two-way frequency table pre(Y ~ X, data = data_set) # combine values into vector tally(Y ~ X, data = data_set, margin = TRUE, f(Y ~ X, data = data_set) myVector <- c(1, 2, 3) format = "proportion")
# first element in vector Data Frame
myVector[1] # structure of data frame # arrange rows by variable # orders values or cases str(data_set) arrange(data_set, Y) sort(myVector) # view first/last six rows # creates data frame from csv file # arithmetic operations head(data_set) data_set <- read.csv("file_name", header = TRUE) sum(1, 2, 100), +, -, *, / tail(data_set) sqrt(157) # convert quantitative variable abs(data_set$Y) # select multiple variables # to categorical select(data_set, Y1, Y2) factor(data_set$Y) # logical operations factor(data_set$Y, levels = c(1,2), labels = >, <, >=, <=, ==, !=, |, & # first six rows of selected variables c("A", "B")) head(select(data_set, Y1, Y2)) # results in a variable with values # transform values # of TRUE or FALSE recode(data_set$Y, "0" = 0, "1" = 50, "2" = 100) data_set$C <- data_set$A > data_set$B # select variable (a column) data_set$Y # creates two equal sized groups Probability Distribution ntile(data_set$Y, 2) # calculate the probability area # find rows that meet condition # convert categorical variable xpnorm(65.1, data_set$mean, data_set$sd) data_set[data_set$Y > 40, ] # to quantitative filter(data_set, Y > 300) as.numeric(data_set$Y) zscore(data_set$Y) filter(data_set, Y != "NA")
# returns t at this probability
qt(.975, df = 999) # returns F at this probability qf(.95, df1 = 1, df2 = 100)
# CI using t distribution confint(empty_model)
# calculate p-value using F-distribution
xpf(sample_F, df1 = 2 , df2 = 10)
Page: 1 ▷ Updated: 2023-04 ▷ Learn more about CourseKata @ https://coursekata.org
Statistics and Data Science I (ABC) CHEAT SHEET Simulation Fitting and Evaluating Models # sample without replacement # bootstrap sampling distribution of b1s, # empty model sample(data_set, 6) # centered on sample b1 empty_model <- lm(Y ~ NULL, sdob1_boot <- do(1000) * data = data_set) # sample with replacement b1(Y ~ X, data = resample(data_set)) resample(data_set, 10) # use one explanatory variable # count the number of b1s at the upper one_model <- lm(Y ~ X, data = data_set) do(3) * resample(data_set, 10) # and lower extreme tally(sdob1$b1 > sample_b1 | # create a function from a formula # mixes up values in a variable sdob1$b1 < -sample_b1) one_model_fun <- makeFun(one_model) shuffle(data_set$Y) one_model_fun(x_level_1) # simulate sampling 10000 Ys # return TRUE for middle 95% of distribution # from normal distribution middle(sdob1$b1, .95) # model predictions and residuals sim_Y <- rnorm(10000, Y_stats$mean, data_set$empty_predict <- predict(empty_model) Y_stats$sd) # randomize sampling distribution of PREs data_set$empty_resid <- resid(empty_model) sdoPRE <- do(1000) * PRE(shuffle(Y) ~ X, # put simulated Ys into dataframe data = data_set) # produce ANOVA table data_set <- data.frame(sim_Y) anova(empty_model) # randomize sampling distribution of Fs supernova(one_model) # simulate sampling distribution of sdoF <- do(1000) * means fVal(shuffle(Y) ~ X, data = data_set) # t-test, using pooled variance sim_SDoM <- do(10000) * mean(rnorm(157, t.test(Tip ~ Condition, data = data_set, Y_stats$mean, Y_stats$sd)) # counts extreme Fs var.equal=TRUE) tally(~fVal > sample_F, data = sdoF) # bootstrap sampling distribution of # pairwise comparison means # corrections: "Bonferroni" or "none" bootSDoM <- do(10000) * pairwise(one_model, correction = "none") mean(resample(data_set$Y, 157))
# randomize sampling distribution
# of b1s, centered on 0 sdob1 <- do(1000) * b1(shuffle(Y) ~ X, data = data_set)
Page: 2 ▷ Updated: 2023-04 ▷ Learn more about CourseKata @ https://coursekata.org
Statistics and Data Science I (ABC) CHEAT SHEET Visualizations gf_boxplot(Y ~ X, data = data_set) # sampling distribution of b1 gf_histogram(~ Y, data = data_set) %>% gf_histogram(~b1, data = sdob1, # change labels fill = ~middle(b1, .95)) %>% gf_labs(title = "Graph Title", x = "Y_Name", # modify the limits on x- and y-axes y = "Frequency") gf_lims(x = c(-12, 12), y = c(0, 70))
gf_point(Y ~ X, data = data_set)
# faceted grid of histograms
gf_histogram(~ Y, data = data_set) %>% gf_facet_grid(X ~ .)
gf_point(Y ~ X, data = data_set) %>%
# add model predictions as red points gf_point(Y ~ X , shape = 1, size = 3, color = "firebrick") %>% gf_jitter(Y ~ X, data = data_set) # add best fitting model as a red line gf_model(one_model, color = "red")