codingbooks
diff --git a/03-tf-idf.Rmd b/03-tf-idf.Rmd
Lines changed: 1 addition & 1 deletion
diff --git a/07-tweet-archives.Rmd b/07-tweet-archives.Rmd
Lines changed: 4 additions & 4 deletions
diff --git a/08-nasa-metadata.Rmd b/08-nasa-metadata.Rmd
Lines changed: 16 additions & 12 deletions
@@ -251,7 +251,7 @@ physics %>%
   select(text)
 ```
 
-Maybe it makes sense to keep this one. Also notice that in this line we have "co-ordinate", which explains why there are separate "co" and "ordinate" items in the high tf-idf words for the Einstein text; the `unnest_tokens()` function separates around punctuation.
+Maybe it makes sense to keep this one. Also notice that in this line we have "co-ordinate", which explains why there are separate "co" and "ordinate" items in the high tf-idf words for the Einstein text; the `unnest_tokens()` function separates around punctuation. Notice that the tf-idf scores for "co" and "ordinate" are close to the same!
 
 "AB", "RC", and so forth are names of rays, circles, angles, and so forth for Huygens.
 
 
@@ -14,7 +14,7 @@ One type of text that gets plenty of attention is text shared online via Twitter
 
 An individual can download their own Twitter archive by following [directions available on Twitter's website](https://support.twitter.com/articles/20170160). We each downloaded ours and will now open them up. Let's use the lubridate package to convert the string timestamps to date-time objects and initially take a look at our tweeting patterns overall (Figure \@ref(fig:setup)).
 
-```{r setup, fig.width=7, fig.height=6, fig.cap="All tweets from our accounts"}
+```{r setup, fig.width=7, fig.height=7, fig.cap="All tweets from our accounts"}
 library(lubridate)
 library(ggplot2)
 library(dplyr)
@@ -129,8 +129,7 @@ word_ratios <- tidy_tweets %>%
   count(word, person) %>%
   filter(sum(n) >= 10) %>%
   spread(person, n, fill = 0) %>%
-  ungroup() %>%
-  mutate_each(funs((. + 1) / sum(. + 1)), -word) %>%
+  mutate_if(is.numeric, funs((. + 1) / sum(. + 1))) %>%
   mutate(logratio = log(David / Julia)) %>%
   arrange(desc(logratio))
 ```
@@ -148,8 +147,9 @@ Which words are most likely to be from Julia's account or from David's account?
 
 ```{r plotratios, dependson = "word_ratios", fig.width=7, fig.height=6, fig.cap="Comparing the odds ratios of words from our accounts"}
 word_ratios %>%
+  mutate(abslogratio = abs(logratio)) %>%
   group_by(logratio < 0) %>%
-  top_n(15, abs(logratio)) %>%
+  top_n(15, abslogratio) %>%
   ungroup() %>%
   mutate(word = reorder(word, logratio)) %>%
   ggplot(aes(word, logratio, fill = logratio < 0)) +
 
@@ -195,9 +195,10 @@ title_word_pairs %>%
   filter(n >= 250) %>%
   graph_from_data_frame() %>%
   ggraph(layout = "fr") +
-  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
-  geom_node_point(color = "darkslategray4", size = 5) +
-  geom_node_text(aes(label = name), repel = TRUE) +
+  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "cyan4") +
+  geom_node_point(size = 5) +
+  geom_node_text(aes(label = name), repel = TRUE, 
+                 point.padding = unit(0.2, "lines")) +
   theme_void()
 ```
 
@@ -211,9 +212,10 @@ desc_word_pairs %>%
   filter(n >= 5000) %>%
   graph_from_data_frame() %>%
   ggraph(layout = "fr") +
-  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
-  geom_node_point(color = "indianred4", size = 5) +
-  geom_node_text(aes(label = name), repel = TRUE) +
+  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "darkred") +
+  geom_node_point(size = 5) +
+  geom_node_text(aes(label = name), repel = TRUE,
+                 point.padding = unit(0.2, "lines")) +
   theme_void()
 
 ```
@@ -235,9 +237,10 @@ keyword_pairs %>%
   filter(n >= 700) %>%
   graph_from_data_frame() %>%
   ggraph(layout = "fr") +
-  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
-  geom_node_point(color = "royalblue3", size = 5) +
-  geom_node_text(aes(label = name), repel = TRUE) +
+  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "royalblue") +
+  geom_node_point(size = 5) +
+  geom_node_text(aes(label = name), repel = TRUE,
+                 point.padding = unit(0.2, "lines")) +
   theme_void()
 ```
 
@@ -268,9 +271,10 @@ keyword_cors %>%
   filter(correlation > .6) %>%
   graph_from_data_frame() %>%
   ggraph(layout = "fr") +
-  geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation)) +
-  geom_node_point(color = "royalblue3", size = 5) +
-  geom_node_text(aes(label = name), repel = TRUE) +
+  geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), edge_colour = "royalblue") +
+  geom_node_point(size = 5) +
+  geom_node_text(aes(label = name), repel = TRUE,
+                 point.padding = unit(0.2, "lines")) +
   theme_void()
 ```