Grayscale ALL the images · codingbooks/tidy-text-mining@7854b0a · GitHub
[go: up one dir, main page]

Skip to content

Commit 7854b0a

Browse files
committed
Grayscale ALL the images
1 parent 5ab8b09 commit 7854b0a

7 files changed

+825
-55
lines changed

01-tidy-text.Rmd

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -239,13 +239,12 @@ Now let's plot (Figure \@ref(fig:plotcompare)).
239239
library(scales)
240240
241241
# expect a warning about rows with missing values being removed
242-
ggplot(frequency, aes(x = proportion, y = `Jane Austen`, color = abs(`Jane Austen` - proportion))) +
243-
geom_abline(color = "gray40", lty = 2) +
244-
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
242+
ggplot(frequency, aes(x = proportion, y = `Jane Austen`)) +
243+
geom_abline(color = "gray50", lty = 2) +
244+
geom_jitter(alpha = 0.05, size = 2.5, width = 0.3, height = 0.3) +
245245
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
246246
scale_x_log10(labels = percent_format()) +
247247
scale_y_log10(labels = percent_format()) +
248-
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
249248
facet_wrap(~author, ncol = 2) +
250249
theme(legend.position="none") +
251250
labs(y = "Jane Austen", x = NULL)

02-sentiment-analysis.Rmd

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,8 @@ Now we can plot these sentiment scores across the plot trajectory of each novel.
113113
```{r sentimentplot, dependson = "janeaustensentiment", fig.width=9, fig.height=10, fig.cap="Sentiment through the narratives of Jane Austen's novels"}
114114
library(ggplot2)
115115
116-
ggplot(janeaustensentiment, aes(index, sentiment, fill = book)) +
117-
geom_col(show.legend = FALSE) +
116+
ggplot(janeaustensentiment, aes(index, sentiment)) +
117+
geom_col() +
118118
facet_wrap(~book, ncol = 2, scales = "free_x")
119119
```
120120

@@ -166,8 +166,8 @@ We now have an estimate of the net sentiment (positive - negative) in each chunk
166166
```{r compareplot, dependson = "comparesentiment", fig.width=9, fig.height=7.5, fig.cap="(ref:comparecap)"}
167167
bind_rows(afinn,
168168
bing_and_nrc) %>%
169-
ggplot(aes(index, sentiment, fill = method)) +
170-
geom_col(show.legend = FALSE) +
169+
ggplot(aes(index, sentiment)) +
170+
geom_col() +
171171
facet_wrap(~method, ncol = 1, scales = "free_y")
172172
```
173173

@@ -209,8 +209,8 @@ bing_word_counts %>%
209209
top_n(10) %>%
210210
ungroup() %>%
211211
mutate(word = reorder(word, n)) %>%
212-
ggplot(aes(word, n, fill = sentiment)) +
213-
geom_col(show.legend = FALSE) +
212+
ggplot(aes(word, n)) +
213+
geom_col() +
214214
facet_wrap(~sentiment, scales = "free_y") +
215215
labs(y = "Contribution to sentiment",
216216
x = NULL) +
@@ -252,7 +252,7 @@ tidy_books %>%
252252
inner_join(get_sentiments("bing")) %>%
253253
count(word, sentiment, sort = TRUE) %>%
254254
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
255-
comparison.cloud(colors = c("#F8766D", "#00BFC4"),
255+
comparison.cloud(colors = c("gray10", "gray80"),
256256
max.words = 100)
257257
```
258258

03-tf-idf.Rmd

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ There is one row in this `book_words` data frame for each word-book combination;
5050
```{r plottf, dependson = "book_words", fig.height=9, fig.width=9, fig.cap="Term Frequency Distribution in Jane Austen's Novels"}
5151
library(ggplot2)
5252
53-
ggplot(book_words, aes(n/total, fill = book)) +
54-
geom_histogram(show.legend = FALSE) +
53+
ggplot(book_words, aes(n/total)) +
54+
geom_histogram() +
5555
xlim(NA, 0.0009) +
5656
facet_wrap(~book, ncol = 2, scales = "free_y")
5757
```
@@ -79,9 +79,9 @@ freq_by_rank
7979

8080
The `rank` column here tells us the rank of each word within the frequency table; the table was already ordered by `n`, so we could use `row_number()` to find the rank. Then, we can calculate the term frequency in the same way we did before. Zipf's law is often visualized by plotting rank on the x-axis and term frequency on the y-axis, on logarithmic scales. Plotting this way, an inversely proportional relationship will have a constant, negative slope.
8181

82-
```{r zipf, dependson = "freq_by_rank", fig.width=7, fig.height=5, fig.cap="Zipf's law for Jane Austen's novels"}
82+
```{r zipf, dependson = "freq_by_rank", fig.width=6, fig.height=5, fig.cap="Zipf's law for Jane Austen's novels"}
8383
freq_by_rank %>%
84-
ggplot(aes(rank, `term frequency`, color = book)) +
84+
ggplot(aes(rank, `term frequency`, group = book)) +
8585
geom_line(size = 1.2, alpha = 0.8) +
8686
scale_x_log10() +
8787
scale_y_log10()
@@ -102,9 +102,9 @@ Classic versions of Zipf's law have
102102
$$\text{frequency} \propto \frac{1}{\text{rank}}$$
103103
and we have in fact gotten a slope close to -1 here. Let's plot this fitted power law with the data in Figure \@ref(fig:zipffit) to see how it looks.
104104

105-
```{r zipffit, dependson = "freq_by_rank", fig.width=7, fig.height=5, fig.cap="Fitting an exponent for Zipf's law with Jane Austen's novels"}
105+
```{r zipffit, dependson = "freq_by_rank", fig.width=6, fig.height=5, fig.cap="Fitting an exponent for Zipf's law with Jane Austen's novels"}
106106
freq_by_rank %>%
107-
ggplot(aes(rank, `term frequency`, color = book)) +
107+
ggplot(aes(rank, `term frequency`, group = book)) +
108108
geom_abline(intercept = -0.62, slope = -1.1, color = "gray50", linetype = 2) +
109109
geom_line(size = 1.2, alpha = 0.8) +
110110
scale_x_log10() +
@@ -148,7 +148,9 @@ plot_austen <- book_words %>%
148148
arrange(desc(tf_idf)) %>%
149149
mutate(word = factor(word, levels = rev(unique(word))))
150150
151-
ggplot(plot_austen[1:20,], aes(word, tf_idf, fill = book)) +
151+
plot_austen %>%
152+
top_n(20) %>%
153+
ggplot(aes(word, tf_idf)) +
152154
geom_col() +
153155
labs(x = NULL, y = "tf-idf") +
154156
coord_flip()
@@ -157,13 +159,12 @@ ggplot(plot_austen[1:20,], aes(word, tf_idf, fill = book)) +
157159
Let's look at the novels individually.
158160

159161
```{r plotseparate, dependson = "plot_austen", fig.height=10, fig.width=9, fig.cap="Highest tf-idf words in each of Jane Austen's Novels"}
160-
plot_austen <- plot_austen %>%
162+
plot_austen %>%
161163
group_by(book) %>%
162164
top_n(15) %>%
163-
ungroup
164-
165-
ggplot(plot_austen, aes(word, tf_idf, fill = book)) +
166-
geom_col(show.legend = FALSE) +
165+
ungroup %>%
166+
ggplot(aes(word, tf_idf)) +
167+
geom_col() +
167168
labs(x = NULL, y = "tf-idf") +
168169
facet_wrap(~book, ncol = 2, scales = "free") +
169170
coord_flip()
@@ -212,7 +213,9 @@ plot_physics <- physics_words %>%
212213
"Tesla, Nikola",
213214
"Einstein, Albert")))
214215
215-
ggplot(plot_physics[1:20,], aes(word, tf_idf, fill = author)) +
216+
plot_physics %>%
217+
top_n(20) %>%
218+
ggplot(aes(word, tf_idf)) +
216219
geom_col() +
217220
labs(x = NULL, y = "tf-idf") +
218221
coord_flip()
@@ -221,13 +224,13 @@ ggplot(plot_physics[1:20,], aes(word, tf_idf, fill = author)) +
221224
Nice! Let's look at each text individually in Figure \@ref(fig:physicsseparate).
222225

223226
```{r physicsseparate, dependson = "plot_physics", fig.height=7, fig.width=8, fig.cap="Highest tf-idf words in each physics texts"}
224-
plot_physics <- plot_physics %>%
227+
plot_physics %>%
225228
group_by(author) %>%
226229
top_n(15, tf_idf) %>%
227-
mutate(word = reorder(word, tf_idf))
228-
229-
ggplot(plot_physics, aes(word, tf_idf, fill = author)) +
230-
geom_col(show.legend = FALSE) +
230+
ungroup() %>%
231+
mutate(word = reorder(word, tf_idf)) %>%
232+
ggplot(aes(word, tf_idf)) +
233+
geom_col() +
231234
labs(x = NULL, y = "tf-idf") +
232235
facet_wrap(~author, ncol = 2, scales = "free") +
233236
coord_flip()
@@ -278,8 +281,8 @@ plot_physics <- physics_words %>%
278281
"Tesla, Nikola",
279282
"Einstein, Albert")))
280283
281-
ggplot(plot_physics, aes(word, tf_idf, fill = author)) +
282-
geom_col(show.legend = FALSE) +
284+
ggplot(plot_physics, aes(word, tf_idf)) +
285+
geom_col() +
283286
labs(x = NULL, y = "tf-idf") +
284287
facet_wrap(~author, ncol = 2, scales = "free") +
285288
coord_flip()

0 commit comments

Comments
 (0)
0