# Words that follow "he" vs. "she" in the works of two authors.
#
# For each author the script downloads every Project Gutenberg text listed for
# them, splits the text into bigrams, keeps the bigrams whose first word is
# "he" or "she", and plots the words that are relatively most over-represented
# after one pronoun compared to the other.

# Install these packages if necessary.
library(tidyverse)  # data manipulation and visualization (notably ggplot2)
library(tidytext)   # convenient ("tidy") text mining in R
library(gutenbergr) # easy downloading of books from Project Gutenberg


# Helpers ----

# Summarise which second words are skewed towards "she" or towards "he".
#
# he_she_words: data frame with columns `word1` (only "he"/"she") and `word2`,
#               one row per bigram occurrence.
# min_total:    minimum combined he + she count for a word to be kept.
# n_words:      number of words to keep on each side (ties are all kept).
#
# Returns one row per retained `word2` with the smoothed shares `he` and
# `she`, the raw `total`, `log_ratio` = log2(she / he) (positive means
# relatively more frequent after "she"), and `word`, a factor ordered by
# `log_ratio` for plotting.
he_she_log_ratios <- function(he_she_words, min_total = 50, n_words = 20) {
  he_she_words %>%
    count(word1, word2) %>%
    # One column per pronoun; words never seen after a pronoun count as 0.
    pivot_wider(
      names_from = word1,
      values_from = n,
      values_fill = list(n = 0L)
    ) %>%
    mutate(
      total = he + she,
      # Add-one smoothing, then normalise by the column total over ALL words
      # (computed before any filtering) so the two pronouns are comparable.
      he = (he + 1) / sum(he + 1),
      she = (she + 1) / sum(she + 1),
      log_ratio = log2(she / he),
      abs_ratio = abs(log_ratio)
    ) %>%
    arrange(desc(log_ratio)) %>%
    # Drop redundant pairs such as "she herself" and rarely seen words.
    filter(
      !word2 %in% c("himself", "herself", "she", "he"),
      total >= min_total
    ) %>%
    mutate(abslogratio = abs(log_ratio)) %>%
    # Pick the most extreme words separately on the "she" and the "he" side.
    group_by(log_ratio < 0) %>%
    top_n(n_words, abslogratio) %>%
    ungroup() %>%
    mutate(word = reorder(word2, log_ratio))
}

# Lollipop chart of the log ratios produced by he_she_log_ratios().
#
# he_she_counts: data frame with columns `word` and `log_ratio`.
# subtitle:      plot subtitle naming the corpus.
#
# Returns the ggplot object (it is not printed here).
plot_he_she <- function(he_she_counts, subtitle) {
  ggplot(he_she_counts, aes(word, log_ratio, color = log_ratio < 0)) +
    # NOTE: for line geoms ggplot2 >= 3.4 prefers `linewidth`; `size` is kept
    # so the script also runs on older ggplot2 versions.
    geom_segment(
      aes(x = word, xend = word, y = 0, yend = log_ratio),
      size = 1.1,
      alpha = 0.6
    ) +
    geom_point(size = 3.5) +
    coord_flip() +
    labs(
      x = NULL,
      y = "Relative appearance after 'she' compared to 'he'",
      title = "Words paired with he/she",
      subtitle = subtitle
    ) +
    # FALSE (log_ratio >= 0) is the first level, TRUE (log_ratio < 0) second.
    scale_color_discrete(name = "", labels = c("More 'she'", "More 'he'")) +
    # log2 breaks -3..3 are labelled as the multiples 2^-3 .. 2^3.
    scale_y_continuous(
      breaks = seq(-3, 3),
      labels = c("0.125x", "0.25x", "0.5x", "Same", "2x", "4x", "8x")
    )
}


# Download the corpora ----

metadata_shakespeare <- gutenberg_works(author == "Shakespeare, William")
shakespeare_gutenberg <- gutenberg_download(metadata_shakespeare$gutenberg_id)

# The author name must match the Gutenberg metadata exactly, including the
# diaeresis ("Brontë"); any other spelling matches no works.
metadata_bronte <- gutenberg_works(author == "Brontë, Charlotte")
bronte_gutenberg <- gutenberg_download(metadata_bronte$gutenberg_id)


# William Shakespeare ----

# `collapse` is left at its default: recent tidytext releases reject a
# logical value there, and the default already tokenizes each row separately.
bigrams01 <- shakespeare_gutenberg %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

bigrams_separated01 <- bigrams01 %>%
  separate(bigram, c("word1", "word2"), sep = " ")

he_she_words01 <- bigrams_separated01 %>%
  filter(word1 %in% c("he", "she"))

# Ratio of words paired with "he" or "she", prepared for plotting.
he_she_counts_shakespeare <- he_she_log_ratios(he_she_words01)

plot_shakespeare <- plot_he_she(
  he_she_counts_shakespeare,
  subtitle = "William Shakespeare's works"
)
plot_shakespeare


# Charlotte Brontë ----

bigrams02 <- bronte_gutenberg %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

bigrams_separated02 <- bigrams02 %>%
  separate(bigram, c("word1", "word2"), sep = " ")

he_she_words02 <- bigrams_separated02 %>%
  filter(word1 %in% c("he", "she"))

# Ratio of words paired with "he" or "she", prepared for plotting.
he_she_counts_bronte <- he_she_log_ratios(he_she_words02)

plot_bronte <- plot_he_she(
  he_she_counts_bronte,
  subtitle = "Charlotte Bronte's novels"
)
plot_bronte