#install these packages if necessary
library(tidyverse) #set of packages for data manipulation and visualization (in particular, the visualization package ggplot2)
library(tidytext) #package for convenient ("tidy") text mining in R
library(gutenbergr) #package for easy downloading books from Gutenberg Project collection

# Look up the Project Gutenberg catalogue entries for each author, then
# download every listed work by its Gutenberg id.
metadata_shakespeare <- gutenberg_works(author == "Shakespeare, William")
shakespeare_gutenberg <- gutenberg_download(metadata_shakespeare$gutenberg_id)

metadata_bronte <- gutenberg_works(author == "Brontë, Charlotte")
bronte_gutenberg <- gutenberg_download(metadata_bronte$gutenberg_id)

# Break Shakespeare's text into bigrams (pairs of adjacent words), split each
# pair into its two words, and keep the pairs that start with "he" or "she".
#
# The former `collapse = FALSE` argument is intentionally omitted:
# tidytext >= 0.2.7 rejects a logical `collapse` (the call stops with a
# deprecation error), and its default, NULL, already tokenizes each row on
# its own, which is what `collapse = FALSE` asked for.
bigrams01 <- shakespeare_gutenberg %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

# Bigrams are "first second"; put the two words in separate columns.
bigrams_separated01 <- bigrams01 %>%
  separate(bigram, c("word1", "word2"), sep = " ")

he_she_words01 <- bigrams_separated01 %>%
  filter(word1 %in% c("he", "she"))

# Compare how often each word follows "she" versus "he" in Shakespeare's
# works, and keep the most skewed words in each direction for plotting.
he_she_counts_shakespeare <- he_she_words01 %>%
  # One row per following word, with its "he" and "she" counts side by side
  # (0 where a word never follows that pronoun).
  count(word1, word2) %>%
  spread(word1, n, fill = 0) %>%
  # Remember the raw number of pairs before the counts are rescaled.
  mutate(total = he + she) %>%
  # Add-one smoothed share of each pronoun's column.
  mutate(
    he = (he + 1) / sum(he + 1),
    she = (she + 1) / sum(she + 1)
  ) %>%
  # Positive log2 ratio: relatively more common after "she";
  # negative: relatively more common after "he".
  mutate(
    log_ratio = log2(she / he),
    abs_ratio = abs(log_ratio)
  ) %>%
  arrange(desc(log_ratio)) %>%
  # Drop redundant pairs such as "she herself", and words that follow
  # he/she fewer than 50 times in total.
  filter(!(word2 %in% c("himself", "herself", "she", "he")) & total >= 50) %>%
  # Within each direction (log_ratio below zero or not), keep the 20 words
  # with the largest absolute log ratio; 20 is the number of words displayed.
  mutate(abslogratio = abs(log_ratio)) %>%
  group_by(log_ratio < 0) %>%
  top_n(20, abslogratio) %>%
  ungroup() %>%
  # Order the words by log ratio so the plot axis is sorted.
  mutate(word = reorder(word2, log_ratio))

# Plot one point per word, joined to zero by a segment, showing whether the
# word is relatively more common after "she" (log_ratio >= 0) or after "he"
# (log_ratio < 0) in Shakespeare's works.
plot_shakespeare <- ggplot(
  he_she_counts_shakespeare,
  aes(word, log_ratio, color = log_ratio < 0)
) +
  geom_segment(
    aes(x = word, xend = word, y = 0, yend = log_ratio),
    size = 1.1, alpha = 0.6
  ) +
  geom_point(size = 3.5) +
  coord_flip() +
  labs(
    x = NULL,
    y = "Relative appearance after 'she' compared to 'he'",
    title = "Words paired with he/she",
    subtitle = "William Shakespeare's works"
  ) +
  # Labels are named by the value of `log_ratio < 0` they describe, so the
  # legend stays correct even if only one of the two groups is present
  # (positional labels would then be mismatched).
  scale_color_discrete(
    name = "",
    labels = c("FALSE" = "More 'she'", "TRUE" = "More 'he'")
  ) +
  # log_ratio is a base-2 logarithm, so each unit on the axis is a doubling:
  # -3..3 corresponds to 1/8x..8x.
  scale_y_continuous(
    breaks = seq(-3, 3),
    labels = c("0.125x", "0.25x", "0.5x", "Same", "2x", "4x", "8x")
  )
plot_shakespeare

# Break Charlotte Bronte's text into bigrams (pairs of adjacent words), split
# each pair into its two words, and keep the pairs that start with "he" or
# "she".
#
# The former `collapse = FALSE` argument is intentionally omitted:
# tidytext >= 0.2.7 rejects a logical `collapse` (the call stops with a
# deprecation error), and its default, NULL, already tokenizes each row on
# its own, which is what `collapse = FALSE` asked for.
bigrams02 <- bronte_gutenberg %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

# Bigrams are "first second"; put the two words in separate columns.
bigrams_separated02 <- bigrams02 %>%
  separate(bigram, c("word1", "word2"), sep = " ")

he_she_words02 <- bigrams_separated02 %>%
  filter(word1 %in% c("he", "she"))

# Compare how often each word follows "she" versus "he" in Charlotte Bronte's
# works, and keep the most skewed words in each direction for plotting.
he_she_counts_bronte <- he_she_words02 %>%
  # One row per following word, with its "he" and "she" counts side by side
  # (0 where a word never follows that pronoun).
  count(word1, word2) %>%
  spread(word1, n, fill = 0) %>%
  # Remember the raw number of pairs before the counts are rescaled.
  mutate(total = he + she) %>%
  # Add-one smoothed share of each pronoun's column.
  mutate(
    he = (he + 1) / sum(he + 1),
    she = (she + 1) / sum(she + 1)
  ) %>%
  # Positive log2 ratio: relatively more common after "she";
  # negative: relatively more common after "he".
  mutate(
    log_ratio = log2(she / he),
    abs_ratio = abs(log_ratio)
  ) %>%
  arrange(desc(log_ratio)) %>%
  # Drop redundant pairs such as "she herself", and words that follow
  # he/she fewer than 50 times in total.
  filter(!(word2 %in% c("himself", "herself", "she", "he")) & total >= 50) %>%
  # Within each direction (log_ratio below zero or not), keep the 20 words
  # with the largest absolute log ratio; 20 is the number of words displayed.
  mutate(abslogratio = abs(log_ratio)) %>%
  group_by(log_ratio < 0) %>%
  top_n(20, abslogratio) %>%
  ungroup() %>%
  # Order the words by log ratio so the plot axis is sorted.
  mutate(word = reorder(word2, log_ratio))

# Plot one point per word, joined to zero by a segment, showing whether the
# word is relatively more common after "she" (log_ratio >= 0) or after "he"
# (log_ratio < 0) in Charlotte Bronte's works.
plot_bronte <- ggplot(
  he_she_counts_bronte,
  aes(word, log_ratio, color = log_ratio < 0)
) +
  geom_segment(
    aes(x = word, xend = word, y = 0, yend = log_ratio),
    size = 1.1, alpha = 0.6
  ) +
  geom_point(size = 3.5) +
  coord_flip() +
  labs(
    x = NULL,
    y = "Relative appearance after 'she' compared to 'he'",
    title = "Words paired with he/she",
    subtitle = "Charlotte Bronte's novels"
  ) +
  # Labels are named by the value of `log_ratio < 0` they describe, so the
  # legend stays correct even if only one of the two groups is present
  # (positional labels would then be mismatched).
  scale_color_discrete(
    name = "",
    labels = c("FALSE" = "More 'she'", "TRUE" = "More 'he'")
  ) +
  # log_ratio is a base-2 logarithm, so each unit on the axis is a doubling:
  # -3..3 corresponds to 1/8x..8x.
  scale_y_continuous(
    breaks = seq(-3, 3),
    labels = c("0.125x", "0.25x", "0.5x", "Same", "2x", "4x", "8x")
  )
plot_bronte