#This exercise includes code adapted from code written by creators of the 'tidytext' R package: David Robinson (http://varianceexplained.org/r/tidytext-gender-plots/) and Julia Silge (https://juliasilge.com/blog/gender-pronouns/)

#Welcome to R. If you're already familiar you can skip ahead to step 0 or 1. 

#This document is an "R script" which provides commands that R can run. Anything written after the hash is ignored by R.
#This means anything after a hash is instructions or information for you.
#R is a 'command line' program which means you tell the program what to do by typing commands
#For example to multiply 8 by 32 type:

8 * 32 #Put your cursor on this line and click 'run' to execute the command

#The answer (256) is printed in the 'console' below

#Commands can also create 'objects', which are then listed in the 'global environment' to the right.
#An object can be reused in later commands. The command below stores the sum of 8 and 32 in an object:

Total = 8 + 32 #as before: click 'run' with your cursor on this line to run the command

#'Total' should now be listed to the right.
#The 'Total' object can be used inside new commands, as shown below:

NewTotal = 8 * Total

AnotherTotal = NewTotal + Total

#Now you understand the basics you can start. Don't worry too much, in this exercise you will be running code rather than 
#writing it.


#0. Install the packages below if you have not already. To install a package click 'run' with your cursor on its line of code.
#Each line first checks whether the package is already available, so running the script again does not download it a second time.

if (!requireNamespace("tidyverse", quietly = TRUE)) install.packages("tidyverse") #set of packages for data manipulation and visualization (in particular, the visualization package ggplot2)
if (!requireNamespace("tidytext", quietly = TRUE)) install.packages("tidytext") #package for convenient ("tidy") text mining in R
if (!requireNamespace("gutenbergr", quietly = TRUE)) install.packages("gutenbergr") #package for easily downloading books from the Project Gutenberg collection

#1. Load the installed packages to your library

#Attach the packages so that their functions can be called by name (install them first, in step 0, if necessary)
library(tidyverse) #set of packages for data manipulation and visualization (in particular, the visualization package ggplot2)
library(tidytext) #package for convenient ("tidy") text mining in R
library(gutenbergr) #package for easily downloading books from the Project Gutenberg collection

#2. Preparing two small corpora: Jane Austen and Walter Scott ####

#Looking up the metadata for Austen and Scott: this gives us the Gutenberg IDs of their books
#Remember, click run with your cursor on a line of code to run that command
metadata_austen = gutenberg_works(author == "Austen, Jane")
metadata_scott = gutenberg_works(author == "Scott, Walter")

#The metadata for both authors should now be listed in the global environment to the right ->
#Click on either object to see which books will be used for text mining.

#Downloading every Austen and Scott book found above, using the Gutenberg IDs (this may take a few minutes)
austen_gutenberg = metadata_austen %>%
  pull(gutenberg_id) %>%
  gutenberg_download()
scott_gutenberg = metadata_scott %>%
  pull(gutenberg_id) %>%
  gutenberg_download()

#you should now be able to see objects for each author in the global environment.


# 3. Text mining and plotting Austen's novels ####

#Now the text is there we can start text mining

#Dissecting the texts into bigrams: 2-word combinations
#collapse = NULL makes tidytext split each line of text on its own, so no bigram runs across two lines.
#(The older spelling collapse = FALSE is no longer accepted by tidytext 0.2.7 and later.)
bigrams01 = austen_gutenberg %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2, collapse = NULL) #click run with your cursor on this line to run the command
#Splitting each bigram into its first and second word
bigrams_separated01 = bigrams01 %>%
  separate(bigram, c("word1", "word2"), sep = " ")

#Selecting only the bigrams that have 'he' or 'she' as their first word (e.g., "she said")
he_she_words01 = bigrams_separated01 %>%
  filter(word1 %in% c("he", "she"))

#Finding the ratio of words paired with 'he' or 'she'
#count() gives one row per (pronoun, following word) pair; pivot_wider() turns that into one row per
#following word with a 'he' column and a 'she' column (0 where the pair never occurs).
#Inside mutate() the order matters: 'total' is computed from the raw counts before 'he' and 'she'
#are replaced by smoothed proportions (+1 so that a zero count never produces log2(0)).
he_she_counts_austen = he_she_words01 %>%
  count(word1, word2) %>%
  pivot_wider(names_from = word1, values_from = n, values_fill = list(n = 0)) %>%
  mutate(total = he + she,
         he = (he + 1) / sum(he + 1),
         she = (she + 1) / sum(she + 1),
         log_ratio = log2(she / he), #positive = more common after 'she', negative = more common after 'he'
         abs_ratio = abs(log_ratio)) %>%
  arrange(desc(log_ratio)) #run the command from this line

#Making plot 1 (Austen)

#Keeping only words that occur at least 50 times, and dropping a few
#uninformative ones (to avoid redundant bigrams like "she herself")
he_she_counts_austen = he_she_counts_austen %>%
  filter(total >= 50) %>% #the minimal number of occurrences for a word
  filter(!(word2 %in% c("he", "she", "himself", "herself")))
#Plotting: one labelled point per word, total uses across, she/he ratio up
plot1_austen = ggplot(data = he_she_counts_austen,
                      mapping = aes(x = total, y = log_ratio)) +
  geom_point() +
  geom_text(mapping = aes(label = word2),
            check_overlap = TRUE, hjust = 1, vjust = 1) +
  scale_x_log10(breaks = c(1e2, 1e3, 1e4, 1e5)) +
  scale_y_continuous(breaks = -2:2,
                     labels = c('4X "he"', '2X "he"', "Same", '2X "she"', '4X "she"')) +
  xlab('Total uses after "he" or "she" (log scale)') +
  ylab('Relative uses after "she" to after "he"') +
  ggtitle("Words occurring at least 50 times after he/she",
          subtitle = "Jane Austen's novels") +
  expand_limits(x = 75) #run the command from this line - this creates the plot
plot1_austen #run this command to see the plot

#The plot should show in the bottom-right panel. The words in the top half more commonly occur after 'she', the words in the
#bottom half more commonly occur after 'he'. 

#Making plot 2 (Austen)

#Preparing the data for plotting: within each direction ('she'-leaning and 'he'-leaning) keep the
#20 words with the strongest skew, then order the words by their ratio so the plot reads top to bottom
he_she_counts_austen = he_she_counts_austen %>%
  mutate(abslogratio = abs(log_ratio)) %>%
  group_by(log_ratio < 0) %>%
  slice_max(abslogratio, n = 20) %>% #choosing the number of words to display (tied words are all kept)
  ungroup() %>%
  mutate(word = reorder(word2, log_ratio))

#Plotting
#The legend labels are named by the value of 'log_ratio < 0' they belong to, so each label stays
#attached to the right colour even if only one of the two groups is present in the data.
plot2_austen = ggplot(he_she_counts_austen, aes(word, log_ratio, color = log_ratio < 0)) +
  geom_segment(aes(x = word, xend = word,
                   y = 0, yend = log_ratio),
               linewidth = 1.1, alpha = 0.6) + #'linewidth' is the line-thickness argument in ggplot2 3.4.0 and later
  geom_point(size = 3.5) +
  coord_flip() +
  labs(x = NULL,
       y = "Relative appearance after 'she' compared to 'he'",
       title = "Words paired with he/she",
       subtitle = "Jane Austen's novels") +
  scale_color_discrete(name = "",
                       labels = c("FALSE" = "More 'she'", "TRUE" = "More 'he'")) +
  scale_y_continuous(breaks = seq(-3, 3),
                     labels = c("0.125x", "0.25x", "0.5x",
                                "Same", "2x", "4x", "8x"))
plot2_austen #run this command to create the second plot. This plot is perhaps more intuitive to read.


#4. Text mining and plotting Scott's novels ####
#Now let's move on to Walter Scott. This is the same code as before, but it uses the text from Scott's novels.
#Check how this code differs from the code above - which parts have to be changed to use it for a new author?

#Dissecting the texts into bigrams: 2-word combinations
#collapse = NULL makes tidytext split each line of text on its own, so no bigram runs across two lines.
#(The older spelling collapse = FALSE is no longer accepted by tidytext 0.2.7 and later.)
bigrams02 = scott_gutenberg %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2, collapse = NULL)
#Splitting each bigram into its first and second word
bigrams_separated02 = bigrams02 %>%
  separate(bigram, c("word1", "word2"), sep = " ")

#Selecting only the bigrams that have 'he' or 'she' as their first word (e.g., "she said")
he_she_words02 = bigrams_separated02 %>%
  filter(word1 %in% c("he", "she"))

#Finding the ratio of words paired with 'he' or 'she'
#count() gives one row per (pronoun, following word) pair; pivot_wider() turns that into one row per
#following word with a 'he' column and a 'she' column (0 where the pair never occurs).
#Inside mutate() the order matters: 'total' is computed from the raw counts before 'he' and 'she'
#are replaced by smoothed proportions (+1 so that a zero count never produces log2(0)).
he_she_counts_scott = he_she_words02 %>%
  count(word1, word2) %>%
  pivot_wider(names_from = word1, values_from = n, values_fill = list(n = 0)) %>%
  mutate(total = he + she,
         he = (he + 1) / sum(he + 1),
         she = (she + 1) / sum(she + 1),
         log_ratio = log2(she / he), #positive = more common after 'she', negative = more common after 'he'
         abs_ratio = abs(log_ratio)) %>%
  arrange(desc(log_ratio))

#Making plot 1 (Scott)

#Keeping only words that occur at least 50 times, and dropping a few
#uninformative ones (to avoid redundant bigrams like "she herself")
he_she_counts_scott = he_she_counts_scott %>%
  filter(total >= 50) %>% #the minimal number of occurrences for a word
  filter(!(word2 %in% c("he", "she", "himself", "herself")))

#Plotting: one labelled point per word, total uses across, she/he ratio up
plot1_scott = ggplot(data = he_she_counts_scott,
                     mapping = aes(x = total, y = log_ratio)) +
  geom_point() +
  geom_text(mapping = aes(label = word2),
            check_overlap = TRUE, hjust = 1, vjust = 1) +
  scale_x_log10(breaks = c(1e2, 1e3, 1e4, 1e5)) +
  scale_y_continuous(breaks = -2:2,
                     labels = c('4X "he"', '2X "he"', "Same", '2X "she"', '4X "she"')) +
  xlab('Total uses after "he" or "she" (log scale)') +
  ylab('Relative uses after "she" to after "he"') +
  ggtitle("Words occurring at least 50 times after he/she",
          subtitle = "Walter Scott's novels") +
  expand_limits(x = 75)
plot1_scott


#Making plot 2 (Scott)

#Preparing the data for plotting: within each direction ('she'-leaning and 'he'-leaning) keep the
#20 words with the strongest skew, then order the words by their ratio so the plot reads top to bottom
he_she_counts_scott = he_she_counts_scott %>%
  mutate(abslogratio = abs(log_ratio)) %>%
  group_by(log_ratio < 0) %>%
  slice_max(abslogratio, n = 20) %>% #choosing the number of words to display (tied words are all kept)
  ungroup() %>%
  mutate(word = reorder(word2, log_ratio))

#Plotting
#The legend labels are named by the value of 'log_ratio < 0' they belong to, so each label stays
#attached to the right colour even if only one of the two groups is present in the data.
plot2_scott = ggplot(he_she_counts_scott, aes(word, log_ratio, color = log_ratio < 0)) +
  geom_segment(aes(x = word, xend = word,
                   y = 0, yend = log_ratio),
               linewidth = 1.1, alpha = 0.6) + #'linewidth' is the line-thickness argument in ggplot2 3.4.0 and later
  geom_point(size = 3.5) +
  coord_flip() +
  labs(x = NULL,
       y = "Relative appearance after 'she' compared to 'he'",
       title = "Words paired with he/she",
       subtitle = "Walter Scott's novels") +
  scale_color_discrete(name = "",
                       labels = c("FALSE" = "More 'she'", "TRUE" = "More 'he'")) +
  scale_y_continuous(breaks = seq(-3, 3),
                     labels = c("0.125x", "0.25x", "0.5x",
                                "Same", "2x", "4x", "8x"))
plot2_scott

#Now you have intuitive plots for both authors that allow you to compare them.
#You can use this code to text mine the work of any author listed on Project Gutenberg.
#For example, an earlier writer:

#Looking up Shakespeare's works, then downloading them by their Gutenberg IDs
metadata_shakespeare = gutenberg_works(author == "Shakespeare, William")
shakespeare_gutenberg = metadata_shakespeare %>%
  pull(gutenberg_id) %>%
  gutenberg_download()

#replace scott_gutenberg with shakespeare_gutenberg to create the new code. 

#or a later writer:

#The author's surname ends in an 'e' with two dots above it. It is written here as the escape "\u00eb"
#so that the name still matches the Gutenberg catalogue whatever encoding this file is saved in.
metadata_bronte = gutenberg_works(author == "Bront\u00eb, Charlotte")
bronte_gutenberg = gutenberg_download(c(metadata_bronte$gutenberg_id))