# This exercise includes code adapted from code written by the creators of the
# 'tidytext' R package: David Robinson
# (http://varianceexplained.org/r/tidytext-gender-plots/) and Julia Silge
# (https://juliasilge.com/blog/gender-pronouns/)
#
# Purpose: download the Project Gutenberg texts of two authors (Jane Austen and
# Walter Scott), count which words follow "he" and "she", and plot how strongly
# each word is associated with one pronoun or the other.

# Welcome to R. If you're already familiar you can skip ahead to step 0 or 1.
# This document is an "R script" which provides commands that R can run.
# Anything written after the hash is ignored by R.
# This means anything after a hash is instructions or information for you.

# R is a 'command line' program which means you tell the program what to do by
# typing commands.
# For example, to multiply 8 by 32 type:
8 * 32 # Click run with your cursor on this line to 'run' the command
# You should be able to see the answer (256) in the 'console' below.

# You can also use commands to create 'objects' that are then listed in the
# 'global environment' to the right.
# These objects can then be used in other commands. Use the command below to
# make an object which is the total of 8 and 32:
Total = 8 + 32 # remember, click 'run' with your cursor on this line to run the command
# You should see the Total listed to the right now.

# The 'Total' object can be used in new commands like below:
NewTotal = 8 * (Total)
AnotherTotal = (NewTotal) + (Total)

# Now you understand the basics you can start. Don't worry too much, in this
# exercise you will be running code rather than writing it.

# 0. Install the packages below if you have not already. ####
# To install a package click 'run' with your cursor on the line of code.
install.packages("tidyverse") # set of packages for data manipulation and visualization (in particular, the visualization package ggplot2)
install.packages("tidytext") # package for convenient ("tidy") text mining in R
install.packages("gutenbergr") # package for easy downloading of books from the Project Gutenberg collection

# 1. Load the installed packages to your library ####
# Install these packages first (step 0) if necessary.
library(tidyverse) # set of packages for data manipulation and visualization (in particular, the visualization package ggplot2)
library(tidytext) # package for convenient ("tidy") text mining in R
library(gutenbergr) # package for easy downloading of books from the Project Gutenberg collection

# 2. Preparing two small corpora: Jane Austen and Walter Scott ####

# Extracting metadata on Austen and Scott: to be able to get the Gutenberg IDs
# of their books.
# Remember, click run with your cursor on the line of code to run the command.
metadata_austen = gutenberg_works(author == "Austen, Jane")
metadata_scott = gutenberg_works(author == "Scott, Walter")
# You should now be able to see the metadata for both authors in the global
# environment to the right ->
# If you click on these you can see which books will be used for text mining.

# Downloading all of Austen's and Scott's books on Gutenberg, using IDs
# (this may take a few minutes).
austen_gutenberg = gutenberg_download(c(metadata_austen$gutenberg_id))
scott_gutenberg = gutenberg_download(c(metadata_scott$gutenberg_id))
# You should now be able to see objects for each author in the global
# environment.

# 3. Text mining and plotting Austen's novels ####

# Now the text is there we can start text mining.

# Dissecting the texts into bigrams: 2-word combinations.
# collapse = NULL means "tokenize each row of text on its own"; current
# versions of tidytext no longer accept collapse = FALSE for this.
# Click run with your cursor on this command to run it.
bigrams01 = austen_gutenberg %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2, collapse = NULL)

# Splitting each bigram into its first and second word.
bigrams_separated01 = bigrams01 %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Selecting only the bigrams that have 'he' or 'she' as their first word
# (e.g., "she said").
he_she_words01 = bigrams_separated01 %>%
  filter(word1 %in% c("he", "she"))

# Finding the ratio of words paired with 'he' or 'she'.
# Each count gets +1 before being turned into a share, so a word that never
# follows one of the pronouns does not produce a division by zero.
# log_ratio is on a log2 scale: +1 means twice as common after 'she',
# -1 means twice as common after 'he'.
# Run the command from this line.
he_she_counts_austen = he_she_words01 %>%
  count(word1, word2) %>%
  spread(word1, n, fill = 0) %>%
  mutate(
    total = he + she,
    he = (he + 1) / sum(he + 1),
    she = (she + 1) / sum(she + 1),
    log_ratio = log2(she / he),
    abs_ratio = abs(log_ratio)
  ) %>%
  arrange(desc(log_ratio))

# Making plot 1 (Austen)

# Filtering out several unnecessary words (to avoid redundant bigrams like
# "she herself").
# Also, setting the minimal number of occurrences for a word (total >= 50).
he_she_counts_austen = he_she_counts_austen %>%
  filter(
    !word2 %in% c("himself", "herself", "she", "he"),
    total >= 50
  )

# Plotting
# Run the command from this line - this creates the plot.
plot1_austen = ggplot(he_she_counts_austen, aes(total, log_ratio)) +
  geom_point() +
  scale_x_log10(breaks = c(100, 1000, 10000, 1e5)) +
  geom_text(aes(label = word2), vjust = 1, hjust = 1, check_overlap = TRUE) +
  scale_y_continuous(
    breaks = seq(-2, 2),
    labels = c('4X "he"', '2X "he"', "Same", '2X "she"', '4X "she"')
  ) +
  labs(
    x = 'Total uses after "he" or "she" (log scale)',
    y = 'Relative uses after "she" to after "he"',
    title = "Words occurring at least 50 times after he/she",
    subtitle = "Jane Austen's novels"
  ) +
  expand_limits(x = 75)

plot1_austen # run this command to see the plot
# The plot should show in the bottom-right panel. The words in the top half
# more commonly occur after 'she', the words in the bottom half more commonly
# occur after 'he'.

# Making plot 2 (Austen)

# Preparing the data for plotting: keep the 20 most strongly skewed words on
# each side ('she' side and 'he' side) and order them by log_ratio.
he_she_counts_austen = he_she_counts_austen %>%
  mutate(abslogratio = abs(log_ratio)) %>%
  group_by(log_ratio < 0) %>%
  top_n(20, abslogratio) %>% # choosing the number of words to display
  ungroup() %>%
  mutate(word = reorder(word2, log_ratio))

# Plotting
plot2_austen = ggplot(
  he_she_counts_austen,
  aes(word, log_ratio, color = log_ratio < 0)
) +
  geom_segment(
    aes(x = word, xend = word, y = 0, yend = log_ratio),
    size = 1.1, alpha = 0.6
  ) +
  geom_point(size = 3.5) +
  coord_flip() +
  labs(
    x = NULL,
    y = "Relative appearance after 'she' compared to 'he'",
    title = "Words paired with he/she",
    subtitle = "Jane Austen's novels"
  ) +
  scale_color_discrete(name = "", labels = c("More 'she'", "More 'he'")) +
  scale_y_continuous(
    breaks = seq(-3, 3),
    labels = c("0.125x", "0.25x", "0.5x", "Same", "2x", "4x", "8x")
  )

plot2_austen # run this command to create the second plot. This plot is perhaps more intuitive to read.

# 4. Text mining and plotting Scott's novels ####

# Now let's move on to Walter Scott. This is the same code as before but uses
# the text from Scott's novels.
# Check how this code is different to the above - which aspects have to be
# changed to use it for a new author?

# Dissecting the texts into bigrams: 2-word combinations.
# collapse = NULL means "tokenize each row of text on its own"; current
# versions of tidytext no longer accept collapse = FALSE for this.
bigrams02 = scott_gutenberg %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2, collapse = NULL)

# Splitting each bigram into its first and second word.
bigrams_separated02 = bigrams02 %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Selecting only the bigrams that have 'he' or 'she' as their first word
# (e.g., "she said").
he_she_words02 = bigrams_separated02 %>%
  filter(word1 %in% c("he", "she"))

# Finding the ratio of words paired with 'he' or 'she'
# (+1 smoothing and log2 scale, exactly as for Austen above).
he_she_counts_scott = he_she_words02 %>%
  count(word1, word2) %>%
  spread(word1, n, fill = 0) %>%
  mutate(
    total = he + she,
    he = (he + 1) / sum(he + 1),
    she = (she + 1) / sum(she + 1),
    log_ratio = log2(she / he),
    abs_ratio = abs(log_ratio)
  ) %>%
  arrange(desc(log_ratio))

# Making plot 1 (Scott)

# Filtering out several unnecessary words (to avoid redundant bigrams like
# "she herself").
# Also, setting the minimal number of occurrences for a word (total >= 50).
he_she_counts_scott = he_she_counts_scott %>%
  filter(
    !word2 %in% c("himself", "herself", "she", "he"),
    total >= 50
  )

# Plotting
plot1_scott = ggplot(he_she_counts_scott, aes(total, log_ratio)) +
  geom_point() +
  scale_x_log10(breaks = c(100, 1000, 10000, 1e5)) +
  geom_text(aes(label = word2), vjust = 1, hjust = 1, check_overlap = TRUE) +
  scale_y_continuous(
    breaks = seq(-2, 2),
    labels = c('4X "he"', '2X "he"', "Same", '2X "she"', '4X "she"')
  ) +
  labs(
    x = 'Total uses after "he" or "she" (log scale)',
    y = 'Relative uses after "she" to after "he"',
    title = "Words occurring at least 50 times after he/she",
    subtitle = "Walter Scott's novels"
  ) +
  expand_limits(x = 75)

plot1_scott

# Making plot 2 (Scott)

# Preparing the data for plotting: keep the 20 most strongly skewed words on
# each side ('she' side and 'he' side) and order them by log_ratio.
he_she_counts_scott = he_she_counts_scott %>%
  mutate(abslogratio = abs(log_ratio)) %>%
  group_by(log_ratio < 0) %>%
  top_n(20, abslogratio) %>% # choosing the number of words to display
  ungroup() %>%
  mutate(word = reorder(word2, log_ratio))

# Plotting
plot2_scott = ggplot(
  he_she_counts_scott,
  aes(word, log_ratio, color = log_ratio < 0)
) +
  geom_segment(
    aes(x = word, xend = word, y = 0, yend = log_ratio),
    size = 1.1, alpha = 0.6
  ) +
  geom_point(size = 3.5) +
  coord_flip() +
  labs(
    x = NULL,
    y = "Relative appearance after 'she' compared to 'he'",
    title = "Words paired with he/she",
    subtitle = "Walter Scott's novels"
  ) +
  scale_color_discrete(name = "", labels = c("More 'she'", "More 'he'")) +
  scale_y_continuous(
    breaks = seq(-3, 3),
    labels = c("0.125x", "0.25x", "0.5x", "Same", "2x", "4x", "8x")
  )

plot2_scott

# Now you have intuitive plots for both authors that allow you to compare them.

# You can use this code to text mine the text of any author listed on Project
# Gutenberg.
# For example an earlier writer:
metadata_shakespeare = gutenberg_works(author == "Shakespeare, William")
shakespeare_gutenberg = gutenberg_download(c(metadata_shakespeare$gutenberg_id))
# Replace scott_gutenberg with shakespeare_gutenberg to create the new code.

# Or a later writer:
# The "\u00eb" escape is the letter e with a diaeresis; writing it as an escape
# keeps the author name intact whatever encoding this file is saved in.
metadata_bronte = gutenberg_works(author == "Bront\u00eb, Charlotte")
bronte_gutenberg = gutenberg_download(c(metadata_bronte$gutenberg_id))