# # Essex Summer School: Quantitative Text Analysis # Code for Exercise 1 # Kenneth Benoit and Paul Nulty # # # Part One: Installing quanteda # # devtools required to install quanteda from Github if (!require(devtools)) install.packages("devtools") library(devtools) # install the latest version of quantedafrom Github install_github("quanteda", username="kbenoit", ref='dev', quick=TRUE) library(quanteda) # # Part Two: Making a corpus from a vector of texts # data(amicusTexts) amiCorp <- corpusCreate(amicusTexts) summary(amiCorp) amiCorp$metadata # # Part Three: Other ways of making a corpus # Analyze Bollinger texts from Evans et al JELS 2007 # load in Amicus texts from a zipped web archive # # 3a: corpus from a directory # download and unzip texts amicusFile <- "/assets/courses/tcd2014qta/exercises/amicus_curiae.zip" download.file(amicusFile, basename(amicusFile)) unzip(basename(amicusFile)) # load in the texts to a vector of texts using quanteda's getTextDir() amicusTexts <- c(getTextDir("./amicus/training"), getTextDir("./amicus/testing")) # change the encoding (because texts contain special symbols such as ยง) amicusTexts <- iconv(amicusTexts, from="latin1", to="UTF-8") # set training class trainclass <- factor(c("P", "R", rep(NA, length(amicusTexts)-2))) # set test class using grep on filenames testclass <- rep(NA, length(amicusTexts)) testclass[grep("AP", names(amicusTexts))] <- "AP" testclass[grep("AR", names(amicusTexts))] <- "AR" # make the corpus from the texts and the attributes amicusCorpus <- corpusCreate(amicusTexts, attribs=list(trainclass=trainclass, testclass=testclass)) summary(amicusCorpus) # # 3d: demonstration of grep command # matches <- grep("equality", amicusCorpus$attribs$texts) # any occurrence of "equality" matchTexts <- amicusCorpus$attribs$texts[matches] matches <- grep("\\bequality\\b", amicusCorpus$attribs$texts) # whole word only matchTexts <- amicusCorpus$attribs$texts[matches] # # 3e,f: Corpus with attributes from filenames # # Windows might need escaped backslashes in file path eg "C:\\Downloads\\texts" iebudCorp <- corpusFromFilenames("S:\\2E_QTA\\budget_2010\\", attNames = c("year","type","number", "fname","lname","party"))