###
### Exercise 8: Wordscores
### Quantitative Text Analysis, Essex 2013
### Kenneth Benoit
###

# load the austin library
library(austin)

###
### proof-of-concept data from LBG (2003)
###
# load the data (built in to austin)
data(lbg)
# extract the reference documents as documents 1 through 5
ref <- getdocs(lbg, 1:5)
# set the wordscores as -1.5 through 1.5
ws <- classic.wordscores(ref, scores=c(-1.5, -0.75, 0, 0.75, 1.5))
summary(ws)
# use V1 as the virgin document
vir <- getdocs(lbg, 'V1') 
# score the virgin document
predict(ws, newdata=vir)


###
### score the budget speeches from the budget 2010 debate
###
# load the .csv frequency file, transpose it so that docs=columns
budget2010 <- t(read.csv("/assets/courses/essex2012cta/budget_2010.csv",
                         row.names=1))
# coerce the object into an austin "wfm" object
budget2010 <- as.wfm(budget2010)
# look at the document name list
colnames(budget2010)
# extract 5 and 6 (Cowen and Kenny) as "reference" documents
ref <- getdocs(budget2010, c(5,6))
# set the reference scores at 1 for Cowen and -1 for Kenny
ws <- classic.wordscores(ref, scores=c(1,-1))
# summarize the word scores as assigned
summary(ws)
# now score all documents as "virgin" documents
vir <- getdocs(budget2010, 1:14)
textscores <- predict(ws, newdata=vir)
# sorts the scored documents 
textscores[order(textscores$Score),]
# inspect the word scores
View(ws$pi[order(ws$pi),])

# a plot of the wordscores
plot(ws$pi[order(ws$pi),])
     
# a plot of the document scores
plot(textscores$Score[order(textscores$Score)])