########################################################################################################
# Vaccine Confidence and Hesitancy at the start of COVID-19 vaccine deployment in the UK: Evidence from a rapid mixed-methods survey
# Authors : Chrissy h Roberts (1)*, Hannah Brindle (1), Nina T Rogers(2), Rosalind M Eggo (1), Luisa Enria (1) and Shelley Lees (1)
# Affiliations : (1)  London School of Hygiene and Tropical Medicine, London, United Kingdom
#                (2)  MRC Epidemiology Unit, University of Cambridge School of Clinical Medicine, United Kingdom
#
# * Corresponding author : Chrissy.Roberts@LSHTM.ac.uk
#
# R Scripts for "Vaccine Confidence and Hesitancy at the start of COVID-19 vaccine deployment in the UK:
# Evidence from a rapid mixed-methods survey" (c) by Chrissy h Roberts.
# This work is licensed under a
# Creative Commons Attribution 4.0 International License.
# You should have received a copy of the license along with this
# work. If not, see <http://creativecommons.org/licenses/by/4.0/>.
########################################################################################################

########################################################################################################
# Set seed for reproducible analysis
########################################################################################################
set.seed(21345)

########################################################################################################
# Load Libraries
########################################################################################################

{
  library(ranger)
  require(mice)
  library(gridExtra)
  library(stm)
  library(tidystm)
  library(tidytext)
  library(data.table)
  library(ggplot2)
  library(dplyr)
  library(summarytools)
  library(tidyverse)
  library(reshape2)
  library(gt)
  library(kableExtra)
  library(arsenal)
  library(tm)
  library(plotly)
  library(nnet)
  library(rgdal)
  library(maptools)
  library(leaflet)
  library(sp)
  library(reshape2)
  library(htmlwidgets)
  library(RColorBrewer)
  library(tidytext)
  library(sjmisc)
  library(arsenal)
  library(gtsummary)
  library(flextable)
}


df<-read.csv("LSHTM_COVID_19_SURVEY_VACCINES_ROBERTS_2020-12-18b.csv",stringsAsFactors = T)

df<-df %>% mutate(
                  dem.education=factor(dem.education,levels=c("school","further","higher")),
                  dem.employment=factor(dem.employment,levels=c("fulltime","parttime","retired","student","homemaker","unemployed")),
                  eth.ethnicity=factor(eth.ethnicity,levels=c("White","Ethnic Minority")),
                  gov_dec.gov_right_decision=factor(gov_dec.gov_right_decision,levels=c("yes","no")),
                  trustgov=factor(trustgov,c("always","mostly","sometimes","almost_never","never")),
                  depression.depressed_recent=factor( depression.depressed_recent,levels=c("not_at_all","several_days","half_the_days","every_day")),
                  smoking.smoking_recent=factor(smoking.smoking_recent,levels=c("non_smoker","light_smoker","moderate_smoker","heavy_smoker")),
                  demographics3.income=factor(demographics3.income,levels=c("Less than £15,000","£15,000 - £24,999","£25,000 - £39,999","£40,000 - £59,999","£60,000 - £99,999","More than £100,000")),
                  demographics3.political_opinion=factor(demographics3.political_opinion,levels=c("floating","conservative","liberal")),
                  region=relevel(region,ref = "London"),
                  health_overall=factor(health_overall,levels=c("very_good","good","fair","bad","very_bad"))
                  )


#################################################################
# Make set of descriptive tables (i.e. with imputed data)
########################################################################################################


tab1 <- dfSummary(df, max.distinct.values = 5, style = "grid",graph.col = F,)
summarytools::view(tab1, file = "tab1.md")
summarytools::view(tab1, file = "tab1.html")

tab1

########################################################################################################
# table 2 - differences after covid19
########################################################################################################

tab2 <- tableby(vaccination.vaccinated ~ ., data=df,cat.stats = c("countrowpct"))
tab2summary<-as.data.frame(summary(tab2))
names(tab2summary)[1]<-"factor"
tab2summary<-tab2summary[,c(1,4,2,3)]

tab2b <- tableby(vaccination.vaccinated ~ ., data=df,cat.stats = c("countpct"))
tab2bsummary<-as.data.frame(summary(tab2b))
names(tab2bsummary)[1]<-"factor"

tab2summary$`Total (N=4535)`<-tab2bsummary$`Total (N=4535)`

tab2summary<-flextable(tab2summary)
save_as_html(tab2summary, path = "tab2.html")


########################################################################################################
# Univariate Analysis
########################################################################################################
df<-mutate(df,vaccination.vaccinated=factor(vaccination.vaccinated))
gender<-coef(summary(glm(data = df,formula = vaccination.vaccinated~dem.gender,family="binomial")))
age<-coef(summary(glm(data = df,formula = vaccination.vaccinated~dem.age,family="binomial")))

set.seed(21345)
univariate_results<-select(df,-vaccination.vaccinated_yes,-vaccination.vaccinated_no) %>%
  tbl_uvregression(
    method = glm,
    y = vaccination.vaccinated,
    method.args = list(family = binomial),
    exponentiate = TRUE,
    pvalue_fun = ~style_pvalue(.x, digits = 2)
  ) %>%

  add_global_p() %>%  # add global p-value
  add_nevent() %>%    # add number of events of the outcome
  add_q() %>%         # adjusts global p-values for multiple testing
  bold_p() %>%        # bold p-values under a given threshold (default 0.05)
  bold_p(t = 0.10, q = TRUE) %>% # now bold q-values under the threshold of 0.10
  bold_labels()

univariate_results<-univariate_results %>%as_flex_table()
save_as_html(univariate_results, path = "tab3.html")


########################################################################################################
# Multivariate Analysis
########################################################################################################

set.seed(21345)
df.multivariate<-select(df,
                                   dem.gender,
                                   dem.age,
                                   dem.education,
                                   dem.employment,
                                   dem.disabled,
                                   eth.ethnicity,
                                   covid.symptoms,
                                   gov_dec.gov_right_decision,
                                   trustgov,
                                   health_overall,
                                   depression.depressed_recent,
                                   smoking.smoking_recent,
                                   specific_conditions.hypertension,
                                   specific_conditions.obesity,
                                   vaccination.feelings,
                                   vaccination.flu_vaccine,
                                   demographics3.income,
                                   demographics3.hardship,
                                   demographics3.political_opinion,
                                   vaccination.vaccinated
                                     )
m1 <- glm(vaccination.vaccinated ~ ., data = df.multivariate, family = binomial)
summary(m1)$coefficients
table4<-tbl_regression(m1, exponentiate = TRUE)
multivariate_results<-table4 %>%as_flex_table()
save_as_html(multivariate_results, path = "tab4.html")

########################################################################################################
# Calculate 95% CIs
########################################################################################################

test<-summary(m1)
test<-as.data.frame(coefficients(test))

test$OR<-exp(as.numeric(test$Estimate))
CIs<-confint(m1)
CIs<-exp(CIs)
test<-cbind(test,CIs)
test$variable<-rownames(test)

########################################################################################################
# Checking the model
########################################################################################################
model.summary<-test
model.summary<-select(model.summary,variable,OR,"2.5 %","97.5 %","Pr(>|z|)")

########################################################################################################
# Add some reference points
########################################################################################################

model.summary$variable<-as.character(model.summary$variable)
model.summary[nrow(model.summary)+1,]<-c("dem.genderFemale (ref)",1,1,1,1)
model.summary[nrow(model.summary)+1,]<-c("dem.age18-34 (ref)",1,1,1,1)
model.summary[nrow(model.summary)+1,]<-c("dem.educationschool (ref)",1,1,1,1)
model.summary[nrow(model.summary)+1,]<-c("dem.employmentfulltime (ref)",1,1,1,1)
model.summary[nrow(model.summary)+1,]<-c("eth.ethnicityWhite (ref)",1,1,1,1)
model.summary[nrow(model.summary)+1,]<-c("trustgovalways (ref)",1,1,1,1)
model.summary[nrow(model.summary)+1,]<-c("depression.depressed_recentnot_at_all (ref)",1,1,1,1)
model.summary[nrow(model.summary)+1,]<-c("smoking.smoking_recentnon_smoker	(ref)",1,1,1,1)
model.summary[nrow(model.summary)+1,]<-c("demographics3.incomeLess than £15,000 (ref)",1,1,1,1)
model.summary[nrow(model.summary)+1,]<-c("demographics3.political_opinioncentre (ref)",1,1,1,1)
model.summary[nrow(model.summary)+1,]<-c("vaccination.flu_vaccineAlways gets flu vaccine (ref)",1,1,1,1)
model.summary[nrow(model.summary)+1,]<-c("vaccination.feelingsNo Vaccine concerns (ref)",1,1,1,1)
model.summary[nrow(model.summary)+1,]<-c("healthoverallvery_good (ref)",1,1,1,1)


########################################################################################################
# Remap variables so that they print nicely
########################################################################################################

model.summary$variable<-factor(model.summary$variable)
model.summary.plot<-model.summary
model.summary.plot<-arrange(model.summary,variable)
model.summary.plot$OR<-as.numeric(model.summary.plot$OR)
model.summary.plot$`2.5 %`<-as.numeric(model.summary.plot$`2.5 %`)
model.summary.plot$`97.5 %`<-as.numeric(model.summary.plot$`97.5 %`)


########################################################################################################
# Create the ggplot of Odds Ratios for rejecting vaccine
########################################################################################################


ggplot(model.summary.plot,aes(x = fct_rev(variable),y=OR))+
  geom_point(aes(),position = position_dodge(width=0.5))+theme_bw()+
  theme(axis.text=element_text(size=10),
        axis.title=element_text(size=10),
        axis.text.x = element_text(angle = 90, hjust = 1),
        plot.title = element_text(size=10),
        axis.title.y=element_blank(),
        legend.position="none"
  )+

  geom_hline(yintercept=1,lty=2,col="Red")+
  scale_y_log10(limits=c(0.1,40),labels = c(0.01,0.1,0.25,0.5,1,2,5,10,20,40),breaks = c(0.01,0.1,0.25,0.5,1,2,5,10,20,40))+
  geom_errorbar(aes(x=variable, ymax=`97.5 %`, ymin=`2.5 %`), na.rm=TRUE, position=position_dodge(),width=0.3,lty=1)+
  coord_flip()


ggsave("fig1.tiff")
ggsave("fig1.png")
ggsave("fig1.pdf")


########################################################################################################
# Make df for each group, only include people who gave an answer to the free text question
########################################################################################################
df.stm.reject<-df[which(df$vaccination.vaccinated=="Reject_Vaccine"),]
df.stm.reject<-select(df.stm.reject,-vaccination.vaccinated_yes)

df.stm.accept<-df[which(df$vaccination.vaccinated=="Accept_Vaccine"),]
df.stm.accept<-select(df.stm.accept,-vaccination.vaccinated_no)


########################################################################################################
########################################################################################################
# Run STM on group who would reject vaccine
########################################################################################################
########################################################################################################


########################################################################################################
# Create corpus
########################################################################################################

{
set.seed(21345)
processed <- textProcessor(df.stm.reject$vaccination.vaccinated_no, metadata = df.stm.reject,stem = FALSE,customstopwords = c("dont","etc","don't","things","also","can","will","don’t","way","made","just","one","seem","much","avoid","feel","need","else","one","vaccines","yet","want","vaccination","vaccine","vaccinations","vaccinated","covid"))
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)
docs <- out$documents
vocab <- out$vocab
meta <-out$meta
}

########################################################################################################
# find best number of topics
########################################################################################################
{
set.seed(21345)
storage <- searchK(out$documents, out$vocab, K = c(2:10,15,20,25,30,35,40), data = meta,cores = 15)
}

########################################################################################################
# add a chart of results based on this - [https://juliasilge.com/blog/evaluating-stm/](https://juliasilge.com/blog/evaluating-stm/)
########################################################################################################

pdf("vaccine_reject_searchK.pdf",useDingbats = F)
plot(storage)
dev.off()

########################################################################################################
# choose a number of topics that maintains high semantic coherence whilst maximising value of the held-out likelihood and minimising residuals
# run the stm with the optimal number of topics (add more covariates that could affect prior probabilty of topics being discussed by using the prevalence command. Add more using covariates=~ dem.age2 + dem.gender + ...)
########################################################################################################

{
set.seed(21345)
fit <- stm(documents = out$documents, vocab = out$vocab,K = 20,max.em.its = 1500, data = out$meta,init.type = "Spectral")
}

########################################################################################################
#ANALYSIS AND VISUALISATION (COMMANDS IN PARENTHESIS)
# Displaying words associated with topics (labelTopics, plot.STM(,type = "labels"), sageLabels, plot.STM(,type = "perspectives")) or documents highly associated with particular topics (findThoughts, plotQuote).
# Make a data table of the metadata topic proportions
########################################################################################################

topicprop<-make.dt(fit, meta)

########################################################################################################
# Plot the map estimates of document-topic proportions
########################################################################################################

plot.STM(fit, "hist")

########################################################################################################
# plot a summary of the topics, with 5 top words
########################################################################################################

plot.STM(fit, "summary", n=5)# distribution and top 5 words per topic

########################################################################################################
# Also show the frequencies of the key words between different topics
########################################################################################################

td_beta <- tidytext::tidy(fit)
options(repr.plot.width=7, repr.plot.height=8, repr.plot.res=100)

td_beta %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  mutate(topic = paste0("Topic ", topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = as.factor(topic))) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  theme_bw() +
  facet_wrap(~ topic, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  labs(x = NULL, y = expression(beta),
       title = "Highest word probabilities for each topic for those who would not accept a COVID-19 vaccine",
       subtitle = "Different words are associated with different topics")
ggsave("vaccine_reject_word_content.pdf")


########################################################################################################
# you can also show detailed wordlist for specific topic
# would be good to go through each topic and identify excess stopwords and add to the custom list above
# at this stage you might want to remove words that are not semantically coherent. Go back and add some custom stopwords to the stm function
# may have to iterate on this process
# The theta object in the stm model output has the posterior probability of a topic given a document that this function uses. i.e. theta is the probability that a response belongs to a specific topic
# if wanted, plot out a selection (here 1:100) of documents and look at how theta patterns = theta is the probability of a document being in that topic
# if you don't have fairly strong theta for one or two topics, maybe reduce number of topics to enhance semantic coherence
########################################################################################################


########################################################################################################
# Plot theta values for each topic
########################################################################################################

td_theta <- tidytext::tidy(fit, matrix = "theta")

selectiontdthteta<-td_theta[td_theta$document%in%c(1:4000),]#select the first 30 documents. be careful to select a sensible interval, as attempting to load a very huge corpus might crash the kernel

thetaplot1<-ggplot(selectiontdthteta, aes(y=gamma, x=document, fill = as.factor(topic))) +
  geom_bar(stat="identity",alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~ topic, ncol = 3) +
  labs(title = "Theta values per document",
       y = expression(theta), x = "Topic")


thetaplot1
ggsave("vaccine_reject_theta_scores.pdf")


########################################################################################################
# worth considering if any of the topics have few documents that have high theta values. For instance here, topic 5 and topic 7 seem less focussed in terms of theta
# To examine documents that are highly representative of topics the findThoughts function
# This function will print the documents highly associated with each topic
########################################################################################################

thoughts<-findThoughts(model = fit,texts = as.character(meta$vaccination.vaccinated_no), n=100, topics = 1:20,thresh=0.3)$docs

#write.csv(thoughts,"thoughts_reject.csv",quote=T)
write.csv(thoughts$`Topic 1`,"thoughts_reject_A.csv",quote = T)
write.csv(thoughts$`Topic 2`,"thoughts_reject_Bi.csv",quote = T)
write.csv(thoughts$`Topic 3`,"thoughts_reject_C.csv",quote = T)
write.csv(thoughts$`Topic 4`,"thoughts_reject_D.csv",quote = T)
write.csv(thoughts$`Topic 5`,"thoughts_reject_E.csv",quote = T)
write.csv(thoughts$`Topic 6`,"thoughts_reject_F.csv",quote = T)
write.csv(thoughts$`Topic 7`,"thoughts_reject_G.csv",quote = T)
write.csv(thoughts$`Topic 8`,"thoughts_reject_Bii.csv",quote = T)
write.csv(thoughts$`Topic 9`,"thoughts_reject_Hi.csv",quote = T)
write.csv(thoughts$`Topic 10`,"thoughts_reject_I.csv",quote = T)
write.csv(thoughts$`Topic 11`,"thoughts_reject_J.csv",quote = T)
write.csv(thoughts$`Topic 12`,"thoughts_reject_Biii.csv",quote = T)
write.csv(thoughts$`Topic 13`,"thoughts_reject_K.csv",quote = T)
write.csv(thoughts$`Topic 14`,"thoughts_reject_L.csv",quote = T)
write.csv(thoughts$`Topic 15`,"thoughts_reject_Biv.csv",quote = T)
write.csv(thoughts$`Topic 16`,"thoughts_reject_M.csv",quote = T)
write.csv(thoughts$`Topic 17`,"thoughts_reject_N.csv",quote = T)
write.csv(thoughts$`Topic 18`,"thoughts_reject_Hii.csv",quote = T)
write.csv(thoughts$`Topic 19`,"thoughts_reject_O.csv",quote = T)
write.csv(thoughts$`Topic 20`,"thoughts_reject_P.csv",quote = T)


########################################################################################################
# Plot Correlation chart
########################################################################################################

fit.corr<- topicCorr(fit,cutoff = 0.25)
pdf("thoughts_reject_correlations.pdf",useDingbats = F)
plot(fit.corr)
dev.off()
plot(fit.corr)


########################################################################################################
# Make table of topics and add titles (Manual process based on reading quotes)
########################################################################################################

reject.topics<-1:20
reject.topics<-as.data.frame(reject.topics)
names(reject.topics)<-"Assignment"
reject.topics$Topic<-NA
reject.topics$Title<-NA
reject.topics$Topic<-c("A","Bi","C","D","E","F","G","Bii","H","I","J","Biii","K","L","Biv","M","N","Hii","O","P")

reject.topics$n.0.3<-c(25,6,11,18,21,10,19,21,6,18,25,22,13,29,12,24,9,16,12,4)
reject.topics$Title[1]<-"Healthy, so can rely on own immune system"
reject.topics$Title[2]<-"No trust in government, pharma industry and vaccine"
reject.topics$Title[3]<-"Concerns about unknown safety and effectiveness"
reject.topics$Title[4]<-"Lack of need for COVID-19 vaccine given low overall mortality rate"
reject.topics$Title[5]<-"Undecided and/or concerned about side-effects and personal medical history"
reject.topics$Title[6]<-"Risks of relatively untested vaccine Vs benefits of vaccinating generally healthy people"
reject.topics$Title[7]<-"Contextualises COVID-19 vaccines in personal history of flu vaccines and adverse reactions"
reject.topics$Title[8]<-"Mistrust of pharma and government in context of rapid development of vaccines"
reject.topics$Title[9]<-"Concerns about side effects"
reject.topics$Title[10]<-"Previous bad reactions to vaccines and/or allergies to penicillin"
reject.topics$Title[11]<-"Concerned about quality of research during rapid vaccine development"
reject.topics$Title[12]<-"Vaccines have not been tested enough"
reject.topics$Title[13]<-"Pregnant, breastfeeding, ineligible or unwilling to be vaccinated"
reject.topics$Title[14]<-"Potentially willing to be vaccinated, but expressed hesitancy or concerns"
reject.topics$Title[15]<-"Not enough testing, specific concerns of adverse reactions and unforeseen complications"
reject.topics$Title[16]<-"Allergies and reactions to medicines"
reject.topics$Title[17]<-"References to previously failed trials, legal cases and medical scandals"
reject.topics$Title[18]<-"Concerns about long term side effects"
reject.topics$Title[19]<-"Ethical or medical oncerns about derivation, formulation and effects of vaccine"
reject.topics$Title[20]<-"[Fewer than five quotes with theta > 0.3]"

reject.topics<-arrange(reject.topics,Topic)
reject.topics<-flextable(reject.topics)
save_as_html(reject.topics, path = "tab5.html")


########################################################################################################
########################################################################################################
# RUN STM on subset that planned to accept vaccine
########################################################################################################
########################################################################################################


########################################################################################################
# Create corpus
########################################################################################################

{
set.seed(21345)
processed <- textProcessor(df.stm.accept$vaccination.vaccinated_yes, metadata = df.stm.accept,stem = FALSE,customstopwords = c("dont","etc","don't","things","also","can","will","don’t","way","made","just","one","seem","much","avoid","feel","need","else","one","vaccines","yet","want","vaccination","vaccine","vaccinations","vaccinated","covid"))
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)
docs <- out$documents
vocab <- out$vocab
meta <-out$meta
}

########################################################################################################
# Find best number of topics
########################################################################################################

{
set.seed(21345)
storage2 <- searchK(out$documents, out$vocab, K = c(2:10,15,20,25,30,35,40), data = meta,cores = 15)
}


########################################################################################################
# Add a chart of results based on this - [https://juliasilge.com/blog/evaluating-stm/](https://juliasilge.com/blog/evaluating-stm/)
########################################################################################################

plot(storage2)
pdf("trust_level_change_searchK.pdf",useDingbats = F)
plot(storage2)
dev.off()

########################################################################################################
# choose a number of topics that maintains high semantic coherence whilst maximising value of the held-out likelihood and minimising residuals
# run the stm with the optimal number of topics (add more covariates that could affect prior probabilty of topics being discussed by using the prevalence command. Add more using covariates=~ dem.age2 + dem.gender + ...)
########################################################################################################

{
set.seed(21345)
fit2 <- stm(documents = out$documents, vocab = out$vocab,K = 25,max.em.its = 1500, data = out$meta,init.type = "Spectral")
}


########################################################################################################
#ANALYSIS AND VISUALISATION (COMMANDS IN PARENTHESIS)
# Displaying words associated with topics (labelTopics, plot.STM(,type = "labels"), sageLabels, plot.STM(,type = "perspectives")) or documents highly associated with particular topics (findThoughts, plotQuote).
# Show words associated with topics
# After reading the thoughts document you can update this with a list of text like c("T1 : Stuff","T2 : Other Stuff")
########################################################################################################


########################################################################################################
# Make a data table of the metadata topic proportions
########################################################################################################

topicprop2<-make.dt(fit2, meta)

########################################################################################################
# Plot the map estimates of document-topic proportions
########################################################################################################

plot.STM(fit2, "hist")


########################################################################################################
# Plot a summary of the topics, with 5 top words
########################################################################################################

plot.STM(fit2, "summary", n=5)# distribution and top 5 words per topic

########################################################################################################
# Also show the frequencies of the key words between different topics
########################################################################################################
td_beta <- tidytext::tidy(fit2)
options(repr.plot.width=7, repr.plot.height=8, repr.plot.res=100)

td_beta %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  mutate(topic = paste0("Topic ", topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = as.factor(topic))) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  theme_bw() +
  facet_wrap(~ topic, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  labs(x = NULL, y = expression(beta),
       title = "Highest word probabilities for each topic for those who would accept a COVID-19 vaccine",
       subtitle = "Different words are associated with different topics")
ggsave("vaccine_accept_word_content.pdf")

########################################################################################################
# you can also show detailed wordlist for specific topic
# would be good to go through each topic and identify excess stopwords and add to the custom list above
# at this stage you might want to remove words that are not semantically coherent. Go back and add some custom stopwords to the stm function
# may have to iterate on this process
# The theta object in the stm model output has the posterior probability of a topic given a document that this function uses. i.e. theta is the probability that a response belongs to a specific topic
# if wanted, plot out a selection (here 1:100) of documents and look at how theta patterns = theta is the probability of a document being in that topic
# if you don't have fairly strong theta for one or two topics, maybe reduce number of topics to enhance semantic coherence
########################################################################################################

########################################################################################################
# Plot the theta values for each topic
########################################################################################################

td_theta <- tidytext::tidy(fit2, matrix = "theta")

selectiontdthteta<-td_theta[td_theta$document%in%c(1:4000),]#select the first 30 documents. be careful to select a sensible interval, as attempting to load a very huge corpus might crash the kernel

thetaplot1<-ggplot(selectiontdthteta, aes(y=gamma, x=document, fill = as.factor(topic))) +
  geom_bar(stat="identity",alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~ topic, ncol = 3) +
  labs(title = "Theta values per document",
       y = expression(theta), x = "Topic")


thetaplot1
ggsave("vaccine_accept_theta_scores.pdf")


########################################################################################################
# Worth considering if any of the topics have few documents that have high theta values. For instance here, topic 5 and topic 7 seem less focussed in terms of theta
# To examine documents that are highly representative of topics the findThoughts function
# This function will print the documents highly associated with each topic
########################################################################################################

thoughts<-findThoughts(model = fit2,texts = as.character(meta$vaccination.vaccinated_yes), n=100, topics = 1:25,thresh=0.3)$docs

write.csv(thoughts$`Topic 1`,"thoughts_accept_T01.csv",quote = T)
write.csv(thoughts$`Topic 2`,"thoughts_accept_T02i.csv",quote = T)
write.csv(thoughts$`Topic 3`,"thoughts_accept_T03.csv",quote = T)
write.csv(thoughts$`Topic 4`,"thoughts_accept_T04i.csv",quote = T)
write.csv(thoughts$`Topic 5`,"thoughts_accept_T05i.csv",quote = T)
write.csv(thoughts$`Topic 6`,"thoughts_accept_T06.csv",quote = T)
write.csv(thoughts$`Topic 7`,"thoughts_accept_T07.csv",quote = T)
write.csv(thoughts$`Topic 8`,"thoughts_accept_T05ii.csv",quote = T)
write.csv(thoughts$`Topic 9`,"thoughts_accept_T04ii.csv",quote = T)
write.csv(thoughts$`Topic 10`,"thoughts_accept_T08.csv",quote = T)
write.csv(thoughts$`Topic 11`,"thoughts_accept_T04iii.csv",quote = T)
write.csv(thoughts$`Topic 12`,"thoughts_accept_T04iv.csv",quote = T)
write.csv(thoughts$`Topic 13`,"thoughts_accept_T09.csv",quote = T)
write.csv(thoughts$`Topic 14`,"thoughts_accept_T05iii.csv",quote = T)
write.csv(thoughts$`Topic 15`,"thoughts_accept_T04v.csv",quote = T)
write.csv(thoughts$`Topic 16`,"thoughts_accept_T10.csv",quote = T)
write.csv(thoughts$`Topic 17`,"thoughts_accept_T11.csv",quote = T)
write.csv(thoughts$`Topic 18`,"thoughts_accept_T12.csv",quote = T)
write.csv(thoughts$`Topic 19`,"thoughts_accept_T05iv.csv",quote = T)
write.csv(thoughts$`Topic 20`,"thoughts_accept_T13.csv",quote = T)
write.csv(thoughts$`Topic 21`,"thoughts_accept_T14.csv",quote = T)
write.csv(thoughts$`Topic 22`,"thoughts_accept_T15.csv",quote = T)
write.csv(thoughts$`Topic 23`,"thoughts_accept_T02ii.csv",quote = T)
write.csv(thoughts$`Topic 24`,"thoughts_accept_T16.csv",quote = T)
write.csv(thoughts$`Topic 25`,"thoughts_accept_T05v.csv",quote = T)


########################################################################################################
# Plot the correlation chart
########################################################################################################

fit.corr2<- topicCorr(fit2,cutoff = 0.25)
plot(fit.corr2)
pdf("thoughts_accept_correlations.pdf",useDingbats = F)
plot(fit.corr2)
dev.off()


########################################################################################################
# Make table of topics and add titles (manual process based on reading quotes)
########################################################################################################

accept.topics<-1:25
accept.topics<-as.data.frame(accept.topics)
names(accept.topics)<-"Assignment"
accept.topics$Topic<-NA
accept.topics$Title<-NA
accept.topics$Topic<-c("1","2i","3","4i","5i","6","7","5ii","4ii","8","4iii","4iv","9","5iii","4v","10","11","12","5iv","13","14","15","2ii","16","5v")

accept.topics$n.0.3<-c(37,40,39,55,0,52,13,0,7,58,1,7,27,2,0,20,17,34,8,54,10,13,1,72,1)
accept.topics$Title[1]<-"Recognises value of vaccinations and compares COVID-19 to flu vaccines"
accept.topics$Title[2]<-"Wants to get back to normal life"
accept.topics$Title[3]<-"Aware of risks and concerns of others, but on balance sees benefit of COVID-19 vaccines"
accept.topics$Title[4]<-"Wants to protect loved ones / vulnerable people, and travel to see friends and family"
accept.topics$Title[5]<-"[Fewer than five quotes with theta > 0.3]"
accept.topics$Title[6]<-"Trusts the science, scientists and the scientific process"
accept.topics$Title[7]<-"Has some concerns, but on balance believes acceptance the correct choice"
accept.topics$Title[8]<-"[Fewer than five quotes with theta > 0.3]"
accept.topics$Title[9]<-"Protect self and others"
accept.topics$Title[10]<-"To help build herd immunity"
accept.topics$Title[11]<-"[Fewer than five quotes with theta > 0.3]"
accept.topics$Title[12]<-"Wishes to mix in social, cultural and work contexts"
accept.topics$Title[13]<-"Recognises vaccination as a social responsibility and public good"
accept.topics$Title[14]<-"[Fewer than five quotes with theta > 0.3]"
accept.topics$Title[15]<-"[Fewer than five quotes with theta > 0.3]"
accept.topics$Title[16]<-"Healthcare professionals and/or visit care homes"
accept.topics$Title[17]<-"Responsibilities to the wider community"
accept.topics$Title[18]<-"Works in NHS, Schools, Pharma and with vulnerable people"
accept.topics$Title[19]<-"To prevent contracting and spreading infection"
accept.topics$Title[20]<-"Participant and/or family member in high risk group because of age or medical condition"
accept.topics$Title[21]<-"Vaccination to be safe and make others safe, some with negative sentiments on UK government"
accept.topics$Title[22]<-"Prevention better than cure, referencing success of historical vaccine programmes"
accept.topics$Title[23]<-"[Fewer than five quotes with theta > 0.3]"
accept.topics$Title[24]<-"It is the sensible and responsible thing to do"
accept.topics$Title[25]<-"[Fewer than five quotes with theta > 0.3]"

accept.topics<-arrange(accept.topics,Topic)
accept.topics<-flextable(accept.topics)
save_as_html(accept.topics, path = "tab6.html")
accept.topics
##############################################################################


##############################################################################
# END OF ANALYSIS
##############################################################################