######################################################################################################## # Paper : Vaccine Confidence and Hesitancy at the start of COVID-19 vaccine deployment in the UK: An embedded mixed-methods study # Authors : Chrissy h Roberts[1]*, Hannah Brindle[1], Nina T Rogers[2], Rosalind M Eggo[3], Luisa Enria[4] and Shelley Lees[4] # # Affiliations # 1 Department of Clinical Research, London School of Hygiene and Tropical Medicine, London, UK # 2 MRC Epidemiology Unit, University of Cambridge School of Clinical Medicine, United Kingdom # 3 Department of Infectious Disease Epidemiology, London School of Hygiene and Tropical Medicine, London, UK # 4 Department of Global Health and Development, London School of Hygiene and Tropical Medicine, London, UK # # R Scripts for "Vaccine Confidence and Hesitancy at the start of COVID-19 vaccine deployment in the UK: An embedded mixed-methods study" # (c) Chrissy h Roberts # This work is licensed under a # Creative Commons Attribution 4.0 International License. # You should have received a copy of the license along with this # work. If not, see . ######################################################################################################## ######################################################################################################## # Set seed for reproducible analysis ######################################################################################################## set.seed(21345) ######################################################################################################## # Load Libraries ######################################################################################################## { library(arsenal) library(data.table) library(flextable) library(gridExtra) library(gt) library(gtsummary) library(htmlwidgets) library(kableExtra) library(maptools) library(nnet) library(plotly) library(ranger) library(RColorBrewer) library(reshape2) library(rgdal) library(sjmisc) library(sp) library(stm) library(summarytools) library(tidystm) library(tidytext) library(tidyverse) library(tm) require(mice) } ######################################################################################################## # Tidy and Impute data ######################################################################################################## #LOAD THE DATA df<-read.csv("LSHTM_COVID_19_SURVEY_VACCINES_ROBERTS_2020-12-18_DC.csv", na.strings = c("NA",""),stringsAsFactors = T) df$SubmissionDate<-as.Date(df$SubmissionDate) df<-filter(df,SubmissionDate>"2020-12-08") #SELECT VARIABLES THAT WILL BE USED df<-select(df, -SubmissionDate, -start, -end, -deviceid, -subscriberid, -simserial, -phonenumber, -generated_note_name_10, -agree, dem.gender, dem.age, dem.people_in_home, dem.marital, dem.school_age_kids, dem.education, dem.employment, dem.employment_changed, dem.disabled, eth.ethnicity, -eth.ethnicity_white, -eth.ethnicity_black, -eth.ethnicity_mixed, -eth.ethnicity_asian, -eth.ethnicity_other, covid.symptoms, -covid.symptoms_date, -covid.tested_me, -covid.tested_member, -covid.tested_no, -covid.positive, -symptoms_isolate.note_symptoms_isolate, -symptoms_isolate.reserved_name_for_field_list_labels_41, -symptoms_isolate.adult_symptoms, -symptoms_isolate.children_symptoms, -symptoms_isolate.adult_no_symptoms, -symptoms_isolate.children_no_symptoms, -isolation.time_isolate, -isolation.isolate_comply, -isolation.time_isolate_actual, -isolation.isolation_approach, -app.covid_app, -app.covid_app_whynot, -app.covid_app_concerns, -app.alert, gov_dec.gov_right_decision, -gov_dec.gov_right_control, -gov_dec.gov_wrong_control, -gov_dec.devolved_gov.note_devolved_govs, -gov_dec.devolved_gov.reserved_name_for_field_list_labels_66, -gov_dec.devolved_gov.scotland, -gov_dec.devolved_gov.wales, -gov_dec.devolved_gov.n_ireland, -measures, trustgov, trust.trust_level, -trust.trust_change, -priorities, health_overall, -depression.note_depression, depression.depressed_before, depression.depressed_recent, -experience_lockdown, -phys_activity.note_phys_activity, phys_activity.phys_activity_before, phys_activity.phys_activity_recent, -weight_describe.note_weight, weight_describe.weight_before, weight_describe.weight_recent, -alcohol_1.note_drinking, alcohol_1.alcohol_before, alcohol_1.alcohol_recent, -cage_before.note_CAGE1, -cage_before.cut_down, -cage_before.annoy, -cage_before.guilty, -cage_before.morning, -cage_recent.note_CAGE2, -cage_recent.cut_down2, -cage_recent.annoy2, -cage_recent.guilty2, -cage_recent.morning2, -cage_recent.note_aa, -smoking.note_smoking, smoking.smoking_before, smoking.smoking_recent, -sleep_hours.note_sleep, sleep_hours.sleep_before, sleep_hours.sleep_after, -specific_conditions.note_specific_conditions, -specific_conditions.reserved_name_for_field_list_labels_128, specific_conditions.diabetes_type1, specific_conditions.diabetes_type2, specific_conditions.ashtma, specific_conditions.lungdisease, specific_conditions.cancer, specific_conditions.stroke, specific_conditions.heartdisease, specific_conditions.hypertension, specific_conditions.obesity, specific_conditions.depression, specific_conditions.anxiety, -adls.note_adls, -adls.reserved_name_for_field_list_labels_142, adls.dressing, adls.walking, adls.bathing, adls.eating, adls.bed, adls.toilet, gbv.violence, -gbv.perpetrated, -gbv.gender_abuser, -gbv.violence_before, -gbv.violence_continued, -gbv.help, -gbv.help_from, -gbv.note_helpline, vaccination.vaccinated, -vaccination.vaccinated_yes, -vaccination.vaccinated_no, vaccination.feelings, vaccination.flu_vaccine, demographics3.income, demographics3.hardship, demographics3.political_opinion, postcode, -meta.instanceID, -KEY, -SubmitterID, -SubmitterName, -AttachmentsPresent, -AttachmentsExpected, -Status ) ######################################################################################################## # Recode Education to make fewer, more informative classes ######################################################################################################## df$dem.education[which(df$dem.education=="do_not_know")]<-NA df$dem.education<-recode(df$dem.education, "alevel" = "school", "further" = "further", "gcse" = "school", "higher" = "higher", "postgrad" = "higher", "primary" = "school", ) df$dem.education<-factor(df$dem.education) ######################################################################################################## # Set as factors for vaccination behaviours ######################################################################################################## df$vaccination.feelings<-factor(df$vaccination.feelings) df$vaccination.flu_vaccine<-factor(df$vaccination.flu_vaccine) ######################################################################################################## # Recode age groups to match previous publication [https://doi.org/10.1371/journal.pone.0239247] ######################################################################################################## df$dem.age2<-NA df$dem.age2[which(df$dem.age < 34)]<-"18-34" df$dem.age2[which(df$dem.age >=35 & df$dem.age < 55)]<-"35-54" df$dem.age2[which(df$dem.age >=55 & df$dem.age < 70)]<-"55-69" df$dem.age2[which(df$dem.age >=70)]<-"70+" df$dem.age<-as.factor(df$dem.age2) df$dem.age<-factor(df$dem.age,levels = c("18-34","35-54","55-69","70+")) df$dem.age2<-NULL ######################################################################################################## # Recode Postcodes to regions ######################################################################################################## df$region [df$postcode=="Blackburn"|df$postcode=="Bradford"|df$postcode=="Bolton"|df$postcode=="Carlisle"|df$postcode=="Chester"|df$postcode=="Crewe"|df$postcode=="Blackpool"| df$postcode=="Huddersfield"|df$postcode=="Halifax"|df$postcode=="Liverpool"|df$postcode=="Lancaster"|df$postcode=="Manchester"|df$postcode=="Oldham"|df$postcode=="Preston"|df$postcode=="Stockport"|df$postcode=="Warrington"|df$postcode=="Wigan"]<-"North West" df$region [df$postcode=="Aberdeen"|df$postcode=="Dundee"|df$postcode=="Dumfries and Galloway"|df$postcode=="Edinburgh"|df$postcode=="Falkirk and Stirling"|df$postcode=="Glasgow"|df$postcode=="Outer Hebrides"|df$postcode=="Inverness"|df$postcode=="Kilmarnock"|df$postcode=="Kirkwall"|df$postcode=="Kirkcaldy"|df$postcode=="Motherwell"|df$postcode=="Paisley"|df$postcode=="Perth"|df$postcode=="Galashiels"|df$postcode=="Lerwick"]<-"Scotland" df$region [df$postcode=="Bath"|df$postcode=="Bournemouth"|df$postcode=="Bristol"|df$postcode=="Dorchester"|df$postcode=="Exeter"|df$postcode=="Gloucester"|df$postcode=="Plymouth"|df$postcode=="Swindon"|df$postcode=="Salisbury"|df$postcode=="Taunton"|df$postcode=="Torquay"|df$postcode=="Truro"]<-"South West" df$region [df$postcode=="Brighton"|df$postcode=="Canterbury"|df$postcode=="Guildford"|df$postcode=="Rochester"|df$postcode=="Milton Keynes"|df$postcode=="Oxford"|df$postcode=="Portsmouth"|df$postcode=="Reading"|df$postcode=="Redhill"|df$postcode=="Slough"|df$postcode=="Southamptom"|df$postcode=="Tonbridge"]<-"South East" df$region [df$postcode=="Cardiff"|df$postcode=="Llandrindod Wells"|df$postcode=="Llandudno"|df$postcode=="Newport"|df$postcode=="Swansea"|df$postcode=="Shrewsbury"]<-"Wales" df$region [df$postcode=="Birmingham"|df$postcode=="Coventry"|df$postcode=="Dudley"|df$postcode=="Hereford"|df$postcode=="Northampton"|df$postcode=="Stoke-on-Trent"|df$postcode=="Telford"|df$postcode=="Worcester"|df$postcode=="Walsall"|df$postcode=="Wolverhampton"]<-"West Midlands" df$region [df$postcode=="Derby"|df$postcode=="Doncaster"|df$postcode=="Leicester"|df$postcode=="Lincoln"|df$postcode=="Nottingham"|df$postcode=="Sheffield"]<-"East Midlands" df$region [df$postcode=="Derby"|df$postcode=="Doncaster"|df$postcode=="Leicester"|df$postcode=="Lincoln"|df$postcode=="Nottingham"|df$postcode=="Sheffield"]<-"East Midlands" df$region [df$postcode=="St Albans"|df$postcode=="Cambridge"|df$postcode=="Chelmsford"|df$postcode=="Colchester"|df$postcode=="Hemel Hempstead"|df$postcode=="Ipswich"|df$postcode=="Luton"|df$postcode=="Norwich"|df$postcode=="Peterborough"|df$postcode=="Stevenage"|df$postcode=="Southend-on-Sea"]<-"East of England" df$region [df$postcode=="Bromley"|df$postcode=="Croydon"|df$postcode=="Dartford"|df$postcode=="East London"|df$postcode=="Central London EC"|df$postcode=="Enfield"|df$postcode=="Harrow"|df$postcode=="Ilford"|df$postcode=="Kingston upon Thames"|df$postcode=="North London"|df$postcode=="North West London"|df$postcode=="Romford"|df$postcode=="South East London"|df$postcode=="Sutton"|df$postcode=="South West London"|df$postcode=="Sutton"|df$postcode=="South West London"|df$postcode=="Twickenham"|df$postcode=="Southall"|df$postcode=="West London"|df$postcode=="Central London WC"|df$postcode=="Watford"]<-"London" df$region [df$postcode=="Durham"|df$postcode=="Darlington"|df$postcode=="Harrogate"|df$postcode=="Hull"|df$postcode=="Leeds"|df$postcode=="Newcastle upon Tyne"|df$postcode=="Sunderland"|df$postcode=="Cleveland"|df$postcode=="Wakefield"|df$postcode=="York"]<-"North East" df$region [df$postcode=="Northern Ireland"]<-"Northern Ireland" df$region [df$postcode=="British Forces"]<-"British Forces" df$region<-as.character(df$region) df$region[which(is.na(df$region))]<-"Unknown" df$region [df$postcode=="British Forces"]<-NA df$region<-factor(df$region, levels = c("London","East Midlands","East of England", "North East", "North West", "Northern Ireland", "Scotland", "South East", "South West", "Wales", "West Midlands", "Unknown")) df$postcode<-NULL ######################################################################################################## # As factor employment and employment changes ######################################################################################################## df$dem.employment<-factor(df$dem.employment,levels = c("fulltime","parttime","retired","student","homemaker","unemployed")) df$dem.employment_changed<-factor(df$dem.employment_changed,levels=c("no_change","retired","furlough__returned","furlough_current","lost_job","new_job","something_else")) ######################################################################################################## # Recode Ethnicity ######################################################################################################## df$eth.ethnicity<-recode(df$eth.ethnicity, "white" = "White", "asian" = "Asian", "black" = "Black", "mixed" = "Mixed", "other" = "Another Ethnic Group" ) df$eth.ethnicity<-factor(df$eth.ethnicity,levels=c("White","Asian","Black","Mixed","Another Ethnic Group")) ######################################################################################################## # Recode Incomes ######################################################################################################## df$demographics3.income<-as.character(df$demographics3.income) df$demographics3.income[which(df$demographics3.income=="-9")]<-NA df$demographics3.income<-factor(df$demographics3.income) df$demographics3.income<-recode(df$demographics3.income, "1"="Less than £15,000", "2"="£15,000 - £24,999", "3"="£25,000 - £39,999", "4"="£40,000 - £59,999", "5"="£60,000 - £99,999", "6"="More than £100,000" ) ######################################################################################################## # Recode change job data ######################################################################################################## df$dem.employment_changed<-recode(df$dem.employment_changed, "no_change" = "No change", "retired" = "Retired", "furlough__returned" = "Furlough (current or previous)", "furlough_current" = "Furlough (current or previous)", "lost_job" = "Lost job", "new_job" = "New job", "something_else" = "Other employment changes" ) ######################################################################################################## # Recode covid symptoms data ######################################################################################################## df$covid.symptoms<-recode(df$covid.symptoms, "me" = "Yes", "me_and_others" = "Yes", "member" = "Yes", "none" = "No" ) df$covid.symptoms<-factor(df$covid.symptoms,levels=c("No","Yes")) ######################################################################################################## # Recode trust ######################################################################################################## df$trustgov[which(df$trustgov=="donotknow")]<-NA df$trustgov<-factor(df$trustgov,levels=c("mostly","always","sometimes","almost_never","never")) ######################################################################################################## #Recode marital stats ######################################################################################################## df$dem.marital<-recode(df$dem.marital, "married" = "Married/Partnership", "single" = "Single", "partnership" = "Married/Partnership", "Other" = "Other arrangement" ) ######################################################################################################## # Recode ethnicity ######################################################################################################## df$eth.ethnicity<-recode(df$eth.ethnicity, "White"="White", "Asian"="Ethnic Minority", "Black"="Ethnic Minority", "Mixed"="Ethnic Minority", "Another Ethnic Group"="Ethnic Minority" ) df$trust.trust_level<-factor(df$trust.trust_level,levels=c("same","increased","decreased")) ######################################################################################################## # Create descriptive table in markdown and html ######################################################################################################## print(dfSummary(df,na.col = T),file="01_raw_data.html") tab1 <- dfSummary(df, max.distinct.values = 5, style = "grid",graph.col = F) summarytools::view(tab1, file = "tab1.md") summarytools::view(tab1, file = "tab1.html") tab1 ######################################################################################################## # Prepare data for imputation and set seed for reproducible analysis ######################################################################################################## df.for.imp<-df set.seed(21345) ######################################################################################################## # Read in ancillary function that makes parallel implementation of mice work properly on modern OS X # (may not be needed in future releases of R but fixes a bug in mice package in early 2021) ######################################################################################################## source("parlmice_commands.R") ######################################################################################################## # PARALLEL IMPUTATION # Ensure that you set n.core to a sensible number based on CPU availability on your system. If in doubt, use 4. ######################################################################################################## df.imp<-parlmice2(df.for.imp,maxit=50,meth='pmm',cluster.seed = 500,n.core = 15,n.imp.core = 1,cl.type = "FORK") ######################################################################################################## # Merge imputations (with library sjmisc) using modal values for factors | set seed for reproduction ######################################################################################################## set.seed(21345) df.completion<-merge_imputations(df.imp$data, df.imp) ######################################################################################################## # Save a copy to avoid having to do this stuff again and again ######################################################################################################## saveRDS(object = df.imp,file = "RAA2_imputed.data.vaccine.all.RDS") saveRDS(object = df.completion,file = "RAA2_imputed.data.vaccine.single.RDS") ######################################################################################################## # Read back the data just saved, creating a start point for the second phase of analysis ######################################################################################################## set.seed(21345) df.completion<-readRDS("RAA2_imputed.data.vaccine.single.RDS") ######################################################################################################## # Remove some variables that won't be used in this analysis ######################################################################################################## df.completion<-select(df.completion, -dem.people_in_home, -dem.marital , -dem.school_age_kids , -dem.employment_changed, -trust.trust_level , -depression.depressed_before , -phys_activity.phys_activity_before, -phys_activity.phys_activity_recent, -weight_describe.weight_before , -smoking.smoking_before, -weight_describe.weight_recent , -alcohol_1.alcohol_before , -alcohol_1.alcohol_recent , -sleep_hours.sleep_before , -sleep_hours.sleep_after , -adls.dressing , -adls.walking , -adls.bathing , -adls.eating , -adls.bed , -adls.toilet , -gbv.violence ) ######################################################################################################## # Recode various variables to make data more descriptive and set sensible reference groups ######################################################################################################## df.completion$vaccination.vaccinated<-recode(df.completion$vaccination.vaccinated, "no" = "Reject_Vaccine", "yes" = "Accept_Vaccine") df.completion$vaccination.feelings<-recode(df.completion$vaccination.feelings, "0" = "No Vaccine concerns", "1" = "Vaccine concerns", "2" = "Vaccine concerns & Rejected vaccine offers") df.completion$vaccination.flu_vaccine<-recode(df.completion$vaccination.flu_vaccine, "0" = "Does not get flu vaccine", "1" = "Sometimes gets flu vaccine", "2" = "Always gets flu vaccine") df.completion$vaccination.vaccinated<-relevel(df.completion$vaccination.vaccinated,ref = "Accept_Vaccine") df.completion$demographics3.income<-factor(df.completion$demographics3.income,levels=c( "Less than £15,000", "£15,000 - £24,999", "£25,000 - £39,999", "£40,000 - £59,999", "£60,000 - £99,999", "More than £100,000" )) df.completion$demographics3.political_opinion<-factor(df.completion$demographics3.political_opinion,levels=c( "floating", "conservative", "liberal" )) df.completion$vaccination.flu_vaccine<-factor(df.completion$vaccination.flu_vaccine,levels=c("Always gets flu vaccine","Sometimes gets flu vaccine","Does not get flu vaccine")) df.completion$smoking.smoking_recent<-factor(df.completion$smoking.smoking_recent,levels=c("non_smoker","light_smoker","moderate_smoker","heavy_smoker")) df.completion$depression.depressed_recent<-factor(df.completion$depression.depressed_recent,levels=c("not_at_all" , "several_days", "half_the_days","every_day" )) df.completion$trustgov<-factor(df.completion$trustgov,levels=c("always","mostly","sometimes","almost_never","never")) df.completion$eth.ethnicity<-recode(df.completion$eth.ethnicity, "Black/Asian/Mixed/Other Ethnicity"="Ethnic Minority", "White" = "White") df.completion$gov_dec.gov_right_decision<-relevel(df.completion$gov_dec.gov_right_decision,ref = "yes") df.completion$health_overall<-factor(df.completion$health_overall,levels=c("very_good","good","fair","bad","very_bad")) ######################################################################################################## # Make second set of descriptive tables (i.e. with imputed data) ######################################################################################################## tab1b <- dfSummary(df.completion, max.distinct.values = 5, style = "grid",graph.col = F,) summarytools::view(tab1b, file = "tab1b.md") summarytools::view(tab1b, file = "tab1b.html") tab1b ######################################################################################################## # table 2 - differences after covid19 ######################################################################################################## tab2 <- tableby(vaccination.vaccinated ~ ., data=df.completion,cat.stats = c("countrowpct")) tab2summary<-as.data.frame(summary(tab2)) names(tab2summary)[1]<-"factor" tab2summary<-tab2summary[,c(1,4,2,3)] tab2b <- tableby(vaccination.vaccinated ~ ., data=df.completion,cat.stats = c("countpct")) tab2bsummary<-as.data.frame(summary(tab2b)) names(tab2bsummary)[1]<-"factor" tab2summary$`Total (N=4535)`<-tab2bsummary$`Total (N=4535)` tab2summary<-flextable(tab2summary) save_as_html(tab2summary, path = "tab2.html") ######################################################################################################## # Univariate Analysis ######################################################################################################## gender<-coef(summary(glm(data = df.completion,formula = vaccination.vaccinated~dem.gender,family="binomial"))) age<-coef(summary(glm(data = df.completion,formula = vaccination.vaccinated~dem.age,family="binomial"))) set.seed(21345) univariate_results<-df.completion %>% tbl_uvregression( method = glm, y = vaccination.vaccinated, method.args = list(family = binomial), exponentiate = TRUE, pvalue_fun = ~style_pvalue(.x, digits = 2) ) %>% add_global_p() %>% # add global p-value add_nevent() %>% # add number of events of the outcome add_q() %>% # adjusts global p-values for multiple testing bold_p() %>% # bold p-values under a given threshold (default 0.05) bold_p(t = 0.10, q = TRUE) %>% # now bold q-values under the threshold of 0.10 bold_labels() univariate_results<-univariate_results %>%as_flex_table() save_as_html(univariate_results, path = "tab3.html") ######################################################################################################## # Multivariate Analysis ######################################################################################################## set.seed(21345) df.completion.multivariate<-select(df.completion, dem.gender, dem.age, dem.education, dem.employment, dem.disabled, eth.ethnicity, covid.symptoms, gov_dec.gov_right_decision, trustgov, health_overall, depression.depressed_recent, smoking.smoking_recent, specific_conditions.hypertension, specific_conditions.obesity, vaccination.feelings, vaccination.flu_vaccine, demographics3.income, demographics3.hardship, demographics3.political_opinion, vaccination.vaccinated ) m1 <- glm(vaccination.vaccinated ~ ., data = df.completion.multivariate, family = binomial) summary(m1)$coefficients table4<-tbl_regression(m1, exponentiate = TRUE) multivariate_results<-table4 %>%as_flex_table() multivariate_results save_as_html(multivariate_results, path = "tab4.html") ######################################################################################################## # Calculate 95% CIs ######################################################################################################## test<-summary(m1) test<-as.data.frame(coefficients(test)) test$OR<-exp(as.numeric(test$Estimate)) CIs<-confint(m1) CIs<-exp(CIs) test<-cbind(test,CIs) test$variable<-rownames(test) ######################################################################################################## # Checking the model ######################################################################################################## model.summary<-test model.summary<-select(model.summary,variable,OR,"2.5 %","97.5 %","Pr(>|z|)") ######################################################################################################## # Add some reference points ######################################################################################################## model.summary$variable<-as.character(model.summary$variable) model.summary[nrow(model.summary)+1,]<-c("dem.genderFemale (ref)",1,1,1,1) model.summary[nrow(model.summary)+1,]<-c("dem.age18-34 (ref)",1,1,1,1) model.summary[nrow(model.summary)+1,]<-c("dem.educationschool (ref)",1,1,1,1) model.summary[nrow(model.summary)+1,]<-c("dem.employmentfulltime (ref)",1,1,1,1) model.summary[nrow(model.summary)+1,]<-c("eth.ethnicityWhite (ref)",1,1,1,1) model.summary[nrow(model.summary)+1,]<-c("trustgovalways (ref)",1,1,1,1) model.summary[nrow(model.summary)+1,]<-c("depression.depressed_recentnot_at_all (ref)",1,1,1,1) model.summary[nrow(model.summary)+1,]<-c("smoking.smoking_recentnon_smoker (ref)",1,1,1,1) model.summary[nrow(model.summary)+1,]<-c("demographics3.incomeLess than £15,000 (ref)",1,1,1,1) model.summary[nrow(model.summary)+1,]<-c("demographics3.political_opinioncentre (ref)",1,1,1,1) model.summary[nrow(model.summary)+1,]<-c("vaccination.flu_vaccineAlways gets flu vaccine (ref)",1,1,1,1) model.summary[nrow(model.summary)+1,]<-c("vaccination.feelingsNo Vaccine concerns (ref)",1,1,1,1) model.summary[nrow(model.summary)+1,]<-c("healthoverallvery_good (ref)",1,1,1,1) ######################################################################################################## # Remap variables so that they print nicely ######################################################################################################## model.summary$variable<-factor(model.summary$variable) model.summary.plot<-model.summary model.summary.plot<-arrange(model.summary,variable) model.summary.plot$OR<-as.numeric(model.summary.plot$OR) model.summary.plot$`2.5 %`<-as.numeric(model.summary.plot$`2.5 %`) model.summary.plot$`97.5 %`<-as.numeric(model.summary.plot$`97.5 %`) ######################################################################################################## # Create the ggplot of Odds Ratios for rejecting vaccine ######################################################################################################## ggplot(model.summary.plot,aes(x = fct_rev(variable),y=OR))+ geom_point(aes(),position = position_dodge(width=0.5))+theme_bw()+ theme(axis.text=element_text(size=10), axis.title=element_text(size=10), axis.text.x = element_text(angle = 90, hjust = 1), plot.title = element_text(size=10), axis.title.y=element_blank(), legend.position="none" )+ geom_hline(yintercept=1,lty=2,col="Red")+ scale_y_log10(limits=c(0.1,40),labels = c(0.01,0.1,0.25,0.5,1,2,5,10,20,40),breaks = c(0.01,0.1,0.25,0.5,1,2,5,10,20,40))+ geom_errorbar(aes(x=variable, ymax=`97.5 %`, ymin=`2.5 %`), na.rm=TRUE, position=position_dodge(),width=0.3,lty=1)+ coord_flip() ggsave("fig1.tiff") ggsave("fig1.png") ggsave("fig1.pdf") ######################################################################################################## # Prepare for STM analysis by binding in free text ######################################################################################################## df<-read.csv("COVID19_V2.csv", na.strings = c("NA",""),stringsAsFactors = T) df$SubmissionDate<-as.Date(df$SubmissionDate) df<-filter(df,SubmissionDate>"2020-12-08") df<-select(df, vaccination.vaccinated_yes, vaccination.vaccinated_no ) df.for_stm<-bind_cols(df.completion.multivariate,df) ######################################################################################################## # Make df for each group, only include people who gave an answer to the free text question ######################################################################################################## df.stm.reject<-df.for_stm[which(df.for_stm$vaccination.vaccinated=="Reject_Vaccine"),] df.stm.reject<-select(df.stm.reject,-vaccination.vaccinated_yes) df.stm.accept<-df.for_stm[which(df.for_stm$vaccination.vaccinated=="Accept_Vaccine"),] df.stm.accept<-select(df.stm.accept,-vaccination.vaccinated_no) ######################################################################################################## ######################################################################################################## # Run STM on group who would reject vaccine ######################################################################################################## ######################################################################################################## ######################################################################################################## # Create corpus ######################################################################################################## { set.seed(21345) processed <- textProcessor(df.stm.reject$vaccination.vaccinated_no, metadata = df.stm.reject,stem = FALSE,customstopwords = c("dont","etc","don't","things","also","can","will","don’t","way","made","just","one","seem","much","avoid","feel","need","else","one","vaccines","yet","want","vaccination","vaccine","vaccinations","vaccinated","covid")) out <- prepDocuments(processed$documents, processed$vocab, processed$meta) docs <- out$documents vocab <- out$vocab meta <-out$meta } ######################################################################################################## # find best number of topics ######################################################################################################## { set.seed(21345) storage <- searchK(out$documents, out$vocab, K = c(2:10,15,20,25,30,35,40), data = meta,cores = 15) } ######################################################################################################## # add a chart of results based on this - [https://juliasilge.com/blog/evaluating-stm/](https://juliasilge.com/blog/evaluating-stm/) ######################################################################################################## pdf("vaccine_reject_searchK.pdf",useDingbats = F) plot(storage) dev.off() ######################################################################################################## # choose a number of topics that maintains high semantic coherence whilst maximising value of the held-out likelihood and minimising residuals # run the stm with the optimal number of topics (add more covariates that could affect prior probabilty of topics being discussed by using the prevalence command. Add more using covariates=~ dem.age2 + dem.gender + ...) ######################################################################################################## { set.seed(21345) fit <- stm(documents = out$documents, vocab = out$vocab,K = 20,max.em.its = 1500, data = out$meta,init.type = "Spectral") } ######################################################################################################## #ANALYSIS AND VISUALISATION (COMMANDS IN PARENTHESIS) # Displaying words associated with topics (labelTopics, plot.STM(,type = "labels"), sageLabels, plot.STM(,type = "perspectives")) or documents highly associated with particular topics (findThoughts, plotQuote). # Make a data table of the metadata topic proportions ######################################################################################################## topicprop<-make.dt(fit, meta) ######################################################################################################## # Plot the map estimates of document-topic proportions ######################################################################################################## plot.STM(fit, "hist") ######################################################################################################## # plot a summary of the topics, with 5 top words ######################################################################################################## plot.STM(fit, "summary", n=5)# distribution and top 5 words per topic ######################################################################################################## # Also show the frequencies of the key words between different topics ######################################################################################################## td_beta <- tidytext::tidy(fit) options(repr.plot.width=7, repr.plot.height=8, repr.plot.res=100) td_beta %>% group_by(topic) %>% top_n(10, beta) %>% ungroup() %>% mutate(topic = paste0("Topic ", topic), term = reorder_within(term, beta, topic)) %>% ggplot(aes(term, beta, fill = as.factor(topic))) + geom_col(alpha = 0.8, show.legend = FALSE) + theme_bw() + facet_wrap(~ topic, scales = "free_y") + coord_flip() + scale_x_reordered() + labs(x = NULL, y = expression(beta), title = "Highest word probabilities for each topic for those who would not accept a COVID-19 vaccine", subtitle = "Different words are associated with different topics") ggsave("vaccine_reject_word_content.pdf") ######################################################################################################## # you can also show detailed wordlist for specific topic # would be good to go through each topic and identify excess stopwords and add to the custom list above # at this stage you might want to remove words that are not semantically coherent. Go back and add some custom stopwords to the stm function # may have to iterate on this process # The theta object in the stm model output has the posterior probability of a topic given a document that this function uses. i.e. theta is the probability that a response belongs to a specific topic # if wanted, plot out a selection (here 1:100) of documents and look at how theta patterns = theta is the probability of a document being in that topic # if you don't have fairly strong theta for one or two topics, maybe reduce number of topics to enhance semantic coherence ######################################################################################################## ######################################################################################################## # Plot theta values for each topic ######################################################################################################## td_theta <- tidytext::tidy(fit, matrix = "theta") selectiontdthteta<-td_theta[td_theta$document%in%c(1:4000),]#select the first 30 documents. be careful to select a sensible interval, as attempting to load a very huge corpus might crash the kernel thetaplot1<-ggplot(selectiontdthteta, aes(y=gamma, x=document, fill = as.factor(topic))) + geom_bar(stat="identity",alpha = 0.8, show.legend = FALSE) + facet_wrap(~ topic, ncol = 3) + labs(title = "Theta values per document", y = expression(theta), x = "Topic") thetaplot1 ggsave("vaccine_reject_theta_scores.pdf") ######################################################################################################## # worth considering if any of the topics have few documents that have high theta values. For instance here, topic 5 and topic 7 seem less focussed in terms of theta # To examine documents that are highly representative of topics the findThoughts function # This function will print the documents highly associated with each topic ######################################################################################################## thoughts<-findThoughts(model = fit,texts = as.character(meta$vaccination.vaccinated_no), n=100, topics = 1:20,thresh=0.3)$docs #write.csv(thoughts,"thoughts_reject.csv",quote=T) write.csv(thoughts$`Topic 1`,"thoughts_reject_A.csv",quote = T) write.csv(thoughts$`Topic 2`,"thoughts_reject_Bi.csv",quote = T) write.csv(thoughts$`Topic 3`,"thoughts_reject_C.csv",quote = T) write.csv(thoughts$`Topic 4`,"thoughts_reject_D.csv",quote = T) write.csv(thoughts$`Topic 5`,"thoughts_reject_E.csv",quote = T) write.csv(thoughts$`Topic 6`,"thoughts_reject_F.csv",quote = T) write.csv(thoughts$`Topic 7`,"thoughts_reject_G.csv",quote = T) write.csv(thoughts$`Topic 8`,"thoughts_reject_Bii.csv",quote = T) write.csv(thoughts$`Topic 9`,"thoughts_reject_Hi.csv",quote = T) write.csv(thoughts$`Topic 10`,"thoughts_reject_I.csv",quote = T) write.csv(thoughts$`Topic 11`,"thoughts_reject_J.csv",quote = T) write.csv(thoughts$`Topic 12`,"thoughts_reject_Biii.csv",quote = T) write.csv(thoughts$`Topic 13`,"thoughts_reject_K.csv",quote = T) write.csv(thoughts$`Topic 14`,"thoughts_reject_L.csv",quote = T) write.csv(thoughts$`Topic 15`,"thoughts_reject_Biv.csv",quote = T) write.csv(thoughts$`Topic 16`,"thoughts_reject_M.csv",quote = T) write.csv(thoughts$`Topic 17`,"thoughts_reject_N.csv",quote = T) write.csv(thoughts$`Topic 18`,"thoughts_reject_Hii.csv",quote = T) write.csv(thoughts$`Topic 19`,"thoughts_reject_O.csv",quote = T) write.csv(thoughts$`Topic 20`,"thoughts_reject_P.csv",quote = T) ######################################################################################################## # Plot Correlation chart ######################################################################################################## fit.corr<- topicCorr(fit,cutoff = 0.25) pdf("thoughts_reject_correlations.pdf",useDingbats = F) plot(fit.corr) dev.off() plot(fit.corr) ######################################################################################################## # Make table of topics and add titles (Manual process based on reading quotes) ######################################################################################################## reject.topics<-1:20 reject.topics<-as.data.frame(reject.topics) names(reject.topics)<-"Assignment" reject.topics$Topic<-NA reject.topics$Title<-NA reject.topics$Topic<-c("A","Bi","C","D","E","F","G","Bii","H","I","J","Biii","K","L","Biv","M","N","Hii","O","P") reject.topics$n.0.3<-c(25,6,11,18,21,10,19,21,6,18,25,22,13,29,12,24,9,16,12,4) reject.topics$Title[1]<-"Healthy, so can rely on own immune system" reject.topics$Title[2]<-"No trust in government, pharma industry and vaccine" reject.topics$Title[3]<-"Concerns about unknown safety and effectiveness" reject.topics$Title[4]<-"Lack of need for COVID-19 vaccine given low overall mortality rate" reject.topics$Title[5]<-"Undecided and/or concerned about side-effects and personal medical history" reject.topics$Title[6]<-"Risks of relatively untested vaccine Vs benefits of vaccinating generally healthy people" reject.topics$Title[7]<-"Contextualises COVID-19 vaccines in personal history of flu vaccines and adverse reactions" reject.topics$Title[8]<-"Mistrust of pharma and government in context of rapid development of vaccines" reject.topics$Title[9]<-"Concerns about side effects" reject.topics$Title[10]<-"Previous bad reactions to vaccines and/or allergies to penicillin" reject.topics$Title[11]<-"Concerned about quality of research during rapid vaccine development" reject.topics$Title[12]<-"Vaccines have not been tested enough" reject.topics$Title[13]<-"Pregnant, breastfeeding, ineligible or unwilling to be vaccinated" reject.topics$Title[14]<-"Potentially willing to be vaccinated, but expressed hesitancy or concerns" reject.topics$Title[15]<-"Not enough testing, specific concerns of adverse reactions and unforeseen complications" reject.topics$Title[16]<-"Allergies and reactions to medicines" reject.topics$Title[17]<-"References to previously failed trials, legal cases and medical scandals" reject.topics$Title[18]<-"Concerns about long term side effects" reject.topics$Title[19]<-"Ethical or medical oncerns about derivation, formulation and effects of vaccine" reject.topics$Title[20]<-"[Fewer than five quotes with theta > 0.3]" reject.topics<-arrange(reject.topics,Topic) reject.topics<-flextable(reject.topics) save_as_html(reject.topics, path = "tab5.html") ######################################################################################################## ######################################################################################################## # RUN STM on subset that planned to accept vaccine ######################################################################################################## ######################################################################################################## ######################################################################################################## # Create corpus ######################################################################################################## { set.seed(21345) processed <- textProcessor(df.stm.accept$vaccination.vaccinated_yes, metadata = df.stm.accept,stem = FALSE,customstopwords = c("dont","etc","don't","things","also","can","will","don’t","way","made","just","one","seem","much","avoid","feel","need","else","one","vaccines","yet","want","vaccination","vaccine","vaccinations","vaccinated","covid")) out <- prepDocuments(processed$documents, processed$vocab, processed$meta) docs <- out$documents vocab <- out$vocab meta <-out$meta } ######################################################################################################## # Find best number of topics ######################################################################################################## { set.seed(21345) storage2 <- searchK(out$documents, out$vocab, K = c(2:10,15,20,25,30,35,40), data = meta,cores = 15) } ######################################################################################################## # Add a chart of results based on this - [https://juliasilge.com/blog/evaluating-stm/](https://juliasilge.com/blog/evaluating-stm/) ######################################################################################################## plot(storage2) pdf("trust_level_change_searchK.pdf",useDingbats = F) plot(storage2) dev.off() ######################################################################################################## # choose a number of topics that maintains high semantic coherence whilst maximising value of the held-out likelihood and minimising residuals # run the stm with the optimal number of topics (add more covariates that could affect prior probabilty of topics being discussed by using the prevalence command. Add more using covariates=~ dem.age2 + dem.gender + ...) ######################################################################################################## { set.seed(21345) fit2 <- stm(documents = out$documents, vocab = out$vocab,K = 25,max.em.its = 1500, data = out$meta,init.type = "Spectral") } ######################################################################################################## #ANALYSIS AND VISUALISATION (COMMANDS IN PARENTHESIS) # Displaying words associated with topics (labelTopics, plot.STM(,type = "labels"), sageLabels, plot.STM(,type = "perspectives")) or documents highly associated with particular topics (findThoughts, plotQuote). # Show words associated with topics # After reading the thoughts document you can update this with a list of text like c("T1 : Stuff","T2 : Other Stuff") ######################################################################################################## ######################################################################################################## # Make a data table of the metadata topic proportions ######################################################################################################## topicprop2<-make.dt(fit2, meta) ######################################################################################################## # Plot the map estimates of document-topic proportions ######################################################################################################## plot.STM(fit2, "hist") ######################################################################################################## # Plot a summary of the topics, with 5 top words ######################################################################################################## plot.STM(fit2, "summary", n=5)# distribution and top 5 words per topic ######################################################################################################## # Also show the frequencies of the key words between different topics ######################################################################################################## td_beta <- tidytext::tidy(fit2) options(repr.plot.width=7, repr.plot.height=8, repr.plot.res=100) td_beta %>% group_by(topic) %>% top_n(10, beta) %>% ungroup() %>% mutate(topic = paste0("Topic ", topic), term = reorder_within(term, beta, topic)) %>% ggplot(aes(term, beta, fill = as.factor(topic))) + geom_col(alpha = 0.8, show.legend = FALSE) + theme_bw() + facet_wrap(~ topic, scales = "free_y") + coord_flip() + scale_x_reordered() + labs(x = NULL, y = expression(beta), title = "Highest word probabilities for each topic for those who would accept a COVID-19 vaccine", subtitle = "Different words are associated with different topics") ggsave("vaccine_accept_word_content.pdf") ######################################################################################################## # you can also show detailed wordlist for specific topic # would be good to go through each topic and identify excess stopwords and add to the custom list above # at this stage you might want to remove words that are not semantically coherent. Go back and add some custom stopwords to the stm function # may have to iterate on this process # The theta object in the stm model output has the posterior probability of a topic given a document that this function uses. i.e. theta is the probability that a response belongs to a specific topic # if wanted, plot out a selection (here 1:100) of documents and look at how theta patterns = theta is the probability of a document being in that topic # if you don't have fairly strong theta for one or two topics, maybe reduce number of topics to enhance semantic coherence ######################################################################################################## ######################################################################################################## # Plot the theta values for each topic ######################################################################################################## td_theta <- tidytext::tidy(fit2, matrix = "theta") selectiontdthteta<-td_theta[td_theta$document%in%c(1:4000),]#select the first 30 documents. be careful to select a sensible interval, as attempting to load a very huge corpus might crash the kernel thetaplot1<-ggplot(selectiontdthteta, aes(y=gamma, x=document, fill = as.factor(topic))) + geom_bar(stat="identity",alpha = 0.8, show.legend = FALSE) + facet_wrap(~ topic, ncol = 3) + labs(title = "Theta values per document", y = expression(theta), x = "Topic") thetaplot1 ggsave("vaccine_accept_theta_scores.pdf") ######################################################################################################## # Worth considering if any of the topics have few documents that have high theta values. For instance here, topic 5 and topic 7 seem less focussed in terms of theta # To examine documents that are highly representative of topics the findThoughts function # This function will print the documents highly associated with each topic ######################################################################################################## thoughts<-findThoughts(model = fit2,texts = as.character(meta$vaccination.vaccinated_yes), n=100, topics = 1:25,thresh=0.3)$docs write.csv(thoughts$`Topic 1`,"thoughts_accept_T01.csv",quote = T) write.csv(thoughts$`Topic 2`,"thoughts_accept_T02i.csv",quote = T) write.csv(thoughts$`Topic 3`,"thoughts_accept_T03.csv",quote = T) write.csv(thoughts$`Topic 4`,"thoughts_accept_T04i.csv",quote = T) write.csv(thoughts$`Topic 5`,"thoughts_accept_T05i.csv",quote = T) write.csv(thoughts$`Topic 6`,"thoughts_accept_T06.csv",quote = T) write.csv(thoughts$`Topic 7`,"thoughts_accept_T07.csv",quote = T) write.csv(thoughts$`Topic 8`,"thoughts_accept_T05ii.csv",quote = T) write.csv(thoughts$`Topic 9`,"thoughts_accept_T04ii.csv",quote = T) write.csv(thoughts$`Topic 10`,"thoughts_accept_T08.csv",quote = T) write.csv(thoughts$`Topic 11`,"thoughts_accept_T04iii.csv",quote = T) write.csv(thoughts$`Topic 12`,"thoughts_accept_T04iv.csv",quote = T) write.csv(thoughts$`Topic 13`,"thoughts_accept_T09.csv",quote = T) write.csv(thoughts$`Topic 14`,"thoughts_accept_T05iii.csv",quote = T) write.csv(thoughts$`Topic 15`,"thoughts_accept_T04v.csv",quote = T) write.csv(thoughts$`Topic 16`,"thoughts_accept_T10.csv",quote = T) write.csv(thoughts$`Topic 17`,"thoughts_accept_T11.csv",quote = T) write.csv(thoughts$`Topic 18`,"thoughts_accept_T12.csv",quote = T) write.csv(thoughts$`Topic 19`,"thoughts_accept_T05iv.csv",quote = T) write.csv(thoughts$`Topic 20`,"thoughts_accept_T13.csv",quote = T) write.csv(thoughts$`Topic 21`,"thoughts_accept_T14.csv",quote = T) write.csv(thoughts$`Topic 22`,"thoughts_accept_T15.csv",quote = T) write.csv(thoughts$`Topic 23`,"thoughts_accept_T02ii.csv",quote = T) write.csv(thoughts$`Topic 24`,"thoughts_accept_T16.csv",quote = T) write.csv(thoughts$`Topic 25`,"thoughts_accept_T05v.csv",quote = T) ######################################################################################################## # Plot the correlation chart ######################################################################################################## fit.corr2<- topicCorr(fit2,cutoff = 0.25) plot(fit.corr2) pdf("thoughts_accept_correlations.pdf",useDingbats = F) plot(fit.corr2) dev.off() ######################################################################################################## # Make table of topics and add titles (manual process based on reading quotes) ######################################################################################################## accept.topics<-1:25 accept.topics<-as.data.frame(accept.topics) names(accept.topics)<-"Assignment" accept.topics$Topic<-NA accept.topics$Title<-NA accept.topics$Topic<-c("1","2i","3","4i","5i","6","7","5ii","4ii","8","4iii","4iv","9","5iii","4v","10","11","12","5iv","13","14","15","2ii","16","5v") accept.topics$n.0.3<-c(37,40,39,55,0,52,13,0,7,58,1,7,27,2,0,20,17,34,8,54,10,13,1,72,1) accept.topics$Title[1]<-"Recognises value of vaccinations and compares COVID-19 to flu vaccines" accept.topics$Title[2]<-"Wants to get back to normal life" accept.topics$Title[3]<-"Aware of risks and concerns of others, but on balance sees benefit of COVID-19 vaccines" accept.topics$Title[4]<-"Wants to protect loved ones / vulnerable people, and travel to see friends and family" accept.topics$Title[5]<-"[Fewer than five quotes with theta > 0.3]" accept.topics$Title[6]<-"Trusts the science, scientists and the scientific process" accept.topics$Title[7]<-"Has some concerns, but on balance believes acceptance the correct choice" accept.topics$Title[8]<-"[Fewer than five quotes with theta > 0.3]" accept.topics$Title[9]<-"Protect self and others" accept.topics$Title[10]<-"To help build herd immunity" accept.topics$Title[11]<-"[Fewer than five quotes with theta > 0.3]" accept.topics$Title[12]<-"Wishes to mix in social, cultural and work contexts" accept.topics$Title[13]<-"Recognises vaccination as a social responsibility and public good" accept.topics$Title[14]<-"[Fewer than five quotes with theta > 0.3]" accept.topics$Title[15]<-"[Fewer than five quotes with theta > 0.3]" accept.topics$Title[16]<-"Healthcare professionals and/or visit care homes" accept.topics$Title[17]<-"Responsibilities to the wider community" accept.topics$Title[18]<-"Works in NHS, Schools, Pharma and with vulnerable people" accept.topics$Title[19]<-"To prevent contracting and spreading infection" accept.topics$Title[20]<-"Participant and/or family member in high risk group because of age or medical condition" accept.topics$Title[21]<-"Vaccination to be safe and make others safe, some with negative sentiments on UK government" accept.topics$Title[22]<-"Prevention better than cure, referencing success of historical vaccine programmes" accept.topics$Title[23]<-"[Fewer than five quotes with theta > 0.3]" accept.topics$Title[24]<-"It is the sensible and responsible thing to do" accept.topics$Title[25]<-"[Fewer than five quotes with theta > 0.3]" accept.topics<-arrange(accept.topics,Topic) accept.topics<-flextable(accept.topics) save_as_html(accept.topics, path = "tab6.html") accept.topics ############################################################################## ############################################################################## # END OF ANALYSIS ##############################################################################