attach(TBdataset) #libraries library(moments) library(dplyr) library(PropCIs) library(broom) library(sandwich) library(car) library(lmtest) #descriptive statistics: sample characteristics stratified by TB status summary(microTB_status) table(microTB_status, useNA = "always") TBstatus_province<-table(study_site,microTB_status,useNA="always") addmargins(TBstatus_province) TBstatus_sex<-table(h_sex,microTB_status,useNA="always") addmargins(TBstatus_sex) age_hist <- hist(h_age,main = "", xlab = "Age",col="lightblue1",breaks=20) shapiro.test(h_age) skewness(h_age,na.rm = TRUE) kurtosis(h_age,na.rm = TRUE) summary(h_age) median(h_age) IQR(h_age) TBdataset%>% group_by(microTB_status)%>% summarise(Median=median(h_age),quantile=quantile(h_age)) TBstatus_hiv<-table(h_hiv,microTB_status,useNA="always") addmargins(TBstatus_hiv) TBstatus_ART<-table(h_art,microTB_status,useNA="always") addmargins(TBstatus_ART) TBstatus_MissingART<-table(microTB_status,HIVpos_missingARTstatus,useNA="always") addmargins(TBstatus_MissingART) TBstatus_DM<-table(h_dm,microTB_status,useNA="always") addmargins(TBstatus_DM) TBstatus_smk<-table(h_smk,microTB_status,useNA="always") addmargins(TBstatus_smk) TBstatus_alc<-table(h_alc,microTB_status,useNA="always") addmargins(TBstatus_alc) TBstatus_h_agegrp<-table(h_agegrp,microTB_status,useNA="always") addmargins(TBstatus_h_agegrp) cd4 <- as.numeric(cd4) cd4_hist <- hist(cd4,main = "", xlab = "CD4 count",col="lightblue1",xlim=c(0,1200),breaks=12) shapiro.test(cd4) skewness(cd4,na.rm = TRUE) kurtosis(cd4,na.rm = TRUE) summary(cd4) summary(cd4_cat) TBstatus_cd4_cat<-table(cd4_cat,microTB_status,useNA="always") addmargins(TBstatus_cd4_cat) table(employment) TBstatus_employment<-table(employment,microTB_status,useNA="always") addmargins(TBstatus_employment) #create a new version of the dataset where only one record kept per household Household_data_TBresult = TBdataset[!duplicated(TBdataset$hhid),] #attach correct dataset remove(TBdataset) attach(Household_data_TBresult) #descriptive statistics household size summary(hhmembers) sd(hhmembers) Household_data_TBresult%>% group_by(microTB_status)%>% summarise(mean(hhmembers), sd(hhmembers)) #calculate prevalence of subclinical TB overall and in each province #with 95% CIs by binomial exact method (Clopper-Pearson) attach(TBdataset) remove(Household_data_TBresult) #overall TB prevalence PropCIs::exactci(68, 2077, 0.95) #proportion scTB/total TB cases PropCIs::exactci(48, 68, 0.95) #overall prevalence scTB PropCIs::exactci(48, 2077, 0.95) #Mangaung prevalence scTB PropCIs::exactci(28, 1190, 0.95) #Capricorn prevalence scTB PropCIs::exactci(20, 887, 0.95) #Overall prevalence active TB PropCIs::exactci(20, 2077, 0.95) #Mangaung prevalence active TB PropCIs::exactci(13, 1190, 0.95) #Capricorn prevalence active TB PropCIs::exactci(7, 887, 0.95) #compare prevalence scTB in Mangaung and Capricorn using two proportions z-test prop.test(x=c(28,20), n=c(1190,887), correct=FALSE) #compare prevalence active TB in Mangaung and Capricorn using two proportions z-test prop.test(x=c(13,7), n=c(1190,887), correct=FALSE) #prevalence calcs in HIV pos only table(h_hiv, microTB_status, useNA = "always") #HIV pos prevalence active TB PropCIs::exactci(10, 377, 0.95) #HIV pos prevalence scTB PropCIs::exactci(15, 377, 0.95) #HIV neg prev active TB PropCIs::exactci(10, 1696, 0.95) #HIV neg prev scTB PropCIs::exactci(33, 1696, 0.95) #compare prevalence scTB in HIV pos vs HIV neg using two proportions z-test prop.test(x=c(15,33), n=c(377,1696), correct=FALSE) #compare prevalence active TB in HIV pos vs HIV neg using two proportions z-test prop.test(x=c(10,10), n=c(377,1696), correct=FALSE) #men prevalence scTB PropCIs::exactci(12, 729, 0.95) #men prevalence active TB PropCIs::exactci(7, 729, 0.95) #women prevalence scTB PropCIs::exactci(36, 1348, 0.95) #women prevalence active TB PropCIs::exactci(13, 1348, 0.95) #compare prevalence scTB in men vs women using two proportions z-test prop.test(x=c(12,729), n=c(36,1348), correct=FALSE) #compare prevalence active TB in men vs women using two proportions z-test prop.test(x=c(7,729), n=c(13,1348), correct=FALSE) #create new datasets that have #1) scTB and no TB only (exclude active TB) #2) active TB and no TB only (exclude scTB) splitby_microTBstatus <- split(TBdataset, microTB_status) #scTB only df_scTB_only <- splitby_microTBstatus[["scTB"]] #active TB only df_activeTB_only <- splitby_microTBstatus[["activeTB"]] #noTB only df_noTB_only <- splitby_microTBstatus[["noTB"]] #Combine scTB and no TB datasets df_scTB_noTB <- rbind(df_scTB_only, df_noTB_only) #Combine activeTB and no TB datasets df_activeTB_noTB <- rbind(df_activeTB_only, df_noTB_only) #Univariate regression analysis for scTB and HIV (scTB vs no TB) attach(df_scTB_noTB) univariatemodel1 <- glm(scTB ~ binaryHIV + study_site, family = binomial, data = df_scTB_noTB) tidy(univariatemodel1, exponentiate=TRUE, conf.int=TRUE) summary(univariatemodel1) #adjust for clustering #use coeftest function to get robust clustered standard errors univariatemodel1_adj_clustering <- coeftest(univariatemodel1, vcov = vcovCL, type = "HC1", cluster = ~ hhid, family=binomial) tidy(univariatemodel1_adj_clustering, exponentiate=TRUE, conf.int=TRUE) summary(univariatemodel1_adj_clustering) exp(0.764) exp(0.141) exp(1.39) #Univariate regression analysis for scTB and province (scTB vs no Tb) df_scTB_noTB$study_site <- relevel(factor(df_scTB_noTB$study_site), ref="Mangaung") univariatemodel2 <- glm(scTB ~ study_site, family = binomial, data = df_scTB_noTB) tidy(univariatemodel2, exponentiate=TRUE, conf.int=TRUE) summary(univariatemodel2) #adjust for clustering #use coeftest function to get robust clustered standard errors univariatemodel2_adj_clustering <- coeftest(univariatemodel2, vcov = vcovCL, type = "HC1", cluster = ~ hhid, family=binomial) tidy(univariatemodel2_adj_clustering, exponentiate=TRUE, conf.int=TRUE) exp(-0.0468) exp(-0.651) exp(0.557) #Univariate regression analysis for scTB and sex (scTB vs no TB) df_scTB_noTB$h_sex <- relevel(factor(df_scTB_noTB$h_sex), ref="Male") univariatemodel3 <- glm(scTB ~ h_sex + study_site, family = binomial, data = df_scTB_noTB) tidy(univariatemodel3, exponentiate=TRUE, conf.int=TRUE) summary(univariatemodel3) #adjust for clustering #use coeftest function to get robust clustered standard errors univariatemodel3_adj_clustering <- coeftest(univariatemodel3, vcov = vcovCL, type = "HC1", cluster = ~ hhid, family=binomial) tidy(univariatemodel3_adj_clustering, exponentiate=TRUE, conf.int=TRUE) exp(0.495) exp(-0.170) exp(1.16) #Univariate regression analysis for scTB and age (scTB vs no TB) univariatemodel4 <- glm(scTB ~ h_age + study_site, family = binomial, data = df_scTB_noTB) tidy(univariatemodel4, exponentiate=TRUE, conf.int=TRUE) summary(univariatemodel4) #adjust for clustering #use coeftest function to get robust clustered standard errors univariatemodel4_adj_clustering <- coeftest(univariatemodel4, vcov = vcovCL, type = "HC1", cluster = ~ hhid, family=binomial) tidy(univariatemodel4_adj_clustering, exponentiate=TRUE, conf.int=TRUE) exp(-0.000901) exp(-0.0143) exp(0.0125) #Univariate regression analysis for scTB and index case HIV status (scTB vs noTB) #ensure NAs recognised #3 = HIV positive and 1 = HIV negative df_scTB_noTB <- df_scTB_noTB %>% mutate(i_hiv = ifelse(i_hiv == "NA",NA,i_hiv)) i_hiv<- factor(i_hiv) df_scTB_noTB$i_hiv <- relevel(factor(df_scTB_noTB$i_hiv), ref=1) univariatemodel5 <- glm(scTB ~ i_hiv + study_site, family = binomial, data = df_scTB_noTB) tidy(univariatemodel5, exponentiate=TRUE, conf.int=TRUE) summary(univariatemodel5) #adjust for clustering #use coeftest function to get robust clustered standard errors univariatemodel5_adj_clustering <- coeftest(univariatemodel5, vcov = vcovCL, type = "HC1", cluster = ~ hhid, family=binomial) tidy(univariatemodel5_adj_clustering, exponentiate=TRUE, conf.int=TRUE) exp( -0.00594) exp(-0.618) exp(0.606) #Univariate regression analysis for active TB and HIV (active TB vs noTB) attach(df_activeTB_noTB) univariatemodel6 <- glm(activeTB ~ binaryHIV + study_site, family = binomial, data = df_activeTB_noTB) tidy(univariatemodel6, exponentiate=TRUE, conf.int=TRUE) summary(univariatemodel6) #adjust for clustering #use coeftest function to get robust clustered standard errors univariatemodel6_adj_clustering <- coeftest(univariatemodel6, vcov = vcovCL, type = "HC1", cluster = ~ hhid, family=binomial) tidy(univariatemodel6_adj_clustering, exponentiate=TRUE, conf.int=TRUE) exp(1.53) exp( 0.655) exp(2.40) #Univariate regression analysis for active TB and province (active TB vs noTB) df_activeTB_noTB$study_site <- relevel(factor(df_activeTB_noTB$study_site), ref="Mangaung") univariatemodel7 <- glm(activeTB ~ study_site, family = binomial, data = df_activeTB_noTB) tidy(univariatemodel7, exponentiate=TRUE, conf.int=TRUE) summary(univariatemodel7) #adjust for clustering #use coeftest function to get robust clustered standard errors univariatemodel7_adj_clustering <- coeftest(univariatemodel7, vcov = vcovCL, type = "HC1", cluster = ~ hhid, family=binomial) tidy(univariatemodel7_adj_clustering, exponentiate=TRUE, conf.int=TRUE) exp(-0.329) exp(-1.25) exp(0.588) #Univariate regression analysis for active TB and sex (active TB vs noTB) df_activeTB_noTB$h_sex <- relevel(factor(df_activeTB_noTB$h_sex), ref="Male") univariatemodel8 <- glm(activeTB ~ h_sex + study_site, family = binomial, data = df_activeTB_noTB) tidy(univariatemodel8, exponentiate=TRUE, conf.int=TRUE) summary(univariatemodel8) #adjust for clustering #use coeftest function to get robust clustered standard errors univariatemodel8_adj_clustering <- coeftest(univariatemodel8, vcov = vcovCL, type = "HC1", cluster = ~ hhid, family=binomial) tidy(univariatemodel8_adj_clustering, exponentiate=TRUE, conf.int=TRUE) exp(0.0187) exp(-0.904) exp(0.941) #Univariate regression analysis for active TB and age as continuous variable (active TB vs noTB) univariatemodel9 <- glm(activeTB ~ h_age + study_site, family = binomial, data = df_activeTB_noTB) tidy(univariatemodel9, exponentiate=TRUE, conf.int=TRUE) summary(univariatemodel9) #adjust for clustering #use coeftest function to get robust clustered standard errors univariatemodel9_adj_clustering <- coeftest(univariatemodel9, vcov = vcovCL, type = "HC1", cluster = ~ hhid, family=binomial) tidy(univariatemodel9_adj_clustering, exponentiate=TRUE, conf.int=TRUE) exp(0.0212) exp(0.00435) exp(0.0380) #Univariate regression analysis for active TB and index case HIV status (active TB vs noTB) #ensure NAs recognised #3 = HIV positive and 1 = HIV negative df_activeTB_noTB <- df_activeTB_noTB %>% mutate(i_hiv = ifelse(i_hiv == "NA",NA,i_hiv)) i_hiv<- factor(i_hiv) df_activeTB_noTB$i_hiv <- relevel(factor(df_activeTB_noTB$i_hiv), ref=1) univariatemodel10 <- glm(activeTB ~ i_hiv + study_site, family = binomial, data = df_activeTB_noTB) tidy(univariatemodel10, exponentiate=TRUE, conf.int=TRUE) summary(univariatemodel10) #adjust for clustering #use coeftest function to get robust clustered standard errors univariatemodel10_adj_clustering <- coeftest(univariatemodel10, vcov = vcovCL, type = "HC1", cluster = ~ hhid, family=binomial) tidy(univariatemodel10_adj_clustering, exponentiate=TRUE, conf.int=TRUE) exp(-0.889) exp(-1.81) exp(0.0308) #Checking for multicollinearity #Generating VIF statistics #quick model with the pre-selected independent variables, to check VIF on attach(TBdataset) TBdataset <- TBdataset %>% mutate(i_hiv = ifelse(i_hiv == "NA",NA,i_hiv)) i_hiv<- factor(i_hiv) VIFmodel <- glm(scTB ~ study_site + h_sex + h_age + binaryHIV + i_hiv, family = binomial, data = TBdataset) vif(VIFmodel) #multivariable logistic regression scTB vs no TB, adjusted for clustering at household level attach(df_scTB_noTB) multivariable_model1 <- glm(scTB ~ study_site + h_sex + h_age + binaryHIV + i_hiv, family = binomial, data = df_scTB_noTB) tidy(multivariable_model1, exponentiate=TRUE, conf.int=TRUE) summary(multivariable_model1) multivariable_model1_robustclustered <- coeftest(multivariable_model1, vcov = vcovCL, type = "HC1", cluster = ~ hhid, family=binomial) tidy(multivariable_model1_robustclustered, conf.int = TRUE, exponentiate = TRUE) exp(0.0778) exp(-0.560) exp( 0.716) exp(0.360) exp(-0.324) exp(1.04) exp(-0.00448) exp(-0.0204) exp(0.0114) exp( 0.692) exp(-0.00556) exp(1.39) exp(-0.0698) exp(-0.704) exp(0.564) #multivariable logistic regression active TB vs no TB,adjusted for clustering at household level attach(df_activeTB_noTB) multivariable_model2 <- glm(activeTB ~ study_site + h_sex + h_age + binaryHIV + i_hiv, family = binomial, data = df_activeTB_noTB) tidy(multivariable_model2, exponentiate=TRUE, conf.int=TRUE) summary(multivariable_model2) multivariable_model2_robustclustered <- coeftest(multivariable_model2, vcov = vcovCL, type = "HC1", cluster = ~ hhid, family=binomial) tidy(multivariable_model2_robustclustered, conf.int = TRUE, exponentiate = TRUE) exp(-0.200) exp(-1.14) exp( 0.741) exp(-0.406) exp(-1.28) exp(0.473) exp(0.0210) exp(-0.000159) exp(0.0422) exp(1.62) exp(0.794) exp(2.45) exp(-1.02) exp(-1.93) exp(-0.111)