## Summary descriptives ## ## Summary descriptives ## library(foreign) library(dplyr) library(ggplot2) library(scales) library (lubridate) library (tidyverse) library(plotly) library(readstata13) library(openxlsx) library(data.table) library(fs) library(readxl) library(stats) library(feathers) library(tibble) library(scales) library(tsModel) ; library("lmtest") ; library("Epi") library("splines") ; library("vcd") #install.packages("feathers") #### 1. Set working directory #### setwd("C://Users//suare//OneDrive//LSHTM//data compass BJOG") rm(list=ls(all=TRUE)) #### 2. Load data #### data<- read.xlsx("Data_and_workings_7th_Jun_2022_2.xlsx", 1) options(scipen=999) ## turns off scientific notation #### 3. Explore dataset #### dim(data) ## 264 85 names(data) head(data) str(data) length( unique(data$Countries_Territories)) ## 23 countries and territories ## Baseline characteristics ## ##Please note that some variables are only available for 2016-2019 in Australia ## ## and from 2020 and 2021 in Iran ## ##Sex## str(data$Sex_Male)# numeric str(data$Sex_Female) # numeric sex<- data %>% group_by(Countries_Territories) %>% dplyr::select(c( Total_LB_2,Sex_Male,Sex_Female)) %>% dplyr::summarise(sum_Total_LB_2=sum(Total_LB_2), sum_Sex_Male=sum(Sex_Male), sum_Sex_Female=sum(Sex_Female))%>% dplyr::mutate(Male_P=(sum_Sex_Male/sum_Total_LB_2*100), Female_P=(sum_Sex_Female/sum_Total_LB_2*100)) ## Schooling ## str(data$Primary_Secondary) # data$Primary_Secondary<-as.numeric(data$Primary_Secondary) str(data$Upper_secondary_professional) data$Upper_secondary_professional<-as.numeric(data$Upper_secondary_professional) str(data$Bachelors_above) data$Bachelors_above<-as.numeric(data$Bachelors_above) str(data$Missing_education) schooling<- data %>% group_by(Countries_Territories) %>% dplyr::select(c( Total_LB_2,Primary_Secondary,Upper_secondary_professional, Bachelors_above,Missing_education)) %>% dplyr::summarise(sum_Total_LB_2=sum(Total_LB_2), sum_Primary_Secondary=sum(Primary_Secondary), sum_Upper_secondary_professional=sum(Upper_secondary_professional), sum_Bachelors_above=sum(Bachelors_above), sum_Missing_education=sum(Missing_education))%>% dplyr::mutate(Primary_Secondary_P=(sum_Primary_Secondary/sum_Total_LB_2*100), Upper_secondary_professional_P=(sum_Upper_secondary_professional/sum_Total_LB_2*100), Bachelors_above_P=(sum_Bachelors_above/sum_Total_LB_2*100), Missing_education_P=(sum_Missing_education/sum_Total_LB_2*100)) #view(schooling) ### Maternal age ### str(data$Age._below_15y) # data$Age._below_15y<-as.numeric(data$Age._below_15y) str(data$Age_15_19y) str(data$Age_20_24y) str(data$Age_25_29y) str(data$Age_30_34y) str(data$Age_35_39y) str(data$Age_above_equal_40y) str(data$Missing_age_mother) data$Missing_age_mother<-as.numeric(data$Missing_age_mother) age_mother<- data %>% group_by(Countries_Territories) %>% dplyr::select(c( Total_LB_2,Age._below_15y,Age_15_19y,Age_20_24y,Age_25_29y,Age_30_34y,Age_35_39y, Age_above_equal_40y, Missing_age_mother)) %>% dplyr::summarise(sum_Total_LB_2=sum(Total_LB_2), sum_Age_below_15y=sum(Age._below_15y), sum_Age_15_19y=sum(Age_15_19y), sum_Age_20_24y=sum(Age_20_24y), sum_Age_25_29y=sum(Age_25_29y), sum_Age_30_34y=sum(Age_30_34y), sum_Age_35_39y=sum(Age_35_39y), sum_Age_above_equal_40y=sum(Age_above_equal_40y), sum_Missing_age_mother=sum(Missing_age_mother))%>% dplyr::mutate(Age_below_or_equal_19y_P =((sum_Age_below_15y+sum_Age_15_19y)/sum_Total_LB_2*100), Age_20_34y_P=((sum_Age_20_24y+ sum_Age_25_29y+ sum_Age_30_34y)/sum_Total_LB_2*100), Age_above_or_equal_35_P=((sum_Age_35_39y+ sum_Age_above_equal_40y)/sum_Total_LB_2*100), Missing_Age= (sum_Missing_age_mother/sum_Total_LB_2*100)) #view(age_mother) ## Place of delivery ## str(data$Delivery_outside_facility) # str(data$Delivery_health_facility) str(data$Missing_place_delivery) data$Missing_place_delivery<-as.numeric(data$Missing_place_delivery) Place_of_delivery<- data %>% group_by(Countries_Territories) %>% dplyr::select(c( Total_LB_2,Delivery_outside_facility,Delivery_health_facility, Missing_place_delivery)) %>% dplyr::summarise(sum_Total_LB_2=sum(Total_LB_2), sum_Delivery_outside_facility=sum(Delivery_outside_facility), sum_Delivery_health_facility=sum(Delivery_health_facility), sum_Missing_place_delivery=sum(Missing_place_delivery))%>% dplyr::mutate(Delivery_outside_facility_P=(sum_Delivery_outside_facility/sum_Total_LB_2*100), Delivery_health_facility_P=(sum_Delivery_health_facility/sum_Total_LB_2*100), Missing_place_delivery_P=(sum_Missing_place_delivery/sum_Total_LB_2*100)) #view(Place_of_delivery) ## Mode of delivery ## str(data$Vaginal_delivery) # str(data$Cesarean_section) str(data$Misisng_type_delivery) Mode_of_delivery<- data %>% group_by(Countries_Territories) %>% dplyr::select(c( Total_LB_2,Vaginal_delivery,Cesarean_section, Misisng_type_delivery)) %>% dplyr::summarise(sum_Total_LB_2=sum(Total_LB_2), sum_Vaginal_delivery=sum(Vaginal_delivery), sum_Cesarean_section=sum(Cesarean_section), sum_Misisng_type_delivery=sum(Misisng_type_delivery))%>% dplyr::mutate(Vaginal_delivery_P=(sum_Vaginal_delivery/sum_Total_LB_2*100), Cesarean_section_P=(sum_Cesarean_section/sum_Total_LB_2*100), Misisng_type_delivery_P=(sum_Misisng_type_delivery/sum_Total_LB_2*100)) #view(Mode_of_delivery) ## Parity ## str(data$Parity_1) # str(data$Parity_2) str(data$Parity_3) str(data$Parity_4) str(data$Parity_5_more) str(data$Missing_parity) Parity<- data %>% group_by(Countries_Territories) %>% dplyr::select(c( Total_LB_2,Parity_1,Parity_2, Parity_3,Parity_4,Parity_5_more, Missing_parity)) %>% dplyr::summarise(sum_Total_LB_2=sum(Total_LB_2), sum_Parity_1=sum(Parity_1), sum_Parity_2=sum(Parity_2), sum_Parity_3=sum(Parity_3), sum_Parity_4=sum(Parity_4), sum_Parity_5_more=sum(Parity_5_more), sum_Missing_parity=sum(Missing_parity))%>% dplyr::mutate(Parity_1_P=(sum_Parity_1/sum_Total_LB_2*100), Parity_2_P=(sum_Parity_2/sum_Total_LB_2*100), Parity_3_more_P=((sum_Parity_3+ sum_Parity_4 +sum_Parity_5_more)/sum_Total_LB_2*100), Missing_parity_P=(sum_Missing_parity/sum_Total_LB_2*100)) #view(Parity) ## Multiplicity ## str(data$Singleton) # str(data$Twins) str(data$Triplets_higher) str(data$Missing_multiple) Multiplicity<- data %>% group_by(Countries_Territories) %>% dplyr::select(c( Total_LB_2,Singleton,Twins, Triplets_higher,Missing_multiple)) %>% dplyr::summarise(sum_Total_LB_2=sum(Total_LB_2), sum_Singleton=sum(Singleton), sum_Twins=sum(Twins), sum_Triplets_higher=sum(Triplets_higher), sum_Missing_multiple=sum(Missing_multiple))%>% dplyr::mutate(Singleton_P=(sum_Singleton/sum_Total_LB_2*100), Twins_P=(sum_Twins/sum_Total_LB_2*100), Triplets_higher_P=(sum_Triplets_higher/sum_Total_LB_2*100), Missing_multiple_P=(sum_Missing_multiple/sum_Total_LB_2*100)) #view(Multiplicity) #### 4. Assess quality variables , by country #### sum(data$Total_LB_2) # 169,906,956 total livebirths QA<- data %>% group_by(Countries_Territories) %>% dplyr::select(c( Total_LB_2,LB._Below_500g,LB_Below_1000g, LB._Below_28_w, Missing_BW_only, Missing_GA_only, Implausible_BW, Heaping_index)) %>% dplyr::summarise(sum_Total_LB_2=sum(Total_LB_2), sum_LB._Below_500g=sum(LB._Below_500g), sum_LB_Below_1000g=sum(LB_Below_1000g), sum_LB._Below_28_w=sum(LB._Below_28_w), sum_Missing_BW_only=sum(Missing_BW_only), sum_Missing_GA_only=sum(Missing_GA_only), sum_Implausible_BW=sum(Implausible_BW), Heaping_index=mean(Heaping_index)) Table_2<-QA %>% mutate_at(vars(sum_Total_LB_2:sum_Implausible_BW) , funs(P = ./QA$sum_Total_LB_2 * 100)) #view(Table_2) #write.csv(Table_2,"Table_2.csv", row.names = FALSE) #### 5. Calculate prevalence by ten vulnerable newborn types, by country #### ## Check these variables as numeric variables ## str(data$SGA_T_NBW) str(data$AGA_T_NBW) str(data$LGA_T_NBW) str(data$SGA_T_LBW) data$SGA_T_LBW<-as.numeric(data$SGA_T_LBW) str(data$AGA_T_LBW) str(data$LGA_T_LBW) str(data$SGA_PT_NBW) str(data$AGA_PT_NBW) str(data$LGA_PT_NBW) str(data$SGA_PT_LBW) str(data$AGA_PT_LBW) str(data$LGA_PT_LBW) ### Livebirths with phenotypes calculated ### data$total_livebirths_phenotypes <- rowSums(data[ , c(72:81)], na.rm=TRUE) sum(data$total_livebirths_phenotypes) # 165017419 str(data$total_livebirths_phenotypes) ### Calculate prevalence by 10 newborn types, by country ### phenotypes<- data %>% group_by(Countries_Territories) %>% dplyr::select(c( total_livebirths_phenotypes, SGA_PT_LBW, AGA_PT_LBW, AGA_PT_NBW, LGA_PT_LBW, LGA_PT_NBW, SGA_T_LBW, SGA_T_NBW, AGA_T_LBW, AGA_T_NBW, LGA_T_NBW)) %>% dplyr::summarise(sum_total_livebirths_phenotypes=sum(total_livebirths_phenotypes), sum_SGA_PT_LBW=sum(SGA_PT_LBW), sum_AGA_PT_LBW=sum(AGA_PT_LBW), sum_AGA_PT_NBW=sum(AGA_PT_NBW), sum_LGA_PT_LBW=sum(LGA_PT_LBW), sum_LGA_PT_NBW=sum(LGA_PT_NBW), sum_SGA_T_LBW=sum(SGA_T_LBW), sum_SGA_T_NBW=sum(SGA_T_NBW), sum_AGA_T_LBW=sum(AGA_T_LBW), sum_AGA_T_NBW=sum(AGA_T_NBW), sum_LGA_T_NBW=sum(LGA_T_NBW)) #view(phenotypes) Table_2<-QA %>% mutate_at(vars(sum_Total_LB_2:sum_Implausible_BW) , funs(P = ./QA$sum_Total_LB_2 * 100)) Table_10_phenotypes_by_country<-phenotypes %>% mutate_at(vars(sum_total_livebirths_phenotypes:sum_LGA_T_NBW) , funs(P = ./phenotypes$sum_total_livebirths_phenotypes * 100)) #view(Table_10_phenotypes_by_country) #write.csv(Table_10_phenotypes,"Table_ten_phenotypes_by_country.csv", row.names = FALSE) #### 6. Calculate prevalence by six newborn types, by country #### phenotypes_six<- data %>% group_by(Countries_Territories) %>% dplyr::select(c( total_livebirths_phenotypes, SGA_PT_LBW, AGA_PT_LBW, AGA_PT_NBW, LGA_PT_LBW, LGA_PT_NBW, SGA_T_LBW, SGA_T_NBW, AGA_T_LBW, AGA_T_NBW, LGA_T_NBW)) %>% dplyr::summarise(sum_total_livebirths_phenotypes=sum(total_livebirths_phenotypes), sum_SGA_PT=sum(SGA_PT_LBW), sum_AGA_PT=sum(AGA_PT_LBW) + sum(AGA_PT_NBW), sum_LGA_PT=sum(LGA_PT_LBW) + sum(LGA_PT_NBW), sum_SGA_T=sum(SGA_T_LBW) + sum(SGA_T_NBW), sum_AGA_T=sum(AGA_T_LBW) + sum(AGA_T_NBW), sum_LGA_T=sum(LGA_T_NBW)) Table_six_phenotypes_by_country<-phenotypes_six %>% mutate_at(vars(sum_total_livebirths_phenotypes:sum_LGA_T) , funs(P = ./phenotypes_six$sum_total_livebirths_phenotypes * 100)) #write.csv(Table_six_phenotypes_by_country,"Table_six_phenotypes_by_country.csv", row.names = FALSE) #view(Table_six_phenotypes_by_country) ### Box plots with median prevalence, by six types and country ### data_test<-as.data.frame(Table_six_phenotypes_by_country) names(data_test) str(data_test) dim(data_test) ## 23 15 data_test<- data_test %>% dplyr::select(c(Countries_Territories, sum_SGA_PT_P, sum_AGA_PT_P, sum_LGA_PT_P, sum_SGA_T_P, sum_AGA_T_P, sum_LGA_T_P)) colnames(data_test)<- c("country", "PT+SGA", "PT+AGA", "PT+LGA", "T+SGA", "T+AGA", "T+LGA") ## to large data_2 <- melt(data_test, id.vars ="country", variable.name = "types", value.name = "percentage", variable.factor = TRUE) ##### Median and IQR of national prevalence of six types ### g1<-ggplot(data_2, aes(x=types, y=percentage, color=types)) + geom_boxplot() + labs(x="Newborn type", y="Percentage of live births (%)")+ stat_summary(fun=median, geom="point", col="black")+ stat_summary(fun=median, geom="text", col="black", vjust=1.5, aes(label=paste("", round(..y..,digits=1 )))) g1+scale_color_manual(values = c("T+LGA"="#9E8960", "T+AGA"="#92AA70", "T+SGA"="#E29F5C", "PT+LGA"="#570951", "PT+AGA"="#FC08C8", "PT+SGA" = "#FFA3FF"))+ theme(panel.background = element_blank()) + theme(axis.line.x = element_line(colour = 'black', size=0.5, linetype='solid'), axis.line.y = element_line(colour = 'black', size=0.5, linetype='solid')) ## Plot the prevalence of six VN types, by country ## g2<-ggplot(data_2, aes(fill=factor(types, levels=c("T+LGA","T+AGA","T+SGA","PT+LGA","PT+AGA","PT+SGA")), y=percentage, x=country)) + geom_bar(position='fill', stat='identity') + theme_minimal() + labs(x='Countries', y='Percentage of livebirths (%)', title='Prevalence of six types by country') + theme(plot.title = element_text(hjust=0.5, size=20, face='bold')) + scale_fill_manual('types', values = c( "T+LGA"="#9E8960", "T+AGA"="#92AA70", "T+SGA"="#E29F5C", "PT+LGA"="#570951", "PT+AGA"="#FC08C8", "PT+SGA" = "#FFA3FF")) g2 + scale_y_continuous(labels = percent) #### 7. Estimate Temporal trends #### data_test<- data %>% group_by(Countries_Territories) %>% select(c(SDGRegionrev1, Year, SGA_PT_LBW, AGA_PT_LBW, AGA_PT_NBW, LGA_PT_LBW, LGA_PT_NBW, SGA_T_LBW, SGA_T_NBW, AGA_T_LBW, LGA_T_NBW, total_livebirths_phenotypes)) ## As data frame data_test<-as.data.frame(data_test) data_test$small <- rowSums(data_test[ , c(4:11)], na.rm=TRUE) names(data_test) #view(data_test) ## calculate percentage for small and large data_test<-data_test %>% mutate_at(vars(LGA_T_NBW:small) , funs(P = ./data_test$total_livebirths_phenotypes * 100)) ## Exclude countries with less than 3 years of observation ## data_test <-data_test %>% filter (Countries_Territories!="Argentina" ) data_test <-data_test %>% filter(Countries_Territories!="CzechRepublic") data_test <-data_test %>% filter(Countries_Territories!="Northern Ireland") ## Plot small vulnerable babies (small: preterm or SGA) ### theme_object <- theme_classic(base_family = "serif", base_size = 15) str(data_test$Year) data_test$Year<-as.numeric(data_test$Year) g3<-data_test %>% group_by(Countries_Territories) %>% mutate(values = small_P, Rolling_Average = rollmean(x = Year, k = 3, align = "right", fill = NA))%>% ungroup() %>% ggplot(aes(x = Year, y = small_P)) + geom_line(alpha = 0) + geom_smooth(aes(color=as.factor(Countries_Territories)), method='loess', se=TRUE) g3+ scale_y_continuous(breaks = seq(0,30,10),limits = c(0,30)) + scale_x_continuous(breaks = seq(2000,2020,5),limits = c(2000,2020)) g3+ theme_object g3 + facet_wrap(~ SDGRegionrev1+Countries_Territories) ## Large babies (Term+LGA babies) g4<-data_test %>% group_by(Countries_Territories) %>% mutate(values = LGA_T_NBW_P, Rolling_Average = rollmean(x = Year, k = 3, align = "right", fill = NA))%>% ungroup() %>% ggplot(aes(x = Year, y = LGA_T_NBW_P)) + geom_line(alpha = 0) + geom_smooth(aes(color=as.factor(Countries_Territories)), method='loess', se=TRUE) g4+ scale_y_continuous(breaks = seq(0,30,10),limits = c(0,30)) + scale_x_continuous(breaks = seq(2000,2020,5),limits = c(2000,2020)) g4+ theme_object g4 + facet_wrap(~ SDGRegionrev1+Countries_Territories) ## End ##