--- title: "Datacleaning" author: "R Goto" date: "2024-07-01" output: html_document --- ## 3.1. Settings ```{r eval=FALSE} library(tidyverse) library(summarytools) options(scipen = 10, digits=3) ``` ## 3.2. Dataset **hhcons.csv** - see Section 1.3.3. (60 food item consumption in 3290HHs) **hhafe.csv** - see Section 2.6.5. (total AFE in each HH) ```{r eval=FALSE} hhcons <- read_csv("hhcons.csv") hhafe <- read_csv("hhafe.csv") ``` ## 3.3. Outliers ### 3.3.1. calculate food consumption amount in grams/day per AFE for 60 food items in HHs ```{r eval=FALSE} # calculate food consumption in grams per day per AFE for each 60 food items afecons <- left_join(hhcons, hhafe, by='y4_hhid') # merge with AFE in hh afecons <- afecons %>% mutate(hh_g_afe=cons_g_hh/afehh/7) %>% # reported consumption amount per AFE mutate(hh_g_afe_p = consp_g_hh/afehh/7) # purchased amount per AFE # identify HHs which did not calculate food consumption amount in grams/day per AFE for 60 food items in HHs count_hh_g_afe <- afecons %>% filter(hh_g_afe>0) %>% distinct(y4_hhid) %>% mutate(n3279 = 1) count_afecons <- afecons %>% distinct(y4_hhid) %>% mutate(n3290 = 1) list11 <- left_join(count_afecons, count_hh_g_afe, by = 'y4_hhid') list11 <- list11 %>% filter(is.na(n3279)) %>% mutate(n11=1) %>% select(y4_hhid, n11) write_csv(afecons, "afecons.csv") # only hh food consumed ## check the differences between consumption and purchased amounts # maize flour - check the difference between consumption (hh_g_afe) and purchased (hh_g_afe_p) dif_maize <- afecons %>% filter(itemcode==105) %>% mutate(dif_maize=hh_g_afe - hh_g_afe_p) view(dfSummary(dif_maize)) # wheat flour dif_wheatf <- afecons %>% filter(itemcode == 1081) %>% mutate(dif_wheatf=hh_g_afe - hh_g_afe_p) view(dfSummary(dif_wheatf)) # oils - check difference dif_oil <- afecons %>% filter(itemcode == 1001) %>% mutate(dif_oil=hh_g_afe - hh_g_afe_p) view(dfSummary(dif_oil)) # it can be assumed that almost no consumption of maize flour, wheat flour (not including wheat flour products) and oils were purchased. (reported non-purchased consumption was very low in maize flour (0.4% of number of consumption), wheat flour (0.3%) and oil (0.1%)) ``` ### 3.3.2. normalise the distribution using log10 transformation ```{r eval=FALSE} afecons <- afecons %>% mutate(lghh_g_afe=log10(hh_g_afe)) ``` ### 3.3.3. correct data ```{r eval=FALSE} # case 1: #810 PACKAGE FISH (n=2) cannot calculate median to replace - one outlier in package fish (n=2) - replace unit # check the value packf <- afecons %>% filter(itemcode==810 & cons_yn==1) afecons$cons_unit[afecons$itemcode==810 & afecons$y4_hhid=='6897-001'] <- 2 afecons$conv_fac[afecons$itemcode==810 & afecons$y4_hhid=='6897-001'] <- 1 afecons$cons_g[afecons$itemcode==810 & afecons$y4_hhid=='6897-001'] <- 1500 afecons$cons_g_hh[afecons$itemcode==810 & afecons$y4_hhid=='6897-001'] <- 1500 afecons$hh_g_afe[afecons$itemcode==810 & afecons$y4_hhid=='6897-001'] <- 34.067 # 1500/6.29/7 = 34.067 afecons$lghh_g_afe[afecons$itemcode==810 & afecons$y4_hhid=='6897-001'] <- 1.53 # log10(34.067) = 1.53 # case 2: #702 CITRUS FRUITS in y4_hhid 5132-001 - no consumption but recorded as 0, not NA afecons$cons_quant[afecons$y4_hhid=='5132-001'&afecons$itemcode=='702'] <- NA afecons$cons_g[afecons$y4_hhid=='5132-001'&afecons$itemcode=='702'] <- NA afecons$cons_g_hh[afecons$y4_hhid=='5132-001'&afecons$itemcode=='702'] <- NA afecons$hh_g_afe[afecons$y4_hhid=='5132-001'&afecons$itemcode=='702'] <- NA afecons$lghh_g_afe[afecons$y4_hhid=='5132-001'&afecons$itemcode=='702'] <- NA afecons$cons_yn[afecons$y4_hhid=='5132-001'&afecons$itemcode=='702'] <- 2 ``` ### 3.3.4. get the summary to calculate cut-off point at +3SDs ```{r eval=FALSE} dfcut <- afecons %>% group_by(itemcode, itemname) %>% summarise( mean = mean(lghh_g_afe, na.rm = TRUE), median = median(lghh_g_afe, na.rm = TRUE), sd = sd(lghh_g_afe, na.rm = TRUE), max = max(lghh_g_afe, na.rm = TRUE)) dfcutoff <- dfcut %>% mutate(sd3=mean+sd*3) %>% select(itemcode, sd3) ``` ### 3.3.5. check each values whether it is in or out from the cut-off point ```{r eval=FALSE} dfout <- left_join(afecons, dfcutoff, by="itemcode") dfinout <- dfout %>% mutate(dif3 = sd3 - lghh_g_afe) %>% mutate(inout3 = ifelse(dif3>0, 1, 2)) # in = 1, out = 2 view(dfSummary(dfinout)) # outliers 0.2% ``` ### 3.3.6. count frequency of food consumption in the HHs ```{r eval=FALSE} foodfreq <- dfinout %>% group_by(itemcode, itemname) %>% count(cons_yn) %>% filter(cons_yn==1) %>% arrange(desc(n)) ``` ### 3.3.7. calculate median to replace outliers ```{r eval=FALSE} dfinout3 <- dfinout %>% select(y4_hhid, itemcode, inout3) # making a list of in and out from +3SDs afeconsA <- left_join(afecons, dfinout3, by = c('y4_hhid', 'itemcode')) repmed <- afeconsA %>% filter(inout3==1)%>% # within the cases of in, median of each food item was calculated group_by(itemcode) %>% summarise(median = median(hh_g_afe, na.rm = TRUE)) afeconsB <- left_join(afeconsA, repmed, by = 'itemcode') afeconsC <- afeconsB %>% mutate(hh_g_afe_c = if_else(inout3==1, hh_g_afe, if_else(inout3==2, median, NA))) # median replaced afeconsD <- left_join(afeconsC, list11, by = 'y4_hhid') # pick up 11HHs afeconsE <- afeconsD %>% mutate(hh_g_afe_c1 = if_else(n11==1&cons_g_hh>0, median, NA)) # replaced median for food consumption # combine values of hh_g_afe_c and hh_g_afe_c1 in one column g_afe afeconsF <- afeconsE %>% rowwise() %>% mutate(g_afec=if_else(is.na(hh_g_afe_c)&is.na(hh_g_afe_c1), NA, sum(hh_g_afe_c, hh_g_afe_c1, na.rm=TRUE))) # check the data distribution ggplot(afeconsF, aes(x=as.factor(itemname), y=g_afec)) + geom_boxplot(fill="slateblue", alpha=0.2) + xlab("items") + ylab("consumption in grams per AFE") + coord_flip() # organised cleaned dataset with food consumption in g per day per AFE (g_afec) f60afec <- afeconsF %>% select(y4_hhid, itemcode, itemname, cons_yn, afehh, g_afec) write_csv(f60afec, "f60afec.csv") ``` ### 3.3.6. count frequency of food consumption in the HHs ```{r eval=FALSE} foodfreq <- dfinout %>% group_by(itemcode, itemname) %>% count(cons_yn) %>% filter(cons_yn==1) %>% arrange(desc(n)) ``` ### 3.3.7. calculate median to replace outliers to make a cleaned food consumption per AFE ```{r eval=FALSE} dfinout3 <- dfinout %>% select(y4_hhid, itemcode, inout3) # making a list of in and out from +3SDs afeconsA <- left_join(afecons, dfinout3, by = c('y4_hhid', 'itemcode')) repmed <- afeconsA %>% filter(inout3==1)%>% # within the cases of in, median of each food item was calculated group_by(itemcode) %>% summarise(median = median(hh_g_afe, na.rm = TRUE)) afeconsB <- left_join(afeconsA, repmed, by = 'itemcode') afeconsC <- afeconsB %>% mutate(hh_g_afe_c = if_else(inout3==1, hh_g_afe, if_else(inout3==2, median, NA))) # median replaced afeconsD <- left_join(afeconsC, list11, by = 'y4_hhid') # pick up 11HHs afeconsE <- afeconsD %>% mutate(hh_g_afe_c1 = if_else(n11==1&cons_g_hh>0, median, NA)) # replaced median for food consumption # combine values of hh_g_afe_c and hh_g_afe_c1 in one column g_afe afeconsF <- afeconsE %>% rowwise() %>% mutate(g_afec=if_else(is.na(hh_g_afe_c)&is.na(hh_g_afe_c1), NA, sum(hh_g_afe_c, hh_g_afe_c1, na.rm=TRUE))) # check the data distribution ggplot(afeconsF, aes(x=as.factor(itemname), y=g_afec)) + geom_boxplot(fill="slateblue", alpha=0.2) + xlab("items") + ylab("consumption in grams per AFE") + coord_flip() ```