---
title: "Datacleaning"
author: "R Goto"
date: "2024-07-01"
output: html_document
---

## 3.1. Settings
```{r eval=FALSE}
library(tidyverse)
library(summarytools)
options(scipen = 10, digits=3) 
```

## 3.2. Dataset
**hhcons.csv** - see Section 1.3.3. (60 food item consumption in 3290HHs)
**hhafe.csv** - see Section 2.6.5. (total AFE in each HH)
```{r eval=FALSE}
hhcons <- read_csv("hhcons.csv")
hhafe <- read_csv("hhafe.csv")
```

## 3.3. Outliers
### 3.3.1. calculate food consumption amount in grams/day per AFE for 60 food items in HHs
```{r eval=FALSE}
# calculate food consumption in grams per day per AFE for each 60 food items 
afecons <- left_join(hhcons, hhafe, by='y4_hhid') # merge with AFE in hh

afecons <- afecons %>% 
  mutate(hh_g_afe=cons_g_hh/afehh/7) %>%  # reported consumption amount per AFE
  mutate(hh_g_afe_p = consp_g_hh/afehh/7) # purchased amount per AFE

# identify HHs which did not calculate food consumption amount in grams/day per AFE for 60 food items in HHs

count_hh_g_afe <- afecons %>% 
  filter(hh_g_afe>0) %>% 
  distinct(y4_hhid) %>% 
  mutate(n3279 = 1)

count_afecons <-  afecons %>% 
  distinct(y4_hhid) %>% 
  mutate(n3290 = 1)

list11 <- left_join(count_afecons, count_hh_g_afe, by = 'y4_hhid')

list11 <- list11 %>% 
  filter(is.na(n3279)) %>% 
  mutate(n11=1) %>% 
  select(y4_hhid, n11)

write_csv(afecons, "afecons.csv") # only hh food consumed

## check the differences between consumption and purchased amounts
# maize flour - check the difference between consumption (hh_g_afe) and purchased (hh_g_afe_p)

dif_maize <- afecons %>% 
  filter(itemcode==105) %>% 
  mutate(dif_maize=hh_g_afe - hh_g_afe_p)
view(dfSummary(dif_maize))

# wheat flour
dif_wheatf <- afecons %>% 
  filter(itemcode == 1081) %>% 
  mutate(dif_wheatf=hh_g_afe - hh_g_afe_p)
view(dfSummary(dif_wheatf))

# oils - check difference
dif_oil <- afecons %>% 
  filter(itemcode == 1001) %>% 
  mutate(dif_oil=hh_g_afe - hh_g_afe_p)
view(dfSummary(dif_oil))

# it can be assumed that almost no consumption of maize flour, wheat flour (not including wheat flour products) and oils were purchased. (reported non-purchased consumption was very low in maize flour (0.4% of number of consumption), wheat flour (0.3%) and oil (0.1%))

```

### 3.3.2. normalise the distribution using log10 transformation
```{r eval=FALSE}
afecons <- afecons %>% 
  mutate(lghh_g_afe=log10(hh_g_afe))
```

### 3.3.3. correct data

```{r eval=FALSE}
# case 1: #810 PACKAGE FISH (n=2) cannot calculate median to replace - one outlier in package fish (n=2) - replace unit
# check the value
packf <- afecons %>% 
  filter(itemcode==810 & cons_yn==1)

afecons$cons_unit[afecons$itemcode==810 & afecons$y4_hhid=='6897-001'] <- 2
afecons$conv_fac[afecons$itemcode==810 & afecons$y4_hhid=='6897-001'] <- 1
afecons$cons_g[afecons$itemcode==810 & afecons$y4_hhid=='6897-001'] <- 1500
afecons$cons_g_hh[afecons$itemcode==810 & afecons$y4_hhid=='6897-001'] <- 1500
afecons$hh_g_afe[afecons$itemcode==810 & afecons$y4_hhid=='6897-001'] <- 34.067 # 1500/6.29/7 = 34.067
afecons$lghh_g_afe[afecons$itemcode==810 & afecons$y4_hhid=='6897-001'] <- 1.53 # log10(34.067) = 1.53

# case 2: #702 CITRUS FRUITS in y4_hhid 5132-001 - no consumption but recorded as 0, not NA

afecons$cons_quant[afecons$y4_hhid=='5132-001'&afecons$itemcode=='702'] <- NA
afecons$cons_g[afecons$y4_hhid=='5132-001'&afecons$itemcode=='702'] <- NA
afecons$cons_g_hh[afecons$y4_hhid=='5132-001'&afecons$itemcode=='702'] <- NA
afecons$hh_g_afe[afecons$y4_hhid=='5132-001'&afecons$itemcode=='702'] <- NA
afecons$lghh_g_afe[afecons$y4_hhid=='5132-001'&afecons$itemcode=='702'] <- NA
afecons$cons_yn[afecons$y4_hhid=='5132-001'&afecons$itemcode=='702'] <- 2
```

### 3.3.4. get the summary to calculate cut-off point at +3SDs
```{r eval=FALSE}
dfcut <- afecons %>% 
  group_by(itemcode, itemname) %>%
  summarise(
    mean = mean(lghh_g_afe, na.rm = TRUE),
    median = median(lghh_g_afe, na.rm = TRUE),
    sd = sd(lghh_g_afe, na.rm = TRUE),
    max = max(lghh_g_afe, na.rm = TRUE)) 

dfcutoff <- dfcut %>% 
  mutate(sd3=mean+sd*3) %>% 
  select(itemcode, sd3)

```

### 3.3.5. check each values whether it is in or out from the cut-off point
```{r eval=FALSE}
dfout <- left_join(afecons, dfcutoff, by="itemcode")

dfinout <- dfout %>% 
  mutate(dif3 = sd3 - lghh_g_afe) %>% 
  mutate(inout3 = ifelse(dif3>0, 1, 2))  # in = 1, out = 2

view(dfSummary(dfinout))   # outliers 0.2%

```

### 3.3.6. count frequency of food consumption in the HHs

```{r eval=FALSE}
foodfreq <- dfinout %>% 
  group_by(itemcode, itemname) %>% 
  count(cons_yn) %>% 
  filter(cons_yn==1) %>% 
  arrange(desc(n))

```

### 3.3.7. calculate median to replace outliers
```{r eval=FALSE}
dfinout3 <- dfinout %>% 
  select(y4_hhid, itemcode, inout3)  # making a list of in and out from +3SDs

afeconsA <- left_join(afecons, dfinout3, by = c('y4_hhid', 'itemcode'))

repmed <- afeconsA %>% 
  filter(inout3==1)%>%   # within the cases of in, median of each food item was calculated
  group_by(itemcode) %>% 
  summarise(median = median(hh_g_afe, na.rm = TRUE))

afeconsB <- left_join(afeconsA, repmed, by = 'itemcode')

afeconsC <- afeconsB %>%
  mutate(hh_g_afe_c = if_else(inout3==1, hh_g_afe, if_else(inout3==2, median, NA))) # median replaced

afeconsD <- left_join(afeconsC, list11, by = 'y4_hhid') # pick up 11HHs
afeconsE <- afeconsD %>% 
  mutate(hh_g_afe_c1 = if_else(n11==1&cons_g_hh>0, median, NA))  # replaced median for food consumption

# combine values of hh_g_afe_c and hh_g_afe_c1 in one column g_afe
afeconsF <- afeconsE %>% 
  rowwise() %>% 
  mutate(g_afec=if_else(is.na(hh_g_afe_c)&is.na(hh_g_afe_c1), NA, sum(hh_g_afe_c, hh_g_afe_c1, na.rm=TRUE)))

# check the data distribution

ggplot(afeconsF, aes(x=as.factor(itemname), y=g_afec)) + 
  geom_boxplot(fill="slateblue", alpha=0.2) + 
  xlab("items") + ylab("consumption in grams per AFE") +
  coord_flip()

# organised cleaned dataset with food consumption in g per day per AFE (g_afec)
f60afec <- afeconsF %>% 
  select(y4_hhid, itemcode, itemname, cons_yn, afehh, g_afec)

write_csv(f60afec, "f60afec.csv")

```

### 3.3.6. count frequency of food consumption in the HHs

```{r eval=FALSE}
foodfreq <- dfinout %>% 
  group_by(itemcode, itemname) %>% 
  count(cons_yn) %>% 
  filter(cons_yn==1) %>% 
  arrange(desc(n))
```

### 3.3.7. calculate median to replace outliers to make a cleaned food consumption per AFE
```{r eval=FALSE}
dfinout3 <- dfinout %>% 
  select(y4_hhid, itemcode, inout3)  # making a list of in and out from +3SDs

afeconsA <- left_join(afecons, dfinout3, by = c('y4_hhid', 'itemcode'))

repmed <- afeconsA %>% 
  filter(inout3==1)%>%   # within the cases of in, median of each food item was calculated
  group_by(itemcode) %>% 
  summarise(median = median(hh_g_afe, na.rm = TRUE))

afeconsB <- left_join(afeconsA, repmed, by = 'itemcode')

afeconsC <- afeconsB %>%
  mutate(hh_g_afe_c = if_else(inout3==1, hh_g_afe, if_else(inout3==2, median, NA))) # median replaced

afeconsD <- left_join(afeconsC, list11, by = 'y4_hhid') # pick up 11HHs
afeconsE <- afeconsD %>% 
  mutate(hh_g_afe_c1 = if_else(n11==1&cons_g_hh>0, median, NA))  # replaced median for food consumption

# combine values of hh_g_afe_c and hh_g_afe_c1 in one column g_afe
afeconsF <- afeconsE %>% 
  rowwise() %>% 
  mutate(g_afec=if_else(is.na(hh_g_afe_c)&is.na(hh_g_afe_c1), NA, sum(hh_g_afe_c, hh_g_afe_c1, na.rm=TRUE)))

# check the data distribution

ggplot(afeconsF, aes(x=as.factor(itemname), y=g_afec)) + 
  geom_boxplot(fill="slateblue", alpha=0.2) + 
  xlab("items") + ylab("consumption in grams per AFE") +
  coord_flip()

```