/*Program 1: Compiling all translated survey responses into one working dataset

-Download responses from Typeform.com
-Import responses to Stata with no variable header and save raw data as Stata file
-Seperate the languages into seperate Stata files and save as seperate Excel files
-Upload each language Excel file to Google Sheets to create common variable names, confirm presence of all variables, and translate variables where necesseary.
-Download each Google Sheet as Excel File and import each language file into Stata 
-Rename variables with common names and append all languages together
-Save working doc. for use in Program 2 : "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\CCC analysis\Datasets by language\all languages appended.dta"


1. Delete all documents in C:\Users\rache\Desktop\CCC Analysis and all XLSX files in C:\Users\rache\Downloads

2. Unblock all "export" lines in this Do-File

3. Clear google drive CCC-analysis folder
*/

//Downloading all Typeform responses - converting into Stata file - saving to LSHTM OneDrive

cd "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets"

clear
import excel "C:\Users\rache\Downloads\responses (4).xlsx", sheet("tm6bWbP2")
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analys> is\Built Datasets\raw.dta", clear	 
			 
			 
//Seperating the languages into seperate Stata files

//En
use raw.dta, clear
sort B
keep if B=="Hello"
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Languages\English.dta", replace


//Es
use raw.dta, clear
sort B
keep if B=="Hola"
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Languages\Spanish.dta", replace

//Fr
use raw.dta, clear
sort B
keep if B=="Bonjour"
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Languages\French.dta", replace

//Sh
use raw.dta, clear
sort B
keep if B=="Niaje"
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Languages\Sheng.dta", replace

//Hi
use raw.dta, clear
sort B
keep if B=="नमस्ते"
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Languages\Hindi.dta", replace

//Ur
use raw.dta, clear
sort B
keep if B=="ہیلو"
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Languages\Urdu.dta", replace

//It
use raw.dta, clear
sort B
keep if B=="Ciao"
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Languages\Italian.dta", replace

//Bn
use raw.dta, clear
sort B
keep if B=="হ্যালো"
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Languages\Bangla.dta", replace

//Ar
use raw.dta, clear
sort B
keep if B=="أهلا"
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Languages\Arabic.dta", replace

use raw.dta, clear
sort B
keep if B=="你好"
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Languages\Chinese.dta", replace

//Creating Excel files by language for translation - renaming and consolidating 'Live' with 'Live - other'

/*make sure to delete all CCC Analysis files before running

1. Check that each language file has all variables at this stage (add in blank columns where needed)
*/

cd "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Languages"


//En
use "English.dta", clear
list //verify that variables match list below
drop BE-TU
rename (A	B	C	D	E	F	G	H	I	J	K	L	M	N	O	P	Q	R	S	T	U	V	W	X	Y	Z	AA	AB	AC	AD	AE	AF	AG	AH	AI	AJ	AK	AL	AM	AN	AO	AP	AQ	AR	AS	AT	AU	AV	AW	AX	AY	AZ	BA	BB	BC	BD TV TW TX TY TZ UA UB UC UD) (ID	lang	eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs	age	preg	live1	live2	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments utm_source	utm_medium	utm_campaign	variable_0 utm_term	utm_content	Start_Date 	Submit_Date Network_ID)
replace live1 = live2 if live1=="" 
drop live2
rename live1 live
save "English.dta", replace

export excel ID-Network_ID using "English.xls", replace

//Es
use "Spanish.dta", clear
list
drop C-BD //drop En survey
drop DG-TU //drop all other surveys
rename (A	B	BE	BF	BG	BH	BI	BJ	BK	BL	BM	BN	BO	BP	BQ	BR	BS	BT	BU	BV	BW	BX	BY	BZ	CA	CB	CC	CD	CE	CF	CG	CH	CI	CJ	CK	CL	CM	CN	CO	CP	CQ	CR	CS	CT	CU	CV	CW	CX	CY	CZ	DA	DB	DC	DD	DE	DF TV TW TX TY TZ UA UB UC UD) (ID	lang	eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs	age	preg	live1	live2	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
replace live1 = live2 if live1=="" 
drop live2
rename live1 live
save "Spanish.dta", replace

export excel ID-Network_ID using "Spanish.xls", replace

//Fr
use "French.dta", clear
list
drop C-DF //drop En Es
drop FI-TU //drop all other surveys
rename (A B DG	DH	DI	DJ	DK	DL	DM	DN	DO	DP	DQ	DR	DS	DT	DU	DV	DW	DX	DY	DZ	EA	EB	EC	ED	EE	EF	EG	EH	EI	EJ	EK	EL	EM	EN	EO	EP	EQ	ER	ES	ET	EU	EV	EW	EX	EY	EZ	FA	FB	FC	FD	FE	FF	FG	FH TV TW TX TY TZ UA UB UC UD) (ID	lang	eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs	age	preg	live1	live2	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
replace live1 = live2 if live1=="" 
drop live2
rename live1 live
save "French.dta", replace

export excel ID-Network_ID using "French.xls", replace

//Sh
//will have to add in GU as air_quality if anyone answers with this 
use "Sheng.dta", clear
list
drop C-FH //drop all other surveys
drop HJ-TU //drop all other surveys
rename (A B FI	FJ	FK	FL	FM	FN	FO	FP	FQ	FR	FS	FT	FU	FV	FW	FX	FY	FZ	GA	GB	GC	GD	GE	GF	GG	GH	GI	GJ	GK	GL	GM	GN	GO	GP	GQ	GR	GS	GT	GU	GV	GW	GX	GY	GZ	HA	HB	HC	HD	HE	HF	HG	HH	HI TV TW TX TY TZ UA UB UC UD) (ID	lang	eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs	age	preg	live1	live2	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
replace live1 = live2 if live1=="" 
drop live2
rename live1 live
save "Sheng.dta", replace

export excel ID-Network_ID using "Sheng.xls", replace

//Hi
use "Hindi.dta", clear
list
drop C-HI //drop all other surveys
drop JL-TU //drop all other surveys
rename (A B HJ	HK	HL	HM	HN	HO	HP	HQ	HR	HS	HT	HU	HV	HW	HX	HY	HZ	IA	IB	IC	ID	IE	IF	IG	IH	II	IJ	IK	IL	IM	IN	IO	IP	IQ	IR	IS	IT	IU	IV	IW	IX	IY	IZ	JA	JB	JC	JD	JE	JF	JG	JH	JI	JJ	JK TV TW TX TY TZ UA UB UC UD) (ID	lang	eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs	age	preg	live1	live2	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
replace live1 = live2 if live1=="" 
drop live2
rename live1 live
save "Hindi.dta", replace

export excel ID-Network_ID using "Hindi.xls", replace

//Ur
use "Urdu.dta", clear
list
drop C-JK //drop all other surveys
drop LN-TU //drop all other surveys
rename (A B JL	JM	JN	JO	JP	JQ	JR	JS	JT	JU	JV	JW	JX	JY	JZ	KA	KB	KC	KD	KE	KF	KG	KH	KI	KJ	KK	KL	KM	KN	KO	KP	KQ	KR	KS	KT	KU	KV	KW	KX	KY	KZ	LA	LB	LC	LD	LE	LF	LG	LH	LI	LJ	LK	LL	LM TV TW TX TY TZ UA UB UC UD) (ID	lang	eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs	age	preg	live1	live2	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
replace live1 = live2 if live1=="" 
drop live2
rename live1 live
save "Urdu.dta", replace

export excel ID-Network_ID using "Urdu.xls", replace

//It
use "Italian.dta", clear
list
drop C-LM //drop all other surveys
drop NP-TU //drop all other surveys
rename (A B LN	LO	LP	LQ	LR	LS	LT	LU	LV	LW	LX	LY	LZ	MA	MB	MC	MD	ME	MF	MG	MH	MI	MJ	MK	ML	MM	MN	MO	MP	MQ	MR	MS	MT	MU	MV	MW	MX	MY	MZ	NA	NB	NC	ND	NE	NF	NG	NH	NI	NJ	NK	NL	NM	NN	NO TV TW TX TY TZ UA UB UC UD ) (ID	lang	eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs	age	preg	live1	live2	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
replace live1 = live2 if live1=="" 
drop live2
rename live1 live
save "Italian.dta", replace

export excel ID-Network_ID using "Italian.xls", replace

//Bn
use "Bangla.dta", clear
list
drop C-NO //drop all other surveys
drop PR-TU //drop all other surveys
rename (A B NP	NQ	NR	NS	NT	NU	NV	NW	NX	NY	NZ	OA	OB	OC	OD	OE	OF	OG	OH	OI	OJ	OK	OL	OM	ON	OO	OP	OQ	OR	OS	OT	OU	OV	OW	OX	OY	OZ	PA	PB	PC	PD	PE	PF	PG	PH	PI	PJ	PK	PL	PM	PN	PO	PP PQ TV TW TX TY TZ UA UB UC UD ) (ID	lang	eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs	age	preg	live1	live2	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
replace live1 = live2 if live1=="" 
drop live2
rename live1 live
save "Bangla.dta", replace

export excel ID-Network_ID using "Bangla.xls", replace

//Ar
use "Arabic.dta", clear
list
drop C-PQ //drop all other surveys
drop RU-TU //drop all other surveys
rename (A B PR	PS	PT	PU	PV	PW	PX	PY	PZ	QA	QB	QC	QD	QE	QF	QG	QH	QI	QJ	QK	QL	QM	QN	QO	QP	QQ	QR	QS	QT	QU	QV	QW	QX	QY	QZ	RA	RB	RC	RD	RE	RF	RG	RH	RI	RJ	RK	RL	RM	RN	RO	RP	RQ	RR	RS	RT TV TW TX TY TZ UA UB UC UD ) (ID	lang	eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs	age	preg	live1	live2	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	air_quality	choice_1	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
replace live1 = live2 if live1=="" 
drop live2
rename live1 live
save "Arabic.dta", replace

export excel ID-Network_ID using "Arabic.xls", replace

//Zh
use "Chinese.dta", clear
list
drop C-RS //drop all other surveys 
rename (A B RT RU RV RW RX RY RZ SA SB SC SD SE SF SG SH SI SJ SK SL SM SN SO SP SQ SR SS ST SU SV SW SX SY SZ TA TB TC TD TE TF TG TH TI TJ TK TL TM TN TO TP TQ TR TS TT TU TV TW TX TY TZ UA UB UC UD ) (ID lang eligibility	consent1	consent2	consent3	consent4	consent5	consent6	q	age	preg	live1	live2	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments	utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
replace live1 = live2 if live1=="" 
drop live2
rename live1 live
save "Chinese.dta", replace

export excel ID-Network_ID using "Chinese.xls", replace

*/
/*Variables that need to be translated in each excel dataset:

Upload each language to Google Sheets and replace each column with the translated following (retain variable names): 


1. To each language google sheet: add row 2 from https://docs.google.com/spreadsheets/d/12xpkgkrK8uW4EPK97eUPHuba12-AHEfJYGuGmAL3OEE/edit#gid=0

2. add column, and the equation: =GOOGLETRANSLATE(B32, "en", "it") in the following locations

live (add M)
best_a12 (add after Z)
worst_a13 (add after AN)
gen_city_comments (add after AP)
pollution_causes_a9 (add after BA)
gen_air_comments (add after BC)
nicer_comments (add after BF)
mayor_ans1 (add after BH)
mayor_ans2 (add after BJ)
final_comments (add after BD)

3. Copy paste translated column as special>"values only"

3. Delete original columns for each variable above

4. Find and replace "#VALUE!" with ""

5. Delete Row one variable names

5. Download each language sheet and import to Stata

6.Save as .dta for appending:




Notes: 
*For Sheng, had to add a column for "other" for the 'worst things' question - no one had submitted any response for that. 
*For urdu - some people responded in Hindi, I only translated the above variables into Hindi, so in the qual any untranslated script will need to be manually translated. 

*/


import excel "C:\Users\rache\Downloads\English.xlsx", sheet("Sheet1") clear
rename A-BL (ID lang eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs	age	preg	live	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments	utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
save "English.dta", replace

import excel "C:\Users\rache\Downloads\Spanish.xlsx", sheet("Sheet1") clear
rename A-BL (ID lang eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs age	preg	live	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments	utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
save "Spanish.dta", replace

import excel "C:\Users\rache\Downloads\French.xlsx", sheet("Sheet1") clear
rename A-BL (ID lang eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs age	preg	live	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments	utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
save "French.dta", replace

import excel "C:\Users\rache\Downloads\Sheng.xlsx", sheet("Sheet1") clear
rename A-BL (ID lang eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs age	preg	live	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments	utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
save "Sheng.dta", replace

import excel "C:\Users\rache\Downloads\Hindi.xlsx", sheet("Sheet1") clear
rename A-BL (ID lang eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs age	preg	live	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments	utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
save "Hindi.dta", replace

import excel "C:\Users\rache\Downloads\Urdu.xlsx", sheet("Sheet1") clear
rename A-BL (ID lang eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs age	preg	live	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments	utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
save "Urdu.dta", replace

import excel "C:\Users\rache\Downloads\Italian.xlsx", sheet("Sheet1") clear
rename A-BL (ID lang eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs age	preg	live	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments	utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
save "Italian.dta", replace

import excel "C:\Users\rache\Downloads\Bangla.xlsx", sheet("Sheet1") clear
rename A-BL (ID lang eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs 	age	preg	live	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments	utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
save "Bangla.dta", replace

import excel "C:\Users\rache\Downloads\Arabic.xlsx", sheet("Sheet1") clear
rename A-BL (ID lang eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs 	age	preg	live	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments	utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
save "Arabic.dta", replace

import excel "C:\Users\rache\Downloads\Chinese.xlsx", sheet("Sheet1") clear
rename A-BL (ID lang eligibility	consent1	consent2	consent3	consent4	consent5	q	FAQs 	age	preg	live	best_a1	best_a2	best_a3	best_a4	best_a5	best_a6	best_a7	best_a8	best_a9	best_a10	best_a11	best_a12	worst_a1	worst_a2	worst_a3	worst_a4	worst_a5	worst_a6	worst_a7	worst_a8	worst_a9	worst_a10	worst_a11	worst_a12	worst_a13	gen_city_comments	air_quality	pollution_causes_a1	pollution_causes_a2	pollution_causes_a3	pollution_causes_a4	pollution_causes_a5	pollution_causes_a6	pollution_causes_a7	pollution_causes_a8	pollution_causes_a9	gen_air_comments	nicer_city	nicer_comments	mayor_ans1	mayor_ans2	final_comments	utm_source	utm_medium	utm_campaign	utm_term	utm_content	variable_0	Start_Date 	Submit_Date Network_ID)
save "Chinese.dta", replace


//appending and work on full document

/*Check that each is in string formatting
tostring pollution_causes_a9, gen(pollution_causes_a10)
drop pollution_causes_a9
rename pollution_causes_a10 pollution_causes_a9*/

use English.dta, clear
append using Spanish.dta, force
append using French.dta, force
append using Sheng.dta, force
append using Hindi.dta, force
append using Urdu.dta, force
append using Italian.dta, force
append using Bangla.dta, force
append using Arabic.dta, force
append using Chinese.dta, force
drop if ID==""
drop utm_term
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\1_languages appended.dta", replace


**************************************FINAL FILE n=5,158
use "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\1_languages appended.dta", clear
clear


/*Program 2:

-Generalize all demographic variables (age buckets, live: town and region)
-Create eligible/consenting dataset and ineligible/nonconsenting dataset
-Remove all outlier data (age)
*/


cd "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets"

use "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\1_all languages appended.dta", clear


//Consolidating demographics variables

//Pregnant
replace preg="pregnant" if strpos(preg, "Yes")
replace preg="pregnant" if strpos(preg, "sí")
replace preg="pregnant" if strpos(preg, "sì")
replace preg="pregnant" if strpos(preg, "Oui")
replace preg="pregnant" if strpos(preg, "हाँ")
replace preg="pregnant" if strpos(preg, "جی ہاں")
replace preg="pregnant" if strpos(preg, "হ্যাঁ")
replace preg="pregnant" if strpos(preg, "نعم")
replace preg="pregnant" if strpos(preg, "是")
replace preg="not pregnant" if strpos(preg, "No")
replace preg="not pregnant" if strpos(preg, "no")
replace preg="not pregnant" if strpos(preg, "Non")
replace preg="not pregnant" if strpos(preg, "नहीं")
replace preg="not pregnant" if strpos(preg, "نہیں")
replace preg="not pregnant" if strpos(preg, "না")
replace preg="not pregnant" if strpos(preg, "رقم")
replace preg="not pregnant" if strpos(preg, "否")
replace preg="Not asked" if preg==""

tab preg

//Age

tab age

gen age_real = real(age)
drop age
rename age_real age

replace age=37 if age==1984
replace age=34 if age==1987
replace age=33 if age==1988
replace age=31 if age==1990
replace age=27 if age==1994
replace age=25 if age==1996
replace age=21 if age==2000
replace age=20 if age==2001
replace age=18 if age==2003
replace age=18 if age==20003
replace age=16 if age==2005

replace age=. if age>100

gen age_bucket=1 if age<13
replace age_bucket=2 if inrange(age, 13, 16)
replace age_bucket=3 if inrange(age, 17, 19)
replace age_bucket=4 if inrange(age, 20, 25)
replace age_bucket=5 if age>25
replace age_bucket=6 if age==.
label define age_bucket_label 1 "Unknown" 2 "13-16" 3 "17-19" 4 "20-25" 5 "25+" 6 "missing or unrealistic"
label values age_bucket age_bucket_label

tab age_bucket


//Prepping location imputation variable 
gen utm_campaign_location="Bhubaneswar" if strpos(utm_campaign, "Bhubaneswar")
replace utm_campaign_location="Dar es Salaam" if strpos(utm_campaign, "Dar es Salaam")
replace utm_campaign_location="Dhaka" if strpos(utm_campaign, "Dhaka")
replace utm_campaign_location="Freetown" if strpos(utm_campaign, "Freetown")
replace utm_campaign_location="Glasgow" if strpos(utm_campaign, "Glasgow")
replace utm_campaign_location="Harare" if strpos(utm_campaign, "Harare")
replace utm_campaign_location="Jaipur" if strpos(utm_campaign, "Jaipur")
replace utm_campaign_location="Lahore" if strpos(utm_campaign, "Lahore")
replace utm_campaign_location="London" if strpos(utm_campaign, "London")
replace utm_campaign_location="Los Angeles" if strpos(utm_campaign, "Los Angeles")
replace utm_campaign_location="Mexico City" if strpos(utm_campaign, "Mexico City")
replace utm_campaign_location="Milan" if strpos(utm_campaign, "Milan")
replace utm_campaign_location="Nairobi" if strpos(utm_campaign, "Nairobi")
replace utm_campaign_location="Quezon City" if strpos(utm_campaign, "Quezon")
replace utm_campaign_location="Quito" if strpos(utm_campaign, "Quito")
replace utm_campaign_location="Tamale" if strpos(utm_campaign, "Tamale")
replace utm_campaign_location="missing" if utm_campaign==""
replace utm_campaign_location="missing" if utm_campaign=="xxxxx"
replace utm_campaign_location="CCC Youth Survey - Test Campaign V2" if utm_campaign=="CCC Youth Survey - Test Campaign V2"
tab utm_campaign_location //identifies all organic and non-campaign reponses n=341

//Live
generate town=utm_campaign_location
replace town="Alwar" if strpos(live, "Alwar")
replace town="Atrai" if strpos(live, "atrai")
replace town="Antipolo" if strpos(live, "Antipolo")
replace town="Bahawalnagar" if strpos(live, "Bahawal")
replace town="Baripada" if strpos(live, "Baripada")
replace town="Bomet" if strpos(live, "Bomet")
replace town="Bhubaneswar" if strpos(live, "Bubaneswar") 
replace town="Busia" if strpos(live, "Busia")
replace town="Caloocan" if strpos(live, "Caloocan")
replace town="Chitral" if strpos(live, "Chitral")
replace town="Chitungwiza" if strpos(live, "Chitungwiza")
replace town="Churu" if strpos(live, "Churu") 
replace town="Cobham" if strpos(live, "Cobham")
replace town="Cover" if strpos(live, "Cover")
replace town="Dhaka" if strpos(live, "haka") | strpos(live, "KAKA") 
replace town="Dar es Salaam" if strpos(live, "Dar es Salaam")
replace town="Dhani Kalera" if strpos(live, "Dhani Kalera")
replace town="Echague" if strpos(live, "Echague")
replace town="Entebbe" if strpos(live, "Entebbe")
replace town="Firozabad" if strpos(live, "Firo")
replace town="Freetown" if strpos(live, "Freetown")
replace town="Glasgow" if strpos(live, "Glasgow")
replace town="Groningen" if strpos(live, "Groningen")
replace town="Harare" if strpos(live, "arare") | strpos(live, "ARARE")
replace town="Haroonabad" if strpos(live, "Haroonabad")
replace town="Jaipur" if strpos(live, "Jaipur")
replace town="Jhang" if strpos(live, "Jhang")
replace town="Kajiado" if strpos(live, "Kajiado")
replace town="Khulna" if strpos(live, "Khulna")
replace town="Kuchaman" if strpos(live, "Kuchaman")
replace town="Lahore" if strpos(live, "Lahore") |strpos(live, "LAHORE")
replace town="London" if strpos(live, "London") 
replace town="Los Angeles" if strpos(live, "Los Angeles")
replace town="Manila" if strpos(live, "anila")
replace town="Mardan" if strpos(live, "Mardan")
replace town="Muranga" if strpos(live, "Muranga")
replace town="Marikina" if strpos(live, "Marikina")
replace town="Masvingo" if strpos(live, "Masvingo")
replace town="Mexico City" if strpos(live, "CDMX") |strpos(live, "CIUDAD DE MEXICO") | strpos(live, "City of Mexico")| strpos(live, "Mexico City")| strpos(live, "CD Mexico")
replace town="Milan" if strpos(live, "Milan")
replace town="Monza" if strpos(live, "Monza")
replace town="Mymensingh" if strpos(live, "Mymensingh")
replace town="Nairobi" if strpos(live, "airobi")
replace town="Narayanganj" if strpos(live, "Narayanganj")
replace town="Narowal" if strpos(live, "Narowal")
replace town="New York" if strpos(live, "New York")
replace town="Okara" if strpos(live, "Okara")
replace town="Quezon City" if strpos(live, "Quezon")| strpos(live, "Queson") | strpos(live, "QUEZON")
replace town="Quito" if strpos(live, "Quito")
replace town="Rourkela" if strpos(live, "rourkela")
replace town="Sargodha" if strpos(live, "SARGODHA")
replace town="San Jose Del Monte" if strpos(live, "San Jose Del Monte")
replace town="San Juan" if strpos(live, "San Juan")
replace town="Sialkot" if strpos(live, "Sialkot")
replace town="Sirsa" if strpos(live, "Sirsa")
replace town="Tamale" if strpos(live, "amal")
replace town="Tamale" if strpos(live, "AMAL") 
replace town="Valenzuela" if strpos(live, "Valenzuela")
replace town="Victoria Falls" if strpos(live, "Vic Falls")
replace town="Victoria Falls" if strpos(live, "Victoria falls")
replace town="Yeji" if strpos(live, "Yeji")
//True indication of where respondents are from  missing only 236




//City
sort town
quietly by town:  gen dup = cond(_N==1,0,_n)
generate city=utm_campaign_location if dup==0
replace city=town if city=="missing" | city=="" | strpos(city, "CCC Youth")
replace city="Nairobi" if city=="Bomet" | city=="Busia" | city=="Kajiado"
replace city="Jaipur" if city=="Churu"
replace city="Dhaka" if city=="Narayanganj"
replace city="Lahore" if city=="Okara" | city=="Sialkot"
replace city="Quezon City" if city=="San Jose Del Monte" |city=="Manila"
replace city="Harare" if city=="Victoria Falls"


//Region
generate region=""
replace region="India" if city=="Bhubaneswar" | city=="Jaipur"
replace region="Tanzania" if city=="Dar es Salaam"
replace region="Bangladesh" if city=="Dhaka" 
replace region="Sierra Leone" if city=="Freetown" | live=="SIERRA Leone"
replace region="United Kingdom" if city=="Glasgow" | city=="London"
replace region="Zimbabwe" if city=="Harare" | live=="Zimbabwe"
replace region="Pakistan" if city=="Lahore" 
replace region="United States of America" if city=="Los Angeles" 
replace region="Mexico" if city=="Mexico City" | live=="Mexico"
replace region="Italy" if city=="Milan"
replace region="Kenya" if city=="Nairobi"
replace region="Philippines" if city=="Quezon City" | live=="Philippines"
replace region="Ecuador" if city=="Quito"
replace region="Ghana" if city=="Tamale"
replace region="missing" if city=="missing" & region=="" | strpos(city, "CCC Youth")

//For those who did not report a town and utm_campaign was not available, but DID report a region
replace city="Mexico City" if town=="missing" & region=="Mexico"
replace city="Quezon City" if town=="missing" & region=="Philippines"
replace city="Freetown" if town=="missing" & region=="Sierra Leone"
replace city="Harare" if town=="missing" & region=="Zimbabwe"

//Geoscheme Sub regions
generate geoscheme_subregion=""
replace geoscheme_subregion="Eastern Africa" if region=="Zimbabwe" | region=="Kenya" | region=="Tanzania"
replace geoscheme_subregion="Western Africa" if region=="Ghana" | region=="Sierra Leone" 
replace geoscheme_subregion="North America" if region=="United States of America"
replace geoscheme_subregion="Latin America and the Caribbean" if region=="Ecuador" | region=="Mexico"
replace geoscheme_subregion="Western Europe" if region=="United Kingdom" | region=="Italy"
replace geoscheme_subregion="Southeastern Asia" if region=="Philippines" 
replace geoscheme_subregion="Southern Asia" if region=="Bangladesh" | region=="Pakistan" | region=="India"

generate geoscheme_region=""
replace geoscheme_region="Afria" if geoscheme_subregion=="Eastern Africa" | geoscheme_subregion=="Western Africa"
replace geoscheme_region="Americas" if geoscheme_subregion=="North America" | geoscheme_subregion=="Latin America and the Caribbean" 
replace geoscheme_region="Asia" if geoscheme_subregion=="Southeastern Asia" | geoscheme_subregion=="Southern Asia" 
replace geoscheme_region="Europe" if geoscheme_subregion=="Western Europe"

//OECD 
generate OECD=""
replace OECD="LMIC" if region=="Bangladesh" | region=="Ecuador" | region=="Ghana" | region=="India" | region=="Kenya" | region=="Mexico" | region=="Pakistan" | region=="Philippines" | region=="Sierra Leone" | region=="Tanzania" | region=="Zimbabwe"

replace OECD="HIC" if region=="Italy" | region=="United Kingdom" | region=="United States of America"

/*
drop dup
by city, sort : generate dup = _n == 1
keep if dup==1
keep OECD city region pm25 pm25_quartile 
tab OECD
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\City & country indicators.dta", replace

*/
*******Generating incomplete response datasets


//Mistake
generate mistake=1 if eligibility=="A parent of someone under 13 years old" & age>25 & preg=="not pregnant" 


//Eligibility
replace eligibility="Aged between 13-25 years old" if eligibility=="Tiene entre 13 y 25 años"
replace eligibility="Aged between 13-25 years old" if eligibility=="Gé de 13 à 25 ans"
replace eligibility="Aged between 13-25 years old" if eligibility=="Niko na miaka kati ya 13-25"
replace eligibility="Aged between 13-25 years old" if eligibility=="उम्र 13-25 साल के बीच"
replace eligibility="Aged between 13-25 years old" if eligibility=="میری عمر 13 سے 15 سال کے درمیان ہے۔"
replace eligibility="Aged between 13-25 years old" if eligibility=="Hai un'età compresa tra 13 e 25 anni"
replace eligibility="Aged between 13-25 years old" if eligibility=="13-25 বছর বয়সী"
replace eligibility="Aged between 13-25 years old" if eligibility=="تتراوح أعمارهم بين 13-25 سنة"
replace eligibility="Aged between 13-25 years old" if eligibility=="年龄在 13-25 岁之间"

replace eligibility="A parent of someone under 13 years old" if eligibility=="Es un padre de alguien menor de 13 años"
replace eligibility="A parent of someone under 13 years old" if eligibility=="Un parent d'une personne de moins de 13 ans"
replace eligibility="A parent of someone under 13 years old" if eligibility=="Mimi ni mzazi wa mtoi ako chini ya miaka 13"
replace eligibility="A parent of someone under 13 years old" if eligibility=="13 वर्ष से कम आयु के व्यक्ति के माता-पिता"
replace eligibility="A parent of someone under 13 years old" if eligibility=="میں 13 سال سے کم عمر کے بچے کا والدین ہوں۔"
replace eligibility="A parent of someone under 13 years old" if eligibility=="Sei genitore di un bambino di età inferiore ai 13 anni" 
replace eligibility="A parent of someone under 13 years old" if eligibility=="13 বছরের কম বয়সী কারো বাবা -মা, অথবা"
replace eligibility="A parent of someone under 13 years old" if eligibility=="والد لشخص أقل من 13 عامًا"
replace eligibility="A parent of someone under 13 years old" if eligibility=="孩子未满13岁"

replace eligibility="Aged over 18, you or your partner are expecting a baby" if eligibility=="Tiene más de 18 años, usted o su pareja están esperando un bebé"
replace eligibility="Aged over 18, you or your partner are expecting a baby" if eligibility=="Gé de plus de 18 ans, vous ou votre partenaire attendez un bébé"
replace eligibility="Aged over 18, you or your partner are expecting a baby" if eligibility=="Niko over 18 na mimi ama partner wangu niko/ako pregnant"
replace eligibility="Aged over 18, you or your partner are expecting a baby" if eligibility=="18 वर्ष से अधिक और आप या आपका साथी एक बच्चे की उम्मीद कर रहे हैं"
replace eligibility="Aged over 18, you or your partner are expecting a baby" if eligibility=="میری عمر 18 سال سے زیادہ ہے اور میں ، یا میرا ساتھی بچے کی توقع کر رہا ہے۔"
replace eligibility="Aged over 18, you or your partner are expecting a baby" if eligibility=="Hai più di 18 anni e tu e il tuo partner aspettate un bambino"
replace eligibility="Aged over 18, you or your partner are expecting a baby" if eligibility=="18 বছর বয়সে, আপনি বা আপনার সঙ্গী একটি শিশুর প্রত্যাশা করছেন"
replace eligibility="Aged over 18, you or your partner are expecting a baby" if eligibility=="إذا كان عمرك أكثر من 18 عامًا وكنت أنت أو شريكك في انتظار مولود"
replace eligibility="Aged over 18, you or your partner are expecting a baby" if eligibility=="18岁以上的孕妇（或伴侣）"

replace eligibility="None of these describe me" if eligibility=="Ninguno de estos me describe"
replace eligibility="None of these describe me" if eligibility=="Aucun de ceux-ci ne me ressemble"
replace eligibility="None of these describe me" if eligibility=="Hakuna kati ya hizi inani-represent"
replace eligibility="None of these describe me" if eligibility=="इनमें से कोई भी मेरा वर्णन नहीं करता"
replace eligibility="None of these describe me" if eligibility=="ان میں سے کوئی بھی اختیار مجھے بیان نہیں کرتا۔"
replace eligibility="None of these describe me" if eligibility=="Nessuna delle opzioni precedenti"
replace eligibility="None of these describe me" if eligibility=="এই বিকল্পগুলির কোনটিই আমাকে বর্ণনা করে না"
replace eligibility="None of these describe me" if eligibility=="لا شيء من هؤلاء يصفني"
replace eligibility="None of these describe me" if eligibility=="以上都不符合"
replace eligibility="missing" if eligibility==""

generate eligible="eligible" if eligibility=="Aged between 13-25 years old"
replace eligible="eligible" if eligibility=="Aged over 18, you or your partner are expecting a baby"
replace eligible="eligible" if eligibility=="A parent of someone under 13 years old"
replace eligible="not eligible" if eligibility=="None of these describe me"
replace eligible="missing" if eligibility=="missing"
replace eligible="not eligible" if age>25 & preg=="not pregnant"

gen nonconsent="missing" if consent1=="" & eligible=="eligible"
replace nonconsent="missing" if consent2=="" & eligible=="eligible"
replace nonconsent="missing" if consent3=="" & eligible=="eligible"
replace nonconsent="missing" if consent4=="" & eligible=="eligible"
replace nonconsent="missing" if consent5=="" & eligible=="eligible"

tab eligibility eligible

gen target_population="young person" if eligibility=="Aged between 13-25 years old"
replace target_population="Parent or expecting" if eligibility=="A parent of someone under 13 years old" | eligibility=="Aged over 18, you or your partner are expecting a baby"

//Nonconsent

replace nonconsent="C1_participation is voluntary" if consent1=="No, I don't want to complete the survey" 
replace nonconsent="C1_participation is voluntary" if consent1=="No, no quiero completar la encuesta" 
replace nonconsent="C1_participation is voluntary" if consent1=="Non, je ne veux pas répondre au sondage" 
replace nonconsent="C1_participation is voluntary" if consent1=="Hapana, sitaki kumaliza survey" 
replace nonconsent="C1_participation is voluntary" if consent1=="नहीं, मैं सर्वेक्षण पूरा नहीं करना चाहता" 
replace nonconsent="C1_participation is voluntary" if consent1=="نہیں ، میں سروے مکمل نہیں کرنا چاہتا۔" 
replace nonconsent="C1_participation is voluntary" if consent1=="No, non voglio completare il sondaggio" 
replace nonconsent="C1_participation is voluntary" if consent1=="না, আমি জরিপ সম্পূর্ণ করতে চাই না" 
replace nonconsent="C1_participation is voluntary" if consent1=="لا ، لا أرغب في إكمال الاستبيان" 
replace nonconsent="C1_participation is voluntary" if consent1=="不，我不想参加调查" 

replace nonconsent="C2_your answers are anonymous" if consent2=="No, I don't want to complete the survey" 
replace nonconsent="C2_your answers are anonymous" if consent2=="No, no quiero completar la encuesta" 
replace nonconsent="C2_your answers are anonymous" if consent2=="Non, je ne veux pas répondre au sondage" 
replace nonconsent="C2_your answers are anonymous" if consent2=="Hapana, sitaki kumaliza survey" 
replace nonconsent="C2_your answers are anonymous" if consent2=="नहीं, मैं सर्वेक्षण पूरा नहीं करना चाहता" 
replace nonconsent="C2_your answers are anonymous" if consent2=="نہیں ، میں سروے مکمل نہیں کرنا چاہتا۔" 
replace nonconsent="C2_your answers are anonymous" if consent2=="No, non voglio completare il sondaggio" 
replace nonconsent="C2_your answers are anonymous" if consent2=="না, আমি জরিপ সম্পূর্ণ করতে চাই না" 
replace nonconsent="C2_your answers are anonymous" if consent2=="لا ، لا أرغب في إكمال الاستبيان" 
replace nonconsent="C2_your answers are anonymous" if consent2=="不，我不想参加调查" 

replace nonconsent="C3_we will write research papers etc." if consent3=="No, I don't want to complete the survey" 
replace nonconsent="C3_we will write research papers etc." if consent3=="No, no quiero completar la encuesta" 
replace nonconsent="C3_we will write research papers etc." if consent3=="Non, je ne veux pas répondre au sondage" 
replace nonconsent="C3_we will write research papers etc." if consent3=="Hapana, sitaki kumaliza survey" 
replace nonconsent="C3_we will write research papers etc." if consent3=="नहीं, मैं सर्वेक्षण पूरा नहीं करना चाहता" 
replace nonconsent="C3_we will write research papers etc." if consent3=="نہیں ، میں سروے مکمل نہیں کرنا چاہتا۔" 
replace nonconsent="C3_we will write research papers etc." if consent3=="No, non voglio completare il sondaggio" 
replace nonconsent="C3_we will write research papers etc." if consent3=="না, আমি জরিপ সম্পূর্ণ করতে চাই না" 
replace nonconsent="C3_we will write research papers etc." if consent3=="لا ، لا أرغب في إكمال الاستبيان" 
replace nonconsent="C3_we will write research papers etc." if consent3=="不，我不想参加调查" 

replace nonconsent="C4_your data is secure" if consent4=="No, I don't want to complete the survey" 
replace nonconsent="C4_your data is secure" if consent4=="No, no quiero completar la encuesta" 
replace nonconsent="C4_your data is secure" if consent4=="Non, je ne veux pas répondre au sondage" 
replace nonconsent="C4_your data is secure" if consent4=="Hapana, sitaki kumaliza survey" 
replace nonconsent="C4_your data is secure" if consent4=="नहीं, मैं सर्वेक्षण पूरा नहीं करना चाहता" 
replace nonconsent="C4_your data is secure" if consent4=="نہیں ، میں سروے مکمل نہیں کرنا چاہتا۔" 
replace nonconsent="C4_your data is secure" if consent4=="No, non voglio completare il sondaggio" 
replace nonconsent="C4_your data is secure" if consent4=="না, আমি জরিপ সম্পূর্ণ করতে চাই না" 
replace nonconsent="C4_your data is secure" if consent4=="لا ، لا أرغب في إكمال الاستبيان" 
replace nonconsent="C4_your data is secure" if consent4=="不，我不想参加调查" 

replace nonconsent="C5_no prize" if consent5=="No, I don't want to complete the survey" 
replace nonconsent="C5_no prize" if consent5=="No, no quiero completar la encuesta" 
replace nonconsent="C5_no prize" if consent5=="Non, je ne veux pas répondre au sondage" 
replace nonconsent="C5_no prize" if consent5=="Hapana, sitaki kumaliza survey" 
replace nonconsent="C5_no prize" if consent5=="नहीं, मैं सर्वेक्षण पूरा नहीं करना चाहता" 
replace nonconsent="C5_no prize" if consent5=="نہیں ، میں سروے مکمل نہیں کرنا چاہتا۔" 
replace nonconsent="C5_no prize" if consent5=="No, non voglio completare il sondaggio" 
replace nonconsent="C5_no prize" if consent5=="না, আমি জরিপ সম্পূর্ণ করতে চাই না" 
replace nonconsent="C5_no prize" if consent5=="لا ، لا أرغب في إكمال الاستبيان" 
replace nonconsent="C5_no prize" if consent5=="不，我不想参加调查" 

replace nonconsent="consented" if nonconsent==""

generate consent="nonconsent" if nonconsent=="C1_participation is voluntary" | nonconsent=="C2_your answers are anonymous" | nonconsent=="C3_we will write research papers etc." | nonconsent=="C4_your data is secure" | nonconsent=="C5_no prize" | nonconsent=="C5_no prize"
replace consent="consented" if nonconsent=="consented"
replace consent="missing" if nonconsent=="missing"

tab nonconsent consent

//Time to completion 

//remove everything up to and including the last space (greedy match)gen var1 = regexr(Submit_Date,".+ ","")

gen submit_time=clock(Submit_Date, "YMD hms")
format submit_time %tc

gen start_time=clock(Start_Date, "YMD hms")
format start_time %tc

gen completion_time_seconds = seconds(submit_time - start_time)
gen completion_time_minutes = minutes(submit_time - start_time)

list Start_Date Submit_Date if completion_time_seconds==0
//hand calculate the time of completion
gen submit_hms=regexr(Submit_Date,".+ ","") if completion_time_minutes==0
gen start_hms=regexr(Start_Date,".+ ","") if completion_time_minutes==0

gen submit_hms_time=clock(submit_hms, "hms")
gen start_hms_time=clock(start_hms, "hms")

format submit_hms_time %tc
format start_hms_time %tc

gen completion_time_seconds_missing = seconds(submit_hms_time-start_hms_time)

gen completion_time_minutes_missing = minutes(submit_hms_time-start_hms_time)

replace completion_time_seconds=completion_time_seconds_missing if completion_time_seconds==0

replace completion_time_minutes=completion_time_minutes_missing if completion_time_minutes==0

drop completion_time_minutes_missing 

save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\2_complete dataset.dta", replace

********************Making Final Dataset 
use "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\2_complete dataset.dta", clear

gen submit_date = dofc(submit_time)
drop if submit_date<22509
drop submit_date
//n=32 (24 of these were utm_campaign=="CCC Youth Survey - Test Campaign V2", 2 were "xxxxx", 68 were pretest responses)


drop if eligibility=="A parent of someone under 13 years old" & age>25 & preg=="not pregnant" 
//n=523

drop if eligible=="missing" 
//n=79

drop if eligible=="not eligible" 
//n=2,829

drop if consent=="missing"
//n= 45



drop if consent=="nonconsent"
//n=1,698

drop if FAQs=="Yes - get in touch" | FAQs=="Sí, ponme en contacto" | FAQs=="Oui - contactez-nous" | FAQs=="Ndio - Wasiliana na sisi" | FAQs=="हाँ - संपर्क करें" | FAQs=="ہاں - رابطہ کریں۔"| FAQs== "Sì, contattaci" | FAQs=="হ্যাঁ - যোগাযোগ করুন" | FAQs=="نعم - تواصل معنا" | FAQs=="是-联系我们"
//n=560



save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\3_complete eligible consenting dataset.dta", replace


************** Must save iteratively to create a dataset for each: ineligible, nonconsent, and complete eligible and consenting.

//Test campaign
use "2_complete dataset.dta", clear 
keep if utm_campaign=="CCC Youth Survey - Test Campaign V2"
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Missing data\responses from test campaign.dta", replace


//Not eligible
use "2_complete dataset.dta", clear 
keep if eligible=="not eligible" | eligible=="missing"
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Missing data\ineligible dataset.dta", replace

//if nonconsent=="consented" they DID consent!  
use "2_complete dataset.dta", clear 
drop if nonconsent=="consented"
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Missing data\eligible nonconsent dataset.dta", replace

//FAQs kicked them out
use "2_complete dataset.dta", clear 
keep if FAQs=="Yes - get in touch" | FAQs=="Sí, ponme en contacto" | FAQs=="Oui - contactez-nous" | FAQs=="Ndio - Wasiliana na sisi" | FAQs=="हाँ - संपर्क करें" | FAQs=="ہاں - رابطہ کریں۔"| FAQs== "Sì, contattaci" | FAQs=="হ্যাঁ - যোগাযোগ করুন" | FAQs=="نعم - تواصل معنا" | FAQs=="是-联系我们"
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Missing data\FAQs noncomplete.dta", replace

//>25 not pregnant kicked them out
use "2_complete dataset.dta", clear 
keep if age>25 & preg=="not pregnant" 
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Missing data\25+ not pregnant", replace

*************Cleaning survey questions
use "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\3_complete eligible consenting dataset.dta", clear

//air_quality
gen air_quality_real=real(air_quality)


//best
gen best_a1_vote="no vote" if best_a1==""
gen best_a2_vote="no vote" if best_a2==""
gen best_a3_vote="no vote" if best_a3==""
gen best_a4_vote="no vote" if best_a4==""
gen best_a5_vote="no vote" if best_a5==""
gen best_a6_vote="no vote" if best_a6==""
gen best_a7_vote="no vote" if best_a7==""
gen best_a8_vote="no vote" if best_a8==""
gen best_a9_vote="no vote" if best_a9==""
gen best_a10_vote="no vote" if best_a10==""
gen best_a11_vote="no vote" if best_a11==""
gen best_a12_vote="no vote" if best_a12==""

replace best_a1_vote="vote" if best_a1_vote==""
replace best_a2_vote="vote" if best_a2_vote==""
replace best_a3_vote="vote" if best_a3_vote==""
replace best_a4_vote="vote" if best_a4_vote==""
replace best_a5_vote="vote" if best_a5_vote==""
replace best_a6_vote="vote" if best_a6_vote==""
replace best_a7_vote="vote" if best_a7_vote==""
replace best_a8_vote="vote" if best_a8_vote==""
replace best_a9_vote="vote" if best_a9_vote==""
replace best_a10_vote="vote" if best_a10_vote==""
replace best_a11_vote="vote" if best_a11_vote==""
replace best_a12_vote="vote" if best_a12_vote==""

gen best_a1_count=1 if best_a1_vote=="vote"
gen best_a2_count=2 if best_a2_vote=="vote"
gen best_a3_count=3 if best_a3_vote=="vote"
gen best_a4_count=4 if best_a4_vote=="vote"
gen best_a5_count=5 if best_a5_vote=="vote"
gen best_a6_count=6 if best_a6_vote=="vote"
gen best_a7_count=7 if best_a7_vote=="vote"
gen best_a8_count=8 if best_a8_vote=="vote"
gen best_a9_count=9 if best_a9_vote=="vote"
gen best_a10_count=10 if best_a10_vote=="vote"
gen best_a11_count=11 if best_a11_vote=="vote"
gen best_a12_count=12 if best_a12_vote=="vote"

label variable best_a1_count "The people"
label variable best_a2_count "Places to play"
label variable best_a3_count "Being close to school or work"
label variable best_a4_count "There are many things to do"
label variable best_a5_count "The shops and restaurants"
label variable best_a6_count "It's easy to get around"
label variable best_a7_count "Healthcare"
label variable best_a8_count "Being close to my family"
label variable best_a9_count "The climate and environment"
label variable best_a10_count "Work opportunities for my family"
label variable best_a11_count "Access to green space, like parks"
label variable best_a12_count "Other"

label variable best_a1_vote "The people"
label variable best_a2_vote "Places to play"
label variable best_a3_vote "Being close to school or work"
label variable best_a4_vote "There are many things to do"
label variable best_a5_vote "The shops and restaurants"
label variable best_a6_vote "It's easy to get around"
label variable best_a7_vote "Healthcare"
label variable best_a8_vote "Being close to my family"
label variable best_a9_vote "The climate and environment"
label variable best_a10_vote "Work opportunities for my family"
label variable best_a11_vote "Access to green space, like parks"
label variable best_a12_vote "Other"



//Worst

gen worst_a1_vote="no vote" if worst_a1==""
gen worst_a2_vote="no vote" if worst_a2==""
gen worst_a3_vote="no vote" if worst_a3==""
gen worst_a4_vote="no vote" if worst_a4==""
gen worst_a5_vote="no vote" if worst_a5==""
gen worst_a6_vote="no vote" if worst_a6==""
gen worst_a7_vote="no vote" if worst_a7==""
gen worst_a8_vote="no vote" if worst_a8==""
gen worst_a9_vote="no vote" if worst_a9==""
gen worst_a10_vote="no vote" if worst_a10==""
gen worst_a11_vote="no vote" if worst_a11==""
gen worst_a12_vote="no vote" if worst_a12==""
gen worst_a13_vote="no vote" if worst_a13==""

replace worst_a1_vote="vote" if worst_a1_vote==""
replace worst_a2_vote="vote" if worst_a2_vote==""
replace worst_a3_vote="vote" if worst_a3_vote==""
replace worst_a4_vote="vote" if worst_a4_vote==""
replace worst_a5_vote="vote" if worst_a5_vote==""
replace worst_a6_vote="vote" if worst_a6_vote==""
replace worst_a7_vote="vote" if worst_a7_vote==""
replace worst_a8_vote="vote" if worst_a8_vote==""
replace worst_a9_vote="vote" if worst_a9_vote==""
replace worst_a10_vote="vote" if worst_a10_vote==""
replace worst_a11_vote="vote" if worst_a11_vote==""
replace worst_a12_vote="vote" if worst_a12_vote==""
replace worst_a13_vote="vote" if worst_a13_vote==""

gen worst_a1_count=1 if worst_a1_vote=="vote"
gen worst_a2_count=2 if worst_a2_vote=="vote"
gen worst_a3_count=3 if worst_a3_vote=="vote"
gen worst_a4_count=4 if worst_a4_vote=="vote"
gen worst_a5_count=5 if worst_a5_vote=="vote"
gen worst_a6_count=6 if worst_a6_vote=="vote"
gen worst_a7_count=7 if worst_a7_vote=="vote"
gen worst_a8_count=8 if worst_a8_vote=="vote"
gen worst_a9_count=9 if worst_a9_vote=="vote"
gen worst_a10_count=10 if worst_a10_vote=="vote"
gen worst_a11_count=11 if worst_a11_vote=="vote"
gen worst_a12_count=12 if worst_a12_vote=="vote"
gen worst_a13_count=13 if worst_a13_vote=="vote"

label variable worst_a1_vote "It doesn't always feel safe"
label variable worst_a2_vote "The people; it's not friendly"
label variable worst_a3_vote "The traffic/congestion"
label variable worst_a4_vote "The noise"
label variable worst_a5_vote "Not enough places to play"
label variable worst_a6_vote "Not enough places to meet friends"
label variable worst_a7_vote "Shortage of work opportunities for my family"
label variable worst_a8_vote "It's hard to get around"
label variable worst_a9_vote "Not easy enough to get healthcare"
label variable worst_a10_vote "The pollution"
label variable worst_a11_vote "It's too crowded"
label variable worst_a12_vote "Not enough green space like parks"
label variable worst_a13_vote "Other"

label variable worst_a1_count "It doesn't always feel safe"
label variable worst_a2_count "The people; it's not friendly"
label variable worst_a3_count "The traffic/congestion"
label variable worst_a4_count "The noise"
label variable worst_a5_count "Not enough places to play"
label variable worst_a6_count "Not enough places to meet friends"
label variable worst_a7_count "Shortage of work opportunities for my family"
label variable worst_a8_count "It's hard to get around"
label variable worst_a9_count "Not easy enough to get healthcare"
label variable worst_a10_count "The pollution"
label variable worst_a11_count "It's too crowded"
label variable worst_a12_count "Not enough green space like parks"
label variable worst_a13_count "Other"



//Pollution

gen pollution_causes_a1_vote="no vote" if pollution_causes_a1==""
gen pollution_causes_a2_vote="no vote" if pollution_causes_a2==""
gen pollution_causes_a3_vote="no vote" if pollution_causes_a3==""
gen pollution_causes_a4_vote="no vote" if pollution_causes_a4==""
gen pollution_causes_a5_vote="no vote" if pollution_causes_a5==""
gen pollution_causes_a6_vote="no vote" if pollution_causes_a6==""
gen pollution_causes_a7_vote="no vote" if pollution_causes_a7==""
gen pollution_causes_a8_vote="no vote" if pollution_causes_a8==""
gen pollution_causes_a9_vote="no vote" if pollution_causes_a9==""

replace pollution_causes_a1_vote="vote" if pollution_causes_a1_vote==""
replace pollution_causes_a2_vote="vote" if pollution_causes_a2_vote==""
replace pollution_causes_a3_vote="vote" if pollution_causes_a3_vote==""
replace pollution_causes_a4_vote="vote" if pollution_causes_a4_vote==""
replace pollution_causes_a5_vote="vote" if pollution_causes_a5_vote==""
replace pollution_causes_a6_vote="vote" if pollution_causes_a6_vote==""
replace pollution_causes_a7_vote="vote" if pollution_causes_a7_vote==""
replace pollution_causes_a8_vote="vote" if pollution_causes_a8_vote==""
replace pollution_causes_a9_vote="vote" if pollution_causes_a9_vote==""

gen pollution_causes_a1_count=1 if pollution_causes_a1_vote=="vote"
gen pollution_causes_a2_count=2 if pollution_causes_a2_vote=="vote"
gen pollution_causes_a3_count=3 if pollution_causes_a3_vote=="vote"
gen pollution_causes_a4_count=4 if pollution_causes_a4_vote=="vote"
gen pollution_causes_a5_count=5 if pollution_causes_a5_vote=="vote"
gen pollution_causes_a6_count=6 if pollution_causes_a6_vote=="vote"
gen pollution_causes_a7_count=7 if pollution_causes_a7_vote=="vote"
gen pollution_causes_a8_count=8 if pollution_causes_a8_vote=="vote"
gen pollution_causes_a9_count=9 if pollution_causes_a9_vote=="vote"


label variable pollution_causes_a1_vote "Factories"
label variable pollution_causes_a2_vote "Burning of rubbish"
label variable pollution_causes_a3_vote "Motor transport (cars, buses, lorries)"
label variable pollution_causes_a4_vote "Construction/building work"
label variable pollution_causes_a5_vote "Pollution blown into the city (from outside)"
label variable pollution_causes_a6_vote "Household cooking (cooking fires/stoves)"
label variable pollution_causes_a7_vote "Household heating (boilers, wood fires etc)"
label variable pollution_causes_a8_vote "Agriculture/farming"
label variable pollution_causes_a9_vote "Other"

label variable pollution_causes_a1_count "Factories"
label variable pollution_causes_a2_count "Burning of rubbish"
label variable pollution_causes_a3_count "Motor transport (cars, buses, lorries)"
label variable pollution_causes_a4_count "Construction/building work"
label variable pollution_causes_a5_count "Pollution blown into the city (from outside)"
label variable pollution_causes_a6_count "Household cooking (cooking fires/stoves)"
label variable pollution_causes_a7_count "Household heating (boilers, wood fires etc)"
label variable pollution_causes_a8_count "Agriculture/farming"
label variable pollution_causes_a9_count "Other"


//Nicer city

gen nicer_city_count=1 if nicer_city=="Becoming a nicer place to live"| nicer_city=="Convertirse en un mejor lugar para vivir" | nicer_city=="Deviens un endroit plus agréable à vivre"| nicer_city=="Inakuwa place poa ya kuishi" | nicer_city=="रहने के लिए एक अच्छी जगह बनना"| nicer_city=="رہنے کے لئے اچھی جگہ بن چکا ہے۔" | nicer_city=="Deviens un endroit plus agréable à vivre"| nicer_city=="বসবাসের জন্য একটি সুন্দর জায়গা হয়ে উঠছে"| nicer_city=="أن تصبح مكانًا أجمل للعيش فيه"| nicer_city=="成为更宜居的地方"

replace nicer_city_count=2 if nicer_city=="Staying the same" | nicer_city=="Permanecer igual" | nicer_city=="Rester le même"| nicer_city=="Imebaki the same" | nicer_city=="वही रहना"| nicer_city=="ابھی تک ویسا ہی ہے۔" | nicer_city=="Deviens un endroit plus agréable à vivre"| nicer_city=="একই থাকছে"| nicer_city=="البقاء على نفس"| nicer_city=="保持不变"

replace nicer_city_count=3 if nicer_city=="Becoming a worse place to live" | nicer_city=="Convertirse en un peor lugar para vivir" | nicer_city=="Deviens un pire endroit où vivre"| nicer_city=="Inakuwa place mbaya ya kuishi" | nicer_city=="रहने के लिए एक बदतर जगह बनना"| nicer_city=="پہلے کے مقابلے میں اور ہی بدتر جگہ ہو چکا ہے۔" | nicer_city=="Deviens un endroit plus agréable à vivre"| nicer_city=="বসবাসের জন্য একটি খারাপ জায়গা হয়ে উঠছে"| nicer_city=="أن تصبح مكانًا أسوأ للعيش فيه"| nicer_city=="变得更糟糕的居住地"

label define nicer_city_label 1 "Becoming a nicer place to live" 2 "Staying the same" 3 "Becoming a worse place to live"
label values nicer_city_count nicer_city_label


save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\4_Full cleaned dataset", replace
clear




//Creating quartiles of pm2.5 within WHO dataset 
import excel "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\External pm2.5 Datasets\WHO_2018_v14_aap_air_quality_database.xlsx", sheet("latest availble PM25 (measured)") firstrow clear

rename PM25Annualmeanugm3 city_pm25
rename town city

keep city region city_pm25
replace city="Milan" if city=="Milano"

xtile city_pm25_quartile = city_pm25, nq(4) //Create quartiles by average annual pm2.5 at the city level

cd "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Merge with external pm2.5 datasets"

save "WHO PM 2.5 dataset by city.dta", replace


//City level merge of WHO pm2.5 quintiles with survey data
merge 1:m city region using "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\4_Full cleaned dataset"
drop if _merge==1
drop _merge

tab city_pm25_quartile // to make "Percentage of Observed Cities by pm2.5 Quintile" graph
tab city_pm25_quartile, summarize(city_pm25) // to make average city_pm25 by quintile
//bysort city_pm25_quintile : outreg2 using "average city pm25 by quintile.doc", replace sum(log) keep(city_pm25) eqdrop(min max)

save "Merged with 2.5 city.dta", replace

//Creating country level pm2.5 quintiles 
use "WHO PM 2.5 dataset by city.dta", clear

bysort region : egen region_pm25=mean(city_pm25)
by region, sort: gen nvals = _n == 1
drop if nvals==0
drop nvals

xtile region_pm25_quartile = region_pm25, nq(4)

drop city city_pm25 city_pm25_quartile


//merge of WHO pm2.5 quartiles with suvey data (to fill in where cities were not available in the WHO dataset)
merge 1:m region using "Merged with 2.5 city.dta"
drop if _merge==1
tab region if _merge==2
drop _merge

tab region_pm25_quartile
tab region_pm25_quartile, summarize(region_pm25) 
save "Merged with 2.5 city and country.dta", replace

tab region if region_pm25_quartile==.

/*
Need to use the World bank numbers for these countries because they didn't appear in the WHO dataset

Will need to add these country average pm2.5 into the WHO dataset and recalculate the average pm2.5 quartile								  
*/


//Extracting average country pm2.5 from World Bank Dataset

import delimited "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\External pm2.5 Datasets\World_Bank_database.csv", varnames (1) clear
keep datasource v62
drop in 1/2
rename datasource region
rename v62 region_pm25
keep if region=="Sierra Leone" | region=="Tanzania" | region=="Zimbabwe"
save "Countries missing pm2.5.dta", replace

//Appending World Bank's country level average annual pm2.5 for countries excluded from WHO dataset TO the WHO dataset
use "Countries missing pm2.5.dta", clear
append using "WHO PM 2.5 dataset by city.dta"

//Redo the quartiles with included World Bank country information//


bysort region : egen region_pm25_temp=mean(city_pm25)
replace region_pm25=region_pm25_temp if region_pm25==.
drop region_pm25_temp
xtile region_pm25_quartile = region_pm25, nq(4)

order city_pm25 city_pm25_quartile region_pm25 region_pm25_quartile

save "WHO+World Bank PM 2.5 dataset by city.dta", replace

//City level merge of WHO+World Bank pm2.5 quartiles with survey data
merge 1:m city region using "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\4_Full cleaned dataset"
drop if _merge==1
drop _merge
save "Merged with 2.5 city.dta (version 2)", replace

//Country level merge of WHO+World Bank pm2.5 quartiles with suvey data (to fill in where cities were not available in the WHO dataset)
use "WHO+World Bank PM 2.5 dataset by city.dta", clear
by region, sort: gen nvals = _n == 1
drop if nvals==0
drop city city_pm25 city_pm25_quartile nvals

merge 1:m region using "Merged with 2.5 city.dta (version 2)"
drop if _merge==1
drop _merge

generate pm25=city_pm25
replace pm25=region_pm25 if pm25==.

generate pm25_quartile=city_pm25_quartile
replace pm25_quartile=region_pm25_quartile if pm25_quartile==.

//Respondents by pm2.5 quartile and descriptive stats
tab pm25_quartile // to make "Percentage of Observed Cities by pm2.5 Quartile" graph
tab pm25_quartile, summarize(pm25) // to make average city_pm25 by quartile

bysort city: tab pm25_quartile, summarize (pm25)

//Dropping missing demographic location & age
drop if age==.
drop if region_pm25==.

save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\5_Full sample with pm2.5.dta", replace


//city by pm2.5 quartile and descriptive stats
by city, sort: gen nvals = _n == 1
drop if nvals==0
tab pm25_quartile
tab pm25_quartile, summarize(pm25)


*************FINAL DATASET
use "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\5_Full sample with pm2.5.dta", clear
clear


cd "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets"
use  "5_Full sample with pm2.5.dta", clear


***********************Subpopulation Datasets
use  "5_Full sample with pm2.5.dta", clear
drop if best_a1=="" & best_a2=="" & best_a3=="" & best_a4=="" & best_a5=="" & best_a6=="" & best_a7=="" & best_a8=="" & best_a9=="" & best_a10==""
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Subpopulations\Best subpopulation.dta", replace

use  "5_Full sample with pm2.5.dta", clear
drop if worst_a1=="" & worst_a2=="" & worst_a3=="" & worst_a4=="" & worst_a5=="" & worst_a6=="" & worst_a7=="" & worst_a8=="" & worst_a9=="" & worst_a10=="" & worst_a11=="" & worst_a12=="" & worst_a13==""
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Subpopulations\Worst subpopulation.dta", replace

use  "5_Full sample with pm2.5.dta", clear
drop if pollution_causes_a1=="" & pollution_causes_a2=="" & pollution_causes_a3=="" & pollution_causes_a4=="" & pollution_causes_a5=="" & pollution_causes_a6=="" & pollution_causes_a7=="" & pollution_causes_a8=="" & pollution_causes_a9==""
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Subpopulations\Pollution subpopulation.dta", replace

use  "5_Full sample with pm2.5.dta", clear
drop if nicer_city==""
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Subpopulations\Nicer City subpopulation.dta", replace

use  "5_Full sample with pm2.5.dta", clear
drop if air_quality_real==.
save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Subpopulations\Air Quality subpopulation.dta", replace

****************************Reshaped Datasets
use  "5_Full sample with pm2.5.dta", clear


//Analyzing Best, Worst, Pollution 
keep ID lang age age_bucket town best_a1_count-best_a12_count worst_a1_count-worst_a13_count pollution_causes_a1_count-pollution_causes_a9_count
sort ID
quietly by ID :  gen dup = cond(_N==1,0,_n)

reshape long best worst pollution, i(ID) j(question, string)

label define bestlabel 1 "The people" 2 "Places to play" 3 "Being close to school or work" 4 "There are many things to do" 5 "The shops and restaurants" 6 "It's easy to get around" 7 "Healthcare" 8 "Being close to my family" 9 "The climate and environment" 10 "Work opportunities for my family" 11 "Access to green space, like parks" 12 "Other"
label values best bestlabel

label define worstlabel 1 "It doesn't always feel safe" 2 "The people; it's not friendly" 3 "The traffic/congestion" 4 "The noise" 5 "Not enough places to play" 6  "Not enough places to meet friends" 7 "Shortage of work opportunities for my family" 8 "It's hard to get around" 9 "Not easy enough to get healthcare" 10 "The pollution" 11 "It's too crowded" 12 "Not enough green space like parks" 13 "Other"
label values worst worstlabel

label define pollutionlabel 1 "Factories" 2 "Burning of rubbish" 3 "Motor transport (cars, buses, lorries)" 4 "Construction/building work" 5 "Pollution blown into the city (from outside)" 6 "Household cooking (cooking fires/stoves)" 7 "Household heating (boilers, wood fires etc)" 8 "Agriculture/farming" 9 "Other"
label values pollution pollutionlabel

save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Reshaped dataset for best_worst_pollution.dta", replace

tab best
tab worst
tab pollution


//Generating Qual dataset
use  "5_Full sample with pm2.5.dta", clear

keep ID eligibility age age_bucket preg town region best_a1_vote-best_a11_vote best_a12 worst_a1_vote-worst_a12_vote worst_a13 gen_city_comments air_quality pollution_causes_a1_vote-pollution_causes_a8_vote pollution_causes_a9  gen_air_comments nicer_city_count nicer_comments mayor_ans1 mayor_ans2 final_comments pm25 pm25_quartile submit_time

drop if best_a12=="" & worst_a13=="" & gen_city_comments=="" & pollution_causes_a9=="" &  gen_air_comments==""  &  nicer_comments=="" &  mayor_ans1=="" &  mayor_ans2=="" &  final_comments==""

sort best_a12 worst_a13 gen_city_comments pollution_causes_a9 gen_air_comments nicer_comments mayor_ans1 mayor_ans2 final_comments 

save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\qualitative dataset.dta", replace

export excel region-pm25_quartile using "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Qual Data with pm25.xls", replace

/*

//Adding in new qual!!
use "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\qualitative dataset.dta", clear
gen newvar=1
sort ID
order ID
by ID, sort : generate dup = _n == 1
sort newvar
order ID dup newvar
drop if dup==0 & newvar==1
drop if newvar==.
drop dup newvar

order region pm25 town ID eligibility preg best_a12 worst_a13 gen_city_comments air_quality pollution_causes_a9 gen_air_comments nicer_comments mayor_ans1 mayor_ans2 final_comments age age_bucket best_a1_vote-best_a11_vote worst_a1_vote-worst_a12_vote pollution_causes_a1_vote-pollution_causes_a8_vote nicer_city_count pm25_quintile

export excel region-pm25_quintile using "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Qual Data with pm25 NEW.xls", replace
*/

clear


*******************ADDING IN James's data
cd "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\CCC\Important documents for CCC survey methods slide deck\CCC Analysis"

import excel "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\CCC\Important documents for CCC survey methods slide deck\CCC Analysis\CCC_city_indicators.xlsx", sheet("Cities") firstrow clear

save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\CCC\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Country Level\CCC_city_indicators.dta", replace

rename City city

merge 1:m city using "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\CCC\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\5_Full sample with pm2.5.dta"

save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\CCC\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\5_Full sample with pm2.5.dta", replace

ssc install outreg2


//To pull down all figures for final report

cd "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets"
use  "5_Full sample with pm2.5.dta", clear

drop region_pm25 region_pm25_quartile city_pm25 city_pm25_quartile live eligible q FAQs variable_0 consent1 consent2 consent3 consent4 consent5 consent mistake dup nonconsent lang utm_source utm_medium utm_campaign utm_campaign_location utm_content Start_Date Submit_Date submit_time start_time Network_ID submit_hms start_hms submit_hms_time start_hms_time completion_time_minutes completion_time_seconds completion_time_seconds_missing town

drop best_a1-best_a11 best*count worst_a1-worst_a12 worst*count pollution_causes_a1-pollution_causes_a8 pollution*count nicer_city air_quality

order ID geoscheme_region geoscheme_subregion OECD city region pm25 pm25_quartile eligibility target_population age age_bucket preg nicer_city_count air_quality_real best_a1_vote-best_a12_vote worst_a1_vote-worst_a13_vote pollution_causes_a1_vote-pollution_causes_a9_vote best_a12 worst_a13 pollution_causes_a9 nicer_comments gen_city_comments gen_air_comments final_comments

save "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\7_anonymized data.dta", replace

export excel using "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Anonymized data.xls", firstrow(variables) replace


*****************Full sample descriptive univariates
//Age

//ages that are in date (year) format were converted to the age in years the respondent would be, at the time of analysis (September 13, 2021) if they were born mid-year (June, 1) of the reported year

tab age
tab age_bucket
su age, detail

//Target population
tab eligibility

//Median Completion Time
su completion_time_minutes, detail

//Geographic location by pm25_quartile
encode city, gen(city_encoded)
encode region, gen(region_encoded)
tab city
tab region

//outreg2 town using "demographics table.doc", append cross
asdoc bysort pm25_quartile : tab city_encoded, c(city_encoded) replace
by pm25_quartile, sort: tab region if city=="missing"

//PM2.5 by observation
tab pm25_quartile

tabstat pm25, stat(n mean min max sd p50) by (pm25_quartile)

//PM2.5 by city
tab city, su(pm25)
by city, sort: gen nvals = _n == 1
drop if nvals==0
tab pm25_quartile
tab pm25_quartile, summarize(pm25)

//Income by pm2.5
by city, sort: gen nvals = _n == 1
drop if nvals==0
sort pm25_quartile 
by pm25_quartile : su CityGDPpercap


***************Subpopulation 'Best'
cd "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Subpopulations"

use "Best subpopulation.dta", clear

//Univariates
su age
//outreg2 using "demographics table.doc", append sum(log) keep(age)


tab age_bucket
tab eligibility

tabm best_a1_vote-best_a11_vote, column //Each item made up what percent of the overall votes for 'best' things? 
tabm best_a1_vote-best_a11_vote, row //What percent of people voted for each item? 

tabm best_a1_vote-best_a12_vote, column row 


//heat maps
sort pm25_quartile
by pm25_quartile : tab city
tabulate city best_a1_vote, row
tabulate city best_a2_vote, row
tabulate city best_a3_vote, row
tabulate city best_a4_vote, row
tabulate city best_a5_vote, row
tabulate city best_a6_vote, row
tabulate city best_a7_vote, row
tabulate city best_a8_vote, row
tabulate city best_a9_vote, row
tabulate city best_a10_vote, row
tabulate city best_a11_vote, row
tabulate city best_a12_vote, row

by pm25_quartile : tab region
tabulate region best_a1_vote, row
tabulate region best_a2_vote, row
tabulate region best_a3_vote, row
tabulate region best_a4_vote, row
tabulate region best_a5_vote, row
tabulate region best_a6_vote, row
tabulate region best_a7_vote, row
tabulate region best_a8_vote, row
tabulate region best_a9_vote, row
tabulate region best_a10_vote, row
tabulate region best_a11_vote, row
tabulate region best_a12_vote, row

tabulate pm25_quartile best_a1_vote, row
tabulate pm25_quartile best_a2_vote, row
tabulate pm25_quartile best_a3_vote, row
tabulate pm25_quartile best_a4_vote, row
tabulate pm25_quartile best_a5_vote, row
tabulate pm25_quartile best_a6_vote, row
tabulate pm25_quartile best_a7_vote, row
tabulate pm25_quartile best_a8_vote, row
tabulate pm25_quartile best_a9_vote, row
tabulate pm25_quartile best_a10_vote, row
tabulate pm25_quartile best_a11_vote, row
tabulate pm25_quartile best_a12_vote, row

tabulate target_population best_a1_vote, row
tabulate target_population best_a2_vote, row
tabulate target_population best_a3_vote, row
tabulate target_population best_a4_vote, row
tabulate target_population best_a5_vote, row
tabulate target_population best_a6_vote, row
tabulate target_population best_a7_vote, row
tabulate target_population best_a8_vote, row
tabulate target_population best_a9_vote, row
tabulate target_population best_a10_vote, row
tabulate target_population best_a11_vote, row
tabulate target_population best_a12_vote, row

tabulate age_bucket best_a1_vote, row
tabulate age_bucket best_a2_vote, row
tabulate age_bucket best_a3_vote, row
tabulate age_bucket best_a4_vote, row
tabulate age_bucket best_a5_vote, row
tabulate age_bucket best_a6_vote, row
tabulate age_bucket best_a7_vote, row
tabulate age_bucket best_a8_vote, row
tabulate age_bucket best_a9_vote, row
tabulate age_bucket best_a10_vote, row
tabulate age_bucket best_a11_vote, row
tabulate age_bucket best_a12_vote, row


/*The most common 'best' things were 'Being close to my family' at 14.40% of the 4,492 votes cast (with individuals recieving more than one vote),  'Being close to school' at 13.42% of the vote, and that 'There are many things to do' at 12.62% of the vote.

29.60%, 27.58%, and 25.94% of the 2,186 respondents marked these items respectively as being in the top three 'best' things about their cities. */



//Multivariates


***************Subpopulation 'Worst'
use "Worst subpopulation.dta", clear

//Univariates
su age
//tab age_bucket
tab eligibility

tabm worst_a1_vote-worst_a13_vote, column  //Each item made up what percent of the overall votes for 'worst' things? 
tabm worst_a1_vote-worst_a13_vote, row //What percent of people voted for each item? 

tabm worst_a1_vote-worst_a13_vote, column row


/*The most common 'worst' things were 'The traffic/congestion' at 21.18% of the 5,128 votes cast (with individuals recieving more than one vote),  'The pollution' at 16.63% of the vote, and 'Shortage of work opportunities for my family' at 10.92% of the vote.

47.93%, 37.64%, and 24.71% of the 2,266 respondents marked these items respectively as being in the top three 'worst' things about their cities. */


//heat maps
sort pm25_quartile
by pm25_quartile : tab city
tabulate city worst_a1_vote, row
tabulate city worst_a2_vote, row
tabulate city worst_a3_vote, row
tabulate city worst_a4_vote, row
tabulate city worst_a5_vote, row
tabulate city worst_a6_vote, row
tabulate city worst_a7_vote, row
tabulate city worst_a8_vote, row
tabulate city worst_a9_vote, row
tabulate city worst_a10_vote, row
tabulate city worst_a11_vote, row
tabulate city worst_a12_vote, row
tabulate city worst_a13_vote, row


tabulate target_population  worst_a1_vote, row
tabulate target_population  worst_a2_vote, row
tabulate target_population  worst_a3_vote, row
tabulate target_population  worst_a4_vote, row
tabulate target_population  worst_a5_vote, row
tabulate target_population  worst_a6_vote, row
tabulate target_population  worst_a7_vote, row
tabulate target_population  worst_a8_vote, row
tabulate target_population  worst_a9_vote, row
tabulate target_population  worst_a10_vote, row
tabulate target_population  worst_a11_vote, row
tabulate target_population  worst_a12_vote, row
tabulate target_population  worst_a13_vote, row


tabulate pm25_quartile   worst_a1_vote, row
tabulate pm25_quartile   worst_a2_vote, row
tabulate pm25_quartile   worst_a3_vote, row
tabulate pm25_quartile   worst_a4_vote, row
tabulate pm25_quartile   worst_a5_vote, row
tabulate pm25_quartile   worst_a6_vote, row
tabulate pm25_quartile   worst_a7_vote, row
tabulate pm25_quartile   worst_a8_vote, row
tabulate pm25_quartile   worst_a9_vote, row
tabulate pm25_quartile   worst_a10_vote, row
tabulate pm25_quartile   worst_a11_vote, row
tabulate pm25_quartile   worst_a12_vote, row
tabulate pm25_quartile   worst_a13_vote, row

tabulate age_bucket worst_a1_vote, row
tabulate age_bucket worst_a2_vote, row
tabulate age_bucket worst_a3_vote, row
tabulate age_bucket worst_a4_vote, row
tabulate age_bucket worst_a5_vote, row
tabulate age_bucket worst_a6_vote, row
tabulate age_bucket worst_a7_vote, row
tabulate age_bucket worst_a8_vote, row
tabulate age_bucket worst_a9_vote, row
tabulate age_bucket worst_a10_vote, row
tabulate age_bucket worst_a11_vote, row
tabulate age_bucket worst_a12_vote, row
tabulate age_bucket worst_a13_vote, row


***************Subpopulation 'Pollution'
cd "C:\Users\rache\OneDrive - London School of Hygiene and Tropical Medicine\Important documents for CCC survey methods slide deck\CCC Analysis\Built Datasets\Subpopulations"
use "Pollution subpopulation.dta", clear

su age
//tab age_bucket
tab eligibility

tabm pollution_causes_a1_vote-pollution_causes_a9_vote, column  //Each item made up what percent of the overall votes for 'worst' things? 
tabm pollution_causes_a1_vote-pollution_causes_a9_vote, row //What percent of people voted for each item? 

tabm pollution_causes_a1_vote-pollution_causes_a9_vote, column row


sort pm25_quartile
by pm25_quartile : tab city
tabulate city pollution_causes_a1_vote, row
tabulate city pollution_causes_a2_vote, row
tabulate city pollution_causes_a3_vote, row
tabulate city pollution_causes_a4_vote, row
tabulate city pollution_causes_a5_vote, row
tabulate city pollution_causes_a6_vote, row
tabulate city pollution_causes_a7_vote, row
tabulate city pollution_causes_a8_vote, row
tabulate city pollution_causes_a9_vote, row



tabulate target_population pollution_causes_a1_vote, row
tabulate target_population pollution_causes_a2_vote, row
tabulate target_population pollution_causes_a3_vote, row
tabulate target_population pollution_causes_a4_vote, row
tabulate target_population pollution_causes_a5_vote, row
tabulate target_population pollution_causes_a6_vote, row
tabulate target_population pollution_causes_a7_vote, row
tabulate target_population pollution_causes_a8_vote, row
tabulate target_population pollution_causes_a9_vote, row


tabulate pm25_quartile pollution_causes_a1_vote, row
tabulate pm25_quartile pollution_causes_a2_vote, row
tabulate pm25_quartile pollution_causes_a3_vote, row
tabulate pm25_quartile pollution_causes_a4_vote, row
tabulate pm25_quartile pollution_causes_a5_vote, row
tabulate pm25_quartile pollution_causes_a6_vote, row
tabulate pm25_quartile pollution_causes_a7_vote, row
tabulate pm25_quartile pollution_causes_a8_vote, row
tabulate pm25_quartile pollution_causes_a9_vote, row

tabulate age_bucket pollution_causes_a1_vote, row
tabulate age_bucket pollution_causes_a2_vote, row
tabulate age_bucket pollution_causes_a3_vote, row
tabulate age_bucket pollution_causes_a4_vote, row
tabulate age_bucket pollution_causes_a5_vote, row
tabulate age_bucket pollution_causes_a6_vote, row
tabulate age_bucket pollution_causes_a7_vote, row
tabulate age_bucket pollution_causes_a8_vote, row
tabulate age_bucket pollution_causes_a9_vote, row


***************Subpopulation 'air quality'
use "Air Quality subpopulation.dta", clear
su age
//tab age_bucket
tab eligibility

tabstat air_quality_real, by(pm25_quartile) stat(n mean sd)
tabstat air_quality_real, by(age_bucket) stat(n mean sd)
tabstat air_quality_real, by(target_population) stat(n mean sd)
tabstat air_quality_real, by(city) stat(n mean sd)

bysort region: outreg2 using "full sample.xls", replace sum(log) keep(air_quality_real) eqkeep(N mean sd)

bysort town: outreg2 using "full sample.xls", replace sum(log) keep(air_quality_real) eqkeep(N mean sd)

bysort OECD: outreg2 using "full sample.xls", replace sum(log) keep(air_quality_real) eqkeep(N mean sd)
***************Subpopulation 'Nicer city'
use "Nicer City subpopulation.dta", clear

su age
//tab age_bucket
tab eligibility


sort target_population
by target_population : tab nicer_city_count

sort age_bucket
by age_bucket : tab nicer_city_count
table age_bucket nicer_city_count	


tab nicer_city_count
sort pm25_quartile
by pm25_quartile : tab nicer_city_count

by OECD : tab nicer_city_count
by OECD : su pm25

clear