capture log close

/* Arguments:
     database : source database; "aurum" and "gold" get database-specific handling below
     linkage  : "primary" or "linked"; selects how follow-up is derived below */
args database linkage

* Fail fast on missing/unsupported arguments: follow-up (startfup/endfup) is only
* derived for "primary" and "linked", so any other value would fail later with an
* obscure error after a misnamed log file had already been opened.
if "`database'" == "" | "`linkage'" == "" {
	display as error "two arguments are required: database linkage"
	exit 198
}
if !inlist("`linkage'", "primary", "linked") {
	display as error "linkage must be primary or linked (got: `linkage')"
	exit 198
}

log using "$logdir\cr_studypopulation_foraggregate_`database'_`linkage'.txt", replace text

/*******************************************************************************
CREATE STUDY POPULATION FILES FOR AGGREGATE ANALYSIS FOR EACH DATABASE / LINKAGE
= PATIENT LEVEL DATABASE WITH THE FOLLOWING VARIABLES 
patid gender yob dob startfup endfup died region
*******************************************************************************/

/* LOAD THE BROAD STUDY POPULATION AND HARMONISE VARIABLE NAMES */
use "$datadir\cr_studypopulation_broad_`database'.dta", clear

* In Aurum the death date is held in cprd_ddate; the rest of this file expects deathdate
if "`database'" == "aurum" {
	rename cprd_ddate deathdate
}


/* RESTRICT TO THE LINKAGE-ELIGIBLE POPULATION (linked analyses only) */
if "`linkage'" == "linked" {

	* Attach the linkage eligibility flags; no patient may appear only in the eligibility file
	merge 1:1 patid using "$rawdatadir\\`database'_linked\\`database'_20_163R_Linkage_Eligibility_set$linkageset"
	assert inlist(_merge, 1, 3)

	* Log the size of the population and the distribution of each flag before restricting
	count
	foreach flag in hes_e death_e lsoa_e {
		tab `flag'
	}

	* Keep matched patients flagged on all three indicators (hes_e, death_e, lsoa_e)
	keep if _merge == 3 & hes_e == 1 & death_e == 1 & lsoa_e == 1
	drop _merge hes_e death_e lsoa_e
	count

	/* ONS DATE OF DEATH */
	* Records found only in the death file are discarded rather than asserted absent
	* (HC changed 26 03 2021: previously "assert _merge != 2")
	merge 1:1 patid using "$rawdatadir\\`database'_linked\\`database'_20_163R_Death_set$linkageset", keepusing(dod) keep(master match) nogenerate

	* Log how many patients have a linked death date (dod) but no existing deathdate
	count if dod != . & deathdate == .

	* From here on deathdate holds the linked date of death
	drop deathdate
	rename dod deathdate
}


/* KEEP MEN AND WOMEN, ADD MISSING PRACTICE REGION IN AURUM */
if "`database'" == "aurum" {
	* Harmonise registration date names (crd / tod are used to derive follow-up below)
	rename regstartdate crd
	rename regenddate tod

	* Keep men and women only. Previously only "I" was dropped, which left any other
	* non-M/F code in the data with a missing numeric gender; this now matches the
	* gold branch, which keeps gender 1 or 2 only.
	keep if inlist(gender, "M", "F")

	* Recode the string gender to numeric: 1 = M, 2 = F (original kept in `string')
	rename gender string
	gen gender = 1 if string == "M"
	replace gender = 2 if string == "F"
	count
	
	/* addregion pracid region
	   Sets region to `region' for practice `pracid'.
	   Aborts (assert) if that practice already has a non-missing region,
	   so an existing value is never silently overwritten. */
	capture program drop addregion
	program define addregion
		args pracid region
		assert region == . if pracid == `pracid'
		replace region = `region' if pracid == `pracid'
	end
	
	* Practices with no region recorded (codes as in regionlab: 2 North West, 5 West Midlands, 7 South West)
	addregion 20957 5
	addregion 21338 5
	addregion 21595 2
	addregion 21600 7
	addregion 21629 5

	}
	
* Gold: gender is already numeric; retain codes 1 and 2 only
if "`database'" == "gold" {
	keep if inlist(gender, 1, 2)
}
		
* Value labels for gender, then tabulate (including missing) as a check in the log
label define genderlab 1 "Male"
label define genderlab 2 "Female", add
label values gender genderlab
tabulate gender, missing

* Value labels for practice region
label define regionlab ///
	1 "North East" ///
	2 "North West" ///
	3 "Yorkshire And The Humber" ///
	4 "East Midlands" ///
	5 "West Midlands" ///
	6 "East of England" ///
	7 "South West" ///
	8 "South Central" ///
	9 "London" ///
	10 "South East Coast" ///
	11 "Northern Ireland" ///
	12 "Scotland" ///
	13 "Wales"
label values region regionlab


/* START AND END OF FOLLOW UP*/
* _day40 = 1 July of the year in which the patient turns 40 (from year of birth)
gen _year40 = yob + 40
gen _day40 = mdy(07,01,_year40)

* History: HS change 5th Feb 2021 - added one year after crd; HC change to _day40 on 08/02/2021
* NOTE(review): crd+365.25 yields a non-integer date value - confirm this is intended.
* Stata's max()/min() ignore missing arguments, so a missing tod/deathdate does not
* make endfup missing unless every argument is missing.
if "`linkage'"=="primary" {
	* Start: latest of (crd + 1 year) and _day40. End: earliest of tod, lcd, death.
	gen startfup = max((crd+365.25), _day40)
	gen endfup = min(tod, lcd, deathdate)
}
else if "`linkage'"=="linked" {
	* As primary, but additionally bounded by the linked study start/end and the HES study end
	gen startfup = max((crd+365.25), _day40, ${studystart_`linkage'})
	gen endfup = min(tod, lcd, deathdate, ${studyend_`linkage'}, ${studyend_hes})
}
else {
	* Without this guard startfup/endfup would not exist and a later command would fail obscurely
	display as error "linkage must be primary or linked (got: `linkage')"
	exit 198
}

* died = 1 when follow-up ends on the date of death.
* The !missing() guard is needed because (. == .) is true in Stata: without it a
* patient with no death date and a missing endfup would be flagged as died.
gen died = 0
replace died = 1 if deathdate == endfup & !missing(deathdate)


/*APPLY CRITERIA THAT WERE MISSED OUT IN BROAD DENOMINATOR POPULATION DEFINITIONS*/

* Study period bounds for this linkage, taken from the globals set by the caller
local study_start ${studystart_`linkage'}
local study_end   ${studyend_`linkage'}

* 1. Practice last collection date (lcd) falls before the end of the study period
drop if lcd < `study_end'
count

* 2. Follow-up ends on or before the start of the study period
drop if endfup <= `study_start'
count

* 3. Follow-up ends on or before it starts (no time at risk)
drop if endfup <= startfup

* 4. Follow-up starts after the end of the study period
*    (not run before; reduces the size of the file but nothing to worry about -
*    these people would be excluded from the weekly counts anyway)
drop if startfup > `study_end'

count

* Retain only the analysis variables and write the patient-level file for this database/linkage
local analysisvars patid gender yob dob startfup endfup died region
keep `analysisvars'
save "$datadir\cr_studypopulation_foraggregate_`database'_`linkage'.dta", replace

* Close the log opened at the top of this file (capture: no error if none is open)
capture log close
