*close any log left open by an earlier run (no error if none is open), then start a fresh text log
capture log close
log using "$logdir\an_studypopulation_flowchart.txt", text replace

*guard: the dates in the flowchart text and in the code below are hard-coded,
*so stop here if the study globals no longer match them
assert "$studyend_primary" == "d(31/07/2020)"
assert $wave1start == 10 /*i.e. 5th March 2020*/
*change description and code below if asserts fail

*load the CPRD Aurum acceptable-patient denominator and give it the GOLD variable names
use "$denominatordir_aurum\\${buildyear}${buildmonth}_CPRDAurum_AcceptablePats", clear
keep if accept == 1
keep patid pracid gender yob uts region lcd regstartdate regenddate cprd_ddate

*registration start, registration end and death date take the names used in GOLD
rename (regstartdate regenddate cprd_ddate) (crd tod deathdate)

*remove gender code I, then recode M to 1 and F to 2 (any other code is left missing)
drop if gender == "I"
rename gender gender_str
gen gender = cond(gender_str == "M", 1, cond(gender_str == "F", 2, .))
drop gender_str

*tag the source and save the Aurum records to a tempfile for the later append
gen database = "aurum"
tempfile temp
save `temp'

*load the CPRD GOLD denominator and turn its string dates into Stata dates
use "$rawdatadir\GOLD_Denominators_${buildyear}_${buildmonth}\\cr_denominatorfiles_gold.dta", clear
keep if accept == 1
keep patid gender yob crd tod deathdate pracid region lcd uts

*each date is parsed as a day-month-year string and replaced by a numeric %td variable
local datevars crd tod deathdate lcd uts
foreach v of local datevars {
	di "`v'"
	gen _parsed = date(`v', "DMY")
	drop `v'
	rename _parsed `v'
	format `v' %td
}

*patid becomes a string - presumably to match the Aurum patid before appending (confirm)
tostring patid, replace
gen database = "gold"

*add the saved Aurum records to the GOLD records in memory
append using `temp'

*birth date and 40th birthday are both placed on 1 July of the relevant year
gen dob = mdy(7, 1, yob)
gen _year40 = yob + 40
gen _day40 = mdy(7, 1, _year40)

*follow-up starts at the later of one year after current registration (crd) and the 40th birthday
gen startfup = max(crd + 365.25, _day40)
*follow-up ends at the earliest of tod, practice last collection (lcd) and death
gen endfup = min(tod, lcd, deathdate)

*show the derived dates as dates
format _day40 startfup endfup %td


/*
Data collected from general practices contributing to CPRD* (October 2020 release)
CPRD Aurum n = XXXX practices, XXXX subjects
CPRD GOLD n = XXXX practices, XXXX subjects
Total n = XXXX practices, XXXX subjects
*/

*practice and subject counts for each database in turn
local name_aurum "CPRD Aurum"
local name_gold "CPRD GOLD"
foreach db in aurum gold {
	distinct pracid if database == "`db'"
	di as yellow "`name_`db'' n = `r(ndistinct)' practices"
	count if database == "`db'"
	di as yellow "`name_`db'' n = `r(N)' subjects"
}

*the same counts for both databases combined
distinct pracid
di as yellow "Total n = `r(ndistinct)' practices"
count
di as yellow "Total n = `r(N)' subjects"


/*EXC: Practice last collection date before 31st July 2020
CPRD Aurum n = XXXX practices, XXXX subjects
CPRD GOLD n = XXXX practices, XXXX subjects
Total n = XXXX practices, XXXX subjects */

*See check at end to make sure that no practices are in both databases

*the exclusion rule is written once and reused in every count below
local lcdearly "lcd < d(31/07/2020)"

distinct pracid if `lcdearly' & database == "aurum"
di as yellow "CPRD Aurum n = `r(ndistinct)' practices"
count if `lcdearly' & database == "aurum"
di as yellow "CPRD Aurum n = `r(N)' subjects"

distinct pracid if `lcdearly' & database == "gold"
di as yellow "CPRD GOLD n = `r(ndistinct)' practices"
count if `lcdearly' & database == "gold"
di as yellow "CPRD GOLD n = `r(N)' subjects"

distinct pracid if `lcdearly'
di as yellow "Total n = `r(ndistinct)' practices"
count if `lcdearly'
di as yellow "Total n = `r(N)' subjects"


/*Registered in general practice contributing to CPRD on 31st July 2020
CPRD Aurum n = XXXX practices, XXXX subjects
CPRD GOLD n = XXXX practices, XXXX subjects
Total n = XXXX practices, XXXX subjects */

*remove everyone whose practice stopped contributing before 31 July 2020
drop if lcd < d(31/07/2020)

*practice and subject counts for each database in turn
local name_aurum "CPRD Aurum"
local name_gold "CPRD GOLD"
foreach db in aurum gold {
	distinct pracid if database == "`db'"
	di as yellow "`name_`db'' n = `r(ndistinct)' practices"
	count if database == "`db'"
	di as yellow "`name_`db'' n = `r(N)' subjects"
}

*the same counts for both databases combined
distinct pracid
di as yellow "Total n = `r(ndistinct)' practices"
count
di as yellow "Total n = `r(N)' subjects"


/*EXC: Current registration date after 31st July 2020 (XXXX subjects)
Left practice or died before 5th March 2015 (XXXX subjects)
Less than 40 at end of follow-up or on 31st July 2020 (XXXX subjects)
Total excluded (XXXX subjects) */

*each criterion is counted on its own (a subject can meet more than one),
*then all criteria are counted together for the total that is dropped afterwards
count if crd > d(31/07/2020)
di as yellow "crd > 31st July 2020 = `r(N)' subjects"

*this message previously omitted the count, unlike the other exclusion messages
count if _day40 > d(31/07/2020) | _day40 > endfup
di as yellow "less than 40 at end of followup or on 31st July 2020 = `r(N)' subjects"

count if endfup < d(05/03/2015) /*01/03/2015 in orig study population*/
di as yellow "left practice or died before 5th March 2015 = `r(N)' subjects"

count if crd > d(31/07/2020) | endfup < d(05/03/2015) | _day40 > d(31/07/2020) | _day40 > endfup
di as yellow "total exclusions = `r(N)' subjects"



/*Registered in general practice between 5 March 2015 and 31 July 2020 when aged >= 40 (n = XXXX subjects)*/
*drop anyone meeting at least one of the registration, follow-up or age exclusions
local excluded "crd > d(31/07/2020) | endfup < d(05/03/2015) | _day40 > d(31/07/2020) | _day40 > endfup"
drop if `excluded'
count
di as yellow "Total n = `r(N)' subjects"


/*EXC:
Less than 1 year of follow up before 31 July 2020 (XXXX subjects)
Total excluded (XXXX subjects)*/

*flag subjects whose follow-up would start after it ends, or after 31 July 2020
gen temp = 1 if startfup > endfup | startfup > d(31/07/2020)
count if temp == 1
di as yellow "less than 1 year of follow-up before 31 July 2020 = `r(N)' subjects"

*this is the only exclusion at this step, so the total repeats the same count
di as yellow "total exclusions = `r(N)' subjects"


/*registered for at least a year at end of followup or before 31 July 2020  (n = XXXX subjects)*/
drop if temp == 1
count
local totaltocheck = r(N)
di as yellow "Total n = `totaltocheck' subjects"

/*
/*check against cr_studypopulation_foraggregate*/
save "$datadir\_temp.dta", replace

use "$datadir\cr_studypopulation_foraggregate_gold_primary.dta", clear
gen database = "gold"
gen pracid = mod(patid, 1000)
tostring patid, replace
append using "$datadir\cr_studypopulation_foraggregate_aurum_primary.dta"
rename endfup endfup_foraggregate
rename startfup startfup_foraggregate
replace database = "aurum" if database == ""
gen _pracid = substr(patid, -5, .) if database == "aurum"
destring _pracid, replace
replace pracid = _pracid if database == "aurum"
drop _pracid

merge 1:1 patid using "$datadir\_temp.dta", update replace
keep if _merge !=3
keep patid
save "$datadir\_temp.dta", replace
/*
merged this with data from raw denoms (see top of this file)
additional patids in aggregate study population either started fup after study end
or ended follow-up before study started
CHECKS COMPLETE - ALL OK
*/
*/

/*WAVE 1*/
*counts subjects whose follow-up starts before 29 May 2020 and ends on or after 5 March 2020
*NOTE(review): the wave 1 follow-up section further down caps follow-up at 27 May 2020
*and treats 83 days as the full wave - confirm the 29 May cut-off used here is intended
count if startfup < d(29/05/2020) & endfup >=d(05/03/2020)
di as yellow "Total included in Wave 1 n = `r(N)' subjects"

/*AVERAGE FOLLOW-UP (YEARS)*/
*follow-up clipped to the study window (5 March 2015 to 31 July 2020);
*the difference in days is divided by 365.25, so totalfup is in years
gen studystartfup = max(startfup, d(05/03/2015))
gen studyendfup = min(endfup, d(31/07/2020))
gen totalfup = (studyendfup - studystartfup)/365.25
summ totalfup, d

*proportion followed up for whole of study period
*(numerator = subjects whose follow-up equals the longest follow-up observed)
count
local denom = r(N)
qui summ totalfup
*keep the maximum in a scalar: a macro-expanded copy of r(max) passes through a
*decimal string, which can lose digits and make the equality test miss true matches
tempname maxfup
scalar `maxfup' = r(max)
count if totalfup == `maxfup'
local num = r(N)
di (`num'/ `denom')*100


***Wave 1

*follow-up clipped to 5 March 2020 - 27 May 2020, measured in days
gen wave1startfup = max(startfup, d(05/03/2020))
gen wave1endfup = min(endfup, d(27/05/2020))
gen wave1fup = wave1endfup - wave1startfup
summ wave1fup if wave1fup > 0, d

*proportion followed up for whole of wave 1
*numerator: follow-up spans the full window (27 May minus 5 March = 83 days)
count if wave1fup == d(27/05/2020) - d(05/03/2020)
local num = r(N)
*denominator: anyone with some follow-up inside the window
count if wave1fup > 0
local denom = r(N)
di (`num'/ `denom')*100

/*LINKED?*/

/*make sure no practices left CPRD GOLD on or after July 31st 2020 and joined
Aurum and vice versa*/
use "$resourcedir_aurum\Vision to Emis Migrators\\${buildyear}${buildmonth}VisiontoEmisMigrators.dta", clear

*parse the string gold_lcdate (day-month-year) into a numeric Stata date with a date display format
gen goldlcd = date(gold_lcdate, "DMY")
format goldlcd %dD/N/CY
drop gold_lcdate
*stop the run if any row has goldlcd on or after the primary study end date
*(a missing goldlcd also fails, because missing compares as larger than any number)
assert goldlcd < $studyend_primary
summ goldlcd, format

*reload the Aurum-only records saved to the tempfile near the top of this file
use `temp', clear
