/* Cost data and GLM October 2020 */ use http://www.stata-press.com/data/heus/heus_mepssample, clear cd "H:\Teaching\Methods 2020\lectures\Week 7 costs and GLM\code" desc exp_* age female pcs race* // --- Medical costs characteristics sum exp_tot, det sum exp_ip, det sum exp_er, det * histogram hist exp_tot, kdensity title("Total Expenses 2004") saving(thist1.gph, replace) hist exp_tot if exp_tot < 100000, kdensity title("Total Expenses 2004") saving(thist2.gph, replace) graph combine thist1.gph thist2.gph, ysize(10) xsize(20) graph export histc.png, replace * all ages tabstat exp_tot, stats(N mean p5 p10 p50 p75 p90 p99) * older than 75 tabstat exp_tot if age >75, stats(N mean p5 p10 p50 p75 p90 p99) hist exp_tot if age > 75, kdensity title("Total Expenses 2004") saving(thist1.gph, replace) gen zero = 0 replace zero = 1 if exp_tot ==0 tab zero tab zero if age > 75 * Still a problem without the zeroes tabstat exp_tot if exp_tot >0, stats(N mean sd p5 p10 p50 p75 p90 p99 min max) hist exp_tot if exp_tot >0, kdensity graph export noz.png, replace // --- OLS models reg exp_tot age i.female pcs race* eth_hisp, robust predict yhat sum yhat qui reg exp_tot age female pcs predict res, rstandard qnorm res, saving(qno.gph, replace) kdensity res, saving(hisres.gph, replace) graph combine qno.gph hisres.gph graph export res.png, replace // box-cox gen exp_tot1 = exp_tot + 1 boxcox exp_tot1 age female pcs race* eth_hisp, model(lhsonly) lrtest nolog nologlr boxcox exp_tot1 age female pcs race* eth_hisp if exp_tot > 0, model(lhsonly) lrtest nolog nologlr // -- log transformation gen lexp = log(1+exp_tot) kdensity lexp, title("ln(exp_tot + 1)") graph export lexp.png, replace reg lexp age female pcs race* eth_hisp predict resl, rstandard qnorm resl, saving(qnol.gph, replace) kdensity resl, saving(hisresl.gph, replace) graph combine qnol.gph hisresl.gph graph export resl.png, replace reg lexp age female pcs race* di 100*(exp(_b[eth_hisp]) -1 ) // --- Duan's smearing factor * Estimate log model qui reg lexp age female pcs race* eth_hisp if exp_tot > 0 predict epsilonhat if e(sample), residual * predictions in ln scale predict lyhat if e(sample) * Exponent of predictions gen explyhat = exp(lyhat) * Duan's smearing factor egen dduan = mean(exp(epsilonhat)) * Transform exponent of predictions gen yhatduan = explyhat * dduan sum yhatduan exp_tot if exp_tot > 0 /// --- GLM reg exp_tot age i.female pcs race* eth_hisp, robust glm exp_tot age i.female pcs race* eth_hisp, family(gaussian) link(identity) vce(robust) qui glm exp_tot age i.female pcs i.race* i.eth_hisp, family(gaussian) link(log) vce(robust) di exp(_b[1.female]) margins i.female, post di _b[1.female]/_b[0.female] glm exp_tot age i.female pcs i.race* i.eth_hisp, /// family(gaussian) link(log) vce(robust) nolog eform glm exp_tot age i.female pcs i.race* i.eth_hisp, /// family(gaussian) link(log) vce(robust) nolog margins, dydx(*) /// --- Check residuals qui glm exp_tot age i.female pcs i.race* i.eth_hisp, /// family(gaussian) link(log) vce(robust) nolog predict double resloglink if e(sample), deviance hist resloglink, kdensity saving(glmres.gph, replace) qnorm resloglink, saving(glmresqnorm.gph, replace) graph combine glmres.gph glmresqnorm.gph, xsize(10) ysize(5) graph export glmg.png, replace sum resloglink if resloglink > 50000 qui glm lexp age i.female pcs i.race* i.eth_hisp, /// family(gaussian) link(log) vce(robust) nolog predict double resloglink1 if e(sample), deviance margins, dydx(*) hist resloglink1, kdensity saving(glmres1.gph, replace) qnorm resloglink1, saving(glmresqnorm1.gph, replace) graph combine glmres1.gph glmresqnorm1.gph, xsize(10) ysize(5) graph export glmg1.png, replace sum resloglink if resloglink > 50000 // --- Gamma glm exp_tot age i.female pcs i.race* i.eth_hisp, /// family(gamma) link(log) nolog di exp(_b[1.female]) margins, dydx(*) qui glm exp_tot age i.female pcs i.race* i.eth_hisp, /// family(gamma) link(log) nolog predict double resgammalog if e(sample), anscombe hist resgammalog, kdensity saving(glmg.gph, replace) qnorm resgammalog, saving(glmg1.gph, replace) graph combine glmg.gph glmg1.gph, xsize(10) ysize(5) graph export resgammalog.png, replace glm exp_tot age i.female pcs i.race* i.eth_hisp if exp_tot > 0 , /// family(gamma) link(log) nolog predict double resgammalog1 if e(sample), anscombe hist resgammalog1, kdensity saving(glmg1.gph, replace) qnorm resgammalog1, saving(glmg11.gph, replace) graph combine glmg1.gph glmg11.gph, xsize(10) ysize(5) graph export resgammalog1.png, replace glm exp_tot i.female##(c.age##c.age c.pcs i.race* i.eth_hisp) if exp_tot > 0, /// family(gamma) link(log) nolog capture drop resgammalog2 predict double resgammalog2 if e(sample), anscombe qnorm resgammalog2 glm exp_tot i.female##(c.age##c.age c.pcs i.race* i.eth_hisp) if exp_tot > 0, /// family(poisson) link(log) nolog capture drop resgammalog2 predict double resgammalog2 if e(sample), anscombe qnorm resgammalog2 // --- Meaning of coeffs * No covariates qui reg exp_tot if exp_tot > 0 margins qui glm exp_tot if exp_tot > 0, family(gamma) nolog margins qui glm exp_tot if exp_tot > 0, family(gamma) link(log) nolog margins * Only one dummy tabstat exp_tot if exp_tot >0, by(female) qui reg exp_tot i.female if exp_tot > 0 margins female qui glm exp_tot i.female if exp_tot > 0, family(gamma) /*link(log)*/ nolog margins female qui glm exp_tot i.female if exp_tot > 0, family(gamma) link(log) nolog margins female * Only one continous qui reg exp_tot age if exp_tot > 0 margins, dydx(age) qui glm exp_tot age if exp_tot > 0, family(gamma) /*link(log)*/ nolog margins, dydx(age) qui glm exp_tot age if exp_tot > 0, family(gamma) link(log) nolog margins, dydx(age) qui reg exp_tot age if exp_tot > 0 predict double yhatols if e(sampple) margins, dydx(age) at(age=53) qui glm exp_tot age if exp_tot > 0, family(gamma) link(log) nolog predict double yhatglmgamma if e(sample) margins, dydx(age) at(age=52) qui glm exp_tot age if exp_tot > 0, family(gamma) /*link(log)*/ nolog predict double yhatnolog if e(sample) margins, dydx(age) at(age=52) line yhatnolog age, sort line yhatols age, sort || line yhatglmgamma age, color(red) sort xline(52) /// legend(off) graph export g1.png, replace qui glm exp_tot age if exp_tot > 0, family(gamma) /*link(log)*/ nolog predict double yhatnolog if e(sample) line yhatnolog age, sort line yhatglmgamma age, color(red) sort /// legend(off) || line yhatnolog age, sort color(blue) graph export g2.png, replace // --- Explore nonlinearity with two variables qui reg exp_tot i.female age if exp_tot > 0 margins, dydx(*) qui glm exp_tot i.female age if exp_tot > 0, family(gamma) /*link(log)*/ nolog margins, dydx(*) qui glm exp_tot i.female age if exp_tot > 0, family(gamma) link(log) nolog margins, dydx(*) /// Compare models * Null qui glm exp_tot, family(gamma) link(log) est sto nullm * Log links qui glm exp_tot age i.female pcs i.race* i.eth_hisp, family(gamma) link(log) est sto glmgammalog qui glm exp_tot age i.female pcs i.race* i.eth_hisp, family(gaussian) link(log) est sto glmgaulog * Gamma with other link qui glm exp_tot age i.female pcs i.race* i.eth_hisp, family(gamma) link(power 0.5) est sto glmgammapower5 * Poisson qui glm exp_tot age i.female pcs i.race* i.eth_hisp, family(poisson) scale(x2) link(log) est sto glmpoisoonlog estimates stats nullm glmgammalog glmgaulog glmgammapower5 glmpoisoonlog /// --- Two-part models gen nonzero = 0 replace nonzero = 1 if exp_tot > 0 & exp_tot ~= . * First part qui logit nonzero age i.female pcs i.race* i.eth_hisp, nolog predict double pnonzero * Second part qui glm exp_tot age i.female pcs i.race* i.eth_hisp, family(gamma) link(log) predict double exphat gen tpmhat = pnonzero * exphat sum exp_tot tpmhat twopm exp_tot age i.female pcs i.race* i.eth_hisp, /// firstpart(logit, nolog) secondpart(glm, family(gamma) link(log) nolog) margins, dydx(*)