/* HSR Method I - Week 3 Review of regression */ version 16 cd "H:\Teaching\Methods 2020\lectures\Week 3 Regression review\code" log using "W3 regression.txt", text replace /// --- Data use https://www.stata-press.com/data/r16/cattaneo2, clear desc bweight lbweight mbsmoke msmoke mage medu mrace fbaby tabstat bweight, by(mbsmoke) stats(N mean sd min max) /// --- Birthweight distributes normal, as many biological/physical metrics do set scheme s1mono, permanently hist bweight, kdensity saving(w_all.gph, replace) title("All") hist bweight if mbsmoke == 1, kdensity saving(w_smok.gph, replace) title("Smoker") hist bweight if mbsmoke == 0, kdensity saving(w_nonsmok.gph, replace) title("Non-smoker") graph combine w_all.gph w_smok.gph w_nonsmok.gph, col(1) xcommon ysize(10) graph export wg_graph.png, replace graph combine w_all.gph w_smok.gph w_nonsmok.gph, row(1) ysize(4) xsize(12) graph export wg_graph1.png, replace /// --- Exaggerate the difference clonevar bweight_fake = bweight replace bweight_fake = bweight + 1000 if mbsmoke == 0 replace bweight_fake = bweight - 500 if mbsmoke == 1 hist bweight_fake, kdensity percent saving(hist_bwfake, replace) graph export bw_fake.png, replace * The linear/OLS is still a good model because normality is conditional on convariates reg bweight_fake mbsmoke predict res_std, rstandard hist res_std, kdensity saving(rno.gph, replace) qnorm res_std, saving(qno.gph, replace) graph combine rno.gph qno.gph, row(1) graph export nor.png, replace /// ---- Meaning of R^2 reg bweight_fake mbsmoke capture drop yhat predict yhat corr yhat bweight_fake di 0.7721 ^2 * .59613841 * could have done return list di r(rho)^2 /// ---- Asymptotics qui reg bweight mbsmoke mage medu matrix list e(b) matrix list e(V) di sqrt(e(V)[2,2]) * Save scalar bsmoke = e(b)[1,1] scalar bsmoke_se = sqrt(e(V)[1,1]) di bsmoke " " bsmoke_se /// -- Centering sum mage gen mage_c = mage - 35 qui reg bweight i.msmoke##c.mage est sto m1 qui reg bweight i.msmoke##c.mage_c est sto m2 est table m1 m2 /// ---- Wald tests * Smoking categories desc msmoke label list smoke2 tab msmoke reg bweight i.msmoke mage medu test medu *test medu =0 test medu = 1 * Joint: test if effect of 6-10 cigs is same as 11+ daily test 2.msmoke= 3.msmoke * Can't do: test medu > 12 ** More on tests qui reg bweight i.mbsmoke##c.mage medu * to remember how to name the coefficient use the code below matrix list e(b) test mage 1.mbsmoke#c.mage * syntax can be confusing test mage = 1.mbsmoke#c.mage =0 * Same test comparing between two models qui reg bweight i.mbsmoke##c.mage medu scalar sse_f = e(rss) qui reg bweight i.mbsmoke medu scalar sse_r = e(rss) di ((sse_r - sse_f)/2)/(sse_f/(4642-4-1)) * asymptotically equivalent to likelihood ratio tests comparing two models qui reg bweight i.mbsmoke##c.mage medu est sto mf qui reg bweight i.mbsmoke medu est sto mr lrtest mf mr /// --- ANOVA reg bweight i.msmoke test (1.msmoke=0) (2.msmoke=0) (3.msmoke=0) * anova command anova bweight i.msmoke * type help contrast contrast a.msmoke /// t-tests and regression reg bweight i.mbsmoke ttest bweight, by(mbsmoke) /// --- Confidence interval reg bweight i.mbsmoke mage medu test 1.mbsmoke = -292.2675 test 1.mbsmoke = -206.7613 test 1.mbsmoke = -250 test 1.mbsmoke = -300 * We are saying that the coefficient of mbsmoke distributes normal with * mean -249.5144 and standard deviation (the standard error) of 21.80749 * The Wald test of the null beta=0 distributes t-student * The estimated CI is [-292.2675, -206.7613] * Let's simulate the standardized coefficient for smoke preserve clear set seed 1234567066 set obs 10000 * The rt function generates t-student distribution with mean zero and sd 1 gen zt = rt(4642-4) * Make it have a mean of bsmoke and sd of bsmoke_se gen beta_sm_sd = bsmoke_se*zt + bsmoke * Get the 2,5 and 97.5 percentile _pctile beta_sm_sd, p(2.5) local ci_lb=r(r1) _pctile beta_sm_sd, p(97.5) local ci_ub=r(r1) di "[" `ci_lb' ", " `ci_ub' "]" * Another way to get the percentile *centile beta_sm_sd, centile(2.5(5)97.5) * How many times are the simulated coefficients within the confidence interval? qui count if beta_sm_sd >= -292.2675 & beta_sm_sd <= -206.7613 di r(N)/10000 * More nifty. What is the probability that the difference in birthweight between * smokers and non-smokers is greater than -300 grams? qui count if beta_sm_sd >= -310 di r(N)/10000 restore /// --- Linear probability model sum lbweight mage reg lbweight mage, robust predict yhat_ols line yhat_ols mage, color(red) saving(pred_ols.gph, replace) * Compare to logistic or probit logit lbweight mage, nolog predict yhat_logit line yhat_logit mage, sort color(blue) saving(pred_logit.gph, replace) margins, dydx(mage) probit lbweight mage, nolog margins, dydx(mage) * Intuition graph combine pred_ols.gph pred_logit.gph, row(1) graph export compare_preds.png, replace * Compare to quadratic specification reg lbweight c.mage##c.mage, robust margins, dydx(c.mage) predict yhat_quad line yhat_quad mage, sort * not great /// --- A nice way to explore the shape of a 0/1 variable in relation to another var * Lowess is a smoother method, semiparametric lowess lbweight mage, gen(y_smooth) * make it pretty scatter lbweight mage, jitter(2) msize(vsmall) legend(off) /// || line y_smooth mage, sort color(red) graph export low.png, replace * There is an option to transform into logits: * Logits are log-odds: log(p/(1-p)) lowess lbweight mage, logit saving(y_smooth_l.gph, replace) * That's the smoothed version of this logit lbweight mage predict yhat_logit1, xb line yhat_logit1 mage, sort saving(logit.gph, replace) graph combine y_smooth_l.gph logit.gph, col(1) xcommon ysize(7) /// --- Lowess is easier to undertand, but nonparametric methods now have their own * command: npregress, using kernel (just a weighting function) or series * it can be slow with a lot of observations npregress kernel lbweight mage npgraph * Or make your own graphs npregress kernel lbweight mage, predict(y_kernel deriv) sum y_kernel deriv * note this matches the output scatter lbweight mage, jitter(3) || line y_kernel mage, sort color(red) line y_kernel mage, sort color(red) graph export ykernel.png, replace /// --- Interactions and stratification * Run stratified models and save coefficients qui reg lbweight mage if mbsmoke == 1, robust local m1_alpha1 = _b[mage] qui reg lbweight mage if mbsmoke == 0, robust local m1_gamma1 = _b[mage] * Difference di `m1_alpha1' - `m1_gamma1' *.006432 * Should match interaction term qui reg lbweight c.mage##i.mbsmoke, robust di _b[1.mbsmoke#c.mage] *.006432 log close