/*
   HSR Method I - Week 3 Review of regression
      
*/ 

version 16

cd "H:\Teaching\Methods 2020\lectures\Week 3 Regression review\code"
log using "W3 regression.txt", text replace 


/// --- Data 
use https://www.stata-press.com/data/r16/cattaneo2, clear

desc bweight lbweight mbsmoke msmoke mage medu mrace fbaby    
tabstat bweight, by(mbsmoke) stats(N mean sd min max)

/// --- Birthweight distributes normal, as many biological/physical metrics do 
set scheme s1mono, permanently 
hist bweight, kdensity saving(w_all.gph, replace) title("All")
hist bweight if mbsmoke == 1, kdensity saving(w_smok.gph, replace) title("Smoker")
hist bweight if mbsmoke == 0, kdensity saving(w_nonsmok.gph, replace) title("Non-smoker")
graph combine w_all.gph w_smok.gph w_nonsmok.gph, col(1) xcommon ysize(10)
graph export wg_graph.png, replace

graph combine w_all.gph w_smok.gph w_nonsmok.gph, row(1) ysize(4) xsize(12)
graph export wg_graph1.png, replace


/// --- Exaggerate the difference 
clonevar bweight_fake = bweight
replace bweight_fake = bweight + 1000 if mbsmoke == 0
replace bweight_fake = bweight - 500 if mbsmoke == 1
hist bweight_fake, kdensity percent saving(hist_bwfake, replace)
graph export bw_fake.png, replace


* The linear/OLS is still a good model because normality is conditional on convariates
reg bweight_fake mbsmoke
predict res_std, rstandard

hist res_std, kdensity saving(rno.gph, replace)
qnorm res_std, saving(qno.gph, replace)
graph combine rno.gph qno.gph, row(1)
graph export nor.png, replace

/// ---- Meaning of R^2
reg bweight_fake mbsmoke
capture drop yhat 
predict yhat
corr yhat bweight_fake
di 0.7721 ^2
*  .59613841
* could have done 
return list 
di r(rho)^2

/// ---- Asymptotics 
qui reg bweight mbsmoke mage medu
matrix list e(b)
matrix list e(V)
di sqrt(e(V)[2,2])

* Save 
scalar bsmoke = e(b)[1,1]
scalar bsmoke_se = sqrt(e(V)[1,1])
di bsmoke "   " bsmoke_se


/// -- Centering 
sum mage 
gen mage_c = mage - 35

qui reg bweight i.msmoke##c.mage 
est sto m1 
qui reg bweight i.msmoke##c.mage_c 
est sto m2
est table m1 m2 



/// ---- Wald tests 

* Smoking categories 
desc msmoke 
label list smoke2 
tab msmoke


reg bweight i.msmoke mage medu 

test medu
*test medu =0  
test medu = 1
* Joint: test if effect of 6-10 cigs is same as 11+ daily 
test 2.msmoke= 3.msmoke
* Can't do: test medu > 12 

** More on tests 
qui reg bweight i.mbsmoke##c.mage medu 
* to remember how to name the coefficient use the code below
matrix list e(b)
test mage 1.mbsmoke#c.mage 
* syntax can be confusing
test mage = 1.mbsmoke#c.mage =0 

* Same test comparing between two models
qui reg bweight i.mbsmoke##c.mage medu 
scalar sse_f = e(rss)
qui reg bweight i.mbsmoke medu 
scalar sse_r = e(rss)
di ((sse_r - sse_f)/2)/(sse_f/(4642-4-1))

      
* asymptotically equivalent to likelihood ratio tests comparing two models
qui reg bweight i.mbsmoke##c.mage medu 
est sto mf
qui reg bweight i.mbsmoke medu 
est sto mr
lrtest mf mr
 

/// --- ANOVA 
reg bweight i.msmoke 
test (1.msmoke=0) (2.msmoke=0) (3.msmoke=0)


* anova command
anova bweight i.msmoke
* type help contrast 
contrast a.msmoke

/// t-tests and regression 
reg bweight i.mbsmoke
ttest bweight, by(mbsmoke)


/// --- Confidence interval 
reg bweight i.mbsmoke mage medu 
test 1.mbsmoke = -292.2675
test 1.mbsmoke = -206.7613
test 1.mbsmoke = -250
test 1.mbsmoke = -300


* We are saying that the coefficient of mbsmoke distributes normal with 
* mean -249.5144 and standard deviation (the standard error) of 21.80749 
* The Wald test of the null beta=0 distributes t-student
* The estimated CI is [-292.2675, -206.7613]

* Let's simulate the standardized coefficient for smoke  
preserve 
   clear
   set seed 1234567066
   set obs 10000
   * The rt function generates t-student distribution with mean zero and sd 1
   gen zt = rt(4642-4)
   * Make it have a mean of bsmoke and sd of bsmoke_se
   gen beta_sm_sd = bsmoke_se*zt + bsmoke
   * Get the 2,5 and 97.5 percentile   
   _pctile beta_sm_sd, p(2.5)
   local ci_lb=r(r1)
   _pctile beta_sm_sd, p(97.5)
   local ci_ub=r(r1)
   di "["  `ci_lb' ", " `ci_ub' "]"
   * Another way to get the percentile
   *centile beta_sm_sd, centile(2.5(5)97.5)
   * How many times are the simulated coefficients within the confidence interval?
   qui count if beta_sm_sd >= -292.2675 & beta_sm_sd <= -206.7613
   di r(N)/10000
   * More nifty. What is the probability that the difference in birthweight between 
   * smokers and non-smokers is greater than -300 grams? 
   qui count if beta_sm_sd >= -310
   di r(N)/10000
restore 


/// --- Linear probability model 
sum lbweight mage
reg lbweight mage, robust
predict yhat_ols
line yhat_ols mage, color(red) saving(pred_ols.gph, replace) 

* Compare to logistic or probit
logit lbweight mage, nolog
predict yhat_logit
line yhat_logit mage, sort color(blue) saving(pred_logit.gph, replace) 
margins, dydx(mage)

probit lbweight mage, nolog
margins, dydx(mage)

* Intuition 
graph combine pred_ols.gph pred_logit.gph, row(1)
graph export compare_preds.png, replace


* Compare to quadratic specification 
reg lbweight c.mage##c.mage, robust
margins, dydx(c.mage)
predict yhat_quad

line yhat_quad mage, sort
* not great 

/// --- A nice way to explore the shape of a 0/1 variable in relation to another var
* Lowess is a smoother method, semiparametric
lowess lbweight mage, gen(y_smooth)

* make it pretty 
scatter lbweight mage, jitter(2) msize(vsmall) legend(off) ///
        || line y_smooth mage, sort color(red) 
graph export low.png, replace

* There is an option to transform into logits: 
* Logits are log-odds: log(p/(1-p))
lowess lbweight mage, logit  saving(y_smooth_l.gph, replace)

* That's the smoothed version of this 
logit lbweight mage
predict yhat_logit1, xb 
line yhat_logit1 mage, sort saving(logit.gph, replace)

graph combine y_smooth_l.gph logit.gph, col(1) xcommon ysize(7)


/// --- Lowess is easier to undertand, but nonparametric methods now have their own 
* command: npregress, using kernel (just a weighting function) or series
* it can be slow with a lot of observations 
npregress kernel lbweight mage
npgraph

* Or make your own graphs 
npregress kernel lbweight mage, predict(y_kernel deriv)
sum y_kernel deriv
* note this matches the output 
scatter lbweight mage, jitter(3) || line y_kernel mage, sort color(red)

line y_kernel mage, sort color(red)
graph export ykernel.png, replace


/// --- Interactions and stratification 
* Run stratified models and save coefficients
qui reg lbweight mage if mbsmoke == 1, robust
local m1_alpha1 = _b[mage]
qui reg lbweight mage if mbsmoke == 0, robust
local m1_gamma1 = _b[mage]

* Difference 
di `m1_alpha1' - `m1_gamma1'
*.006432

* Should match interaction term
qui reg lbweight c.mage##i.mbsmoke, robust

di _b[1.mbsmoke#c.mage]
*.006432


log close