######################################
# Statistics for data science: R
#
# http://didawiki.di.unipi.it/doku.php/mds/sds/
######################################
# [CTRL-Enter] to execute a line

###### Script Lesson 20: Linear Regression and LSE

par(mar=c(4,4,1,1))

# load the janka dataset (wood density vs hardness)
library(SemiPar)
data(janka)
x = janka$dens
y = janka$hardness

# scatter plot: pch is the type of points, cex is their size
plot(x, y, xlab='dens', ylab='hardness', xlim=c(20, 80), ylim=c(0,3500), pch=16, cex=0.8)

# simple linear model
fit = lm(y~x) # linear formula y~x
fit
abline(fit, col='red', lwd=2) # regression line

# extract parameters
lm_pars = coef(fit)
lm_pars
intercept = lm_pars[1]
slope = lm_pars[2]
r_intercept = round(intercept, 4)
r_slope = round(slope, 4)
# display the fitted equation on the plot
mtext(bquote(y == .(r_intercept) + .(r_slope) * x), adj=1, padj=0, col='red')

slope
# LSE formula for the slope
cov(x, y)/var(x)

# Pearson's linear correlation is scale independent
cor(x, y)
# slope in terms of Pearson's linear correlation: beta = r * sd(y)/sd(x)
cor(x, y)*sd(y)/sd(x)

# for scaled (standardized) data: scale(v) = (v-mean(v))/sd(v)
scaledx = scale(x)
scaledy = scale(y)
plot(scaledx, scaledy, pch=16, cex=0.8)
fit2 = lm(scaledy~scaledx)
fit2
abline(fit2, col='red', lwd=2) # regression line
# for standardized data, beta = Pearson's correlation coefficient
cor(scaledx, scaledy)

# Given: y = alpha + beta*x
# Is it true that regressing x on y gives x = -alpha/beta + (1/beta)*y?
fit3 = lm(x~y)
fit3
1/r_slope
# answer: no!
# reason: the slope of x~y is cov(x, y)/var(y), not 1/beta
cov(x, y)/var(y)
r_slope*var(x)/var(y)

# residuals
fit_ys = fitted(fit) # same as alpha + beta*x
res = resid(fit)     # same as y - fit_ys
# fitted vs residuals - note the *heteroscedasticity* (spread grows with fitted values)
plot(fit_ys, res, xlab="fitted", ylab="residuals")

# summary info
summary(fit)

# standard errors, computed by hand
n = length(x)
s.hat = sqrt(sum(res^2)/(n-2))
cat('Residual std error', s.hat, 'on', (n-2), 'degrees of freedom\n')
xbar = mean(x)
SXX = sum((x-xbar)^2) # or var(x)*(n-1)
se.beta.hat = s.hat/sqrt(SXX)
cat('Beta std error', se.beta.hat, '\n')
se.alpha.hat = s.hat*sqrt(1/n + xbar^2/SXX)
cat('Alpha std error', se.alpha.hat, '\n')

# compare with the standard errors reported by summary()
summary(fit)

# coefficient of determination
r2 = 1-var(res)/var(y)
cat('R-squared', r2, '\n')
# alternative definition: fraction of variance explained
r2 = var(fit_ys)/var(y)
cat('R-squared', r2, '\n')
# for simple (one independent variable) linear regression,
# R^2 = squared Pearson correlation between data and fitted values
r2 = cor(y, fit_ys)^2
cat('R-squared', r2, '\n')
# adjusted R^2 uses unbiased estimators of the variances
k = 2 # number of parameters (can be > 2 for multiple regression)
adj_r2 = 1-var(res)/var(y)*(n-1)/(n-k)
cat('Adjusted R-squared', adj_r2, '\n')

# diagnostic plots, see https://data.library.virginia.edu/diagnostic-plots/
plot(fit)

# Attention! Anscombe's Quartet: four datasets with (nearly) the same
# regression line but very different structure
data(anscombe)
View(anscombe)
attach(anscombe)
plot(x1, y1); fit1 = lm(y1~x1); abline(fit1, col='red', lwd=2)
plot(x2, y2); fit2 = lm(y2~x2); abline(fit2, col='red', lwd=2)
plot(x3, y3); fit3 = lm(y3~x3); abline(fit3, col='red', lwd=2)
plot(x4, y4); fit4 = lm(y4~x4); abline(fit4, col='red', lwd=2)
coef(fit1)
coef(fit2)
coef(fit3)
coef(fit4)

# study the residuals!
plot(fitted(fit1), resid(fit1), xlab="fitted", ylab="residuals")
plot(fitted(fit2), resid(fit2), xlab="fitted", ylab="residuals")
plot(fitted(fit3), resid(fit3), xlab="fitted", ylab="residuals")
plot(fitted(fit4), resid(fit4), xlab="fitted", ylab="residuals")

# leverage measures distance from the mean in the independent (predictor)
# variable space
hatvalues(fit4)
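
# --- Extra: LSE via the normal equations ------------------------------
# A sketch, not part of the original lesson: the least-squares estimate
# solves the normal equations (X'X) beta = X'y, where X is the design
# matrix with a column of ones. This generalizes the cov/var formula
# used above to any number of predictors.
X = cbind(1, x)                        # design matrix (same as model.matrix(fit))
beta.hat = solve(t(X) %*% X, t(X) %*% y)
beta.hat                               # should match coef(fit)
coef(fit)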
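
# --- Extra: confidence intervals for the coefficients ------------------
# A sketch, not part of the original lesson: under the usual normal-error
# assumptions, a 95% CI for each coefficient is estimate +/- t * std.error,
# with the t quantile on n-2 degrees of freedom. The hand computation below
# reuses se.alpha.hat and se.beta.hat from above; confint() is the built-in.
t.crit = qt(0.975, df=n-2)
cat('95% CI for alpha:', intercept - t.crit*se.alpha.hat,
    intercept + t.crit*se.alpha.hat, '\n')
cat('95% CI for beta :', slope - t.crit*se.beta.hat,
    slope + t.crit*se.beta.hat, '\n')
confint(fit, level=0.95)               # built-in equivalent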
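
# --- Extra: confidence vs prediction intervals -------------------------
# A sketch, not part of the original lesson: predict() with
# interval="confidence" bounds the mean response, while
# interval="prediction" bounds a single new observation (wider).
# The grid of densities below is an arbitrary illustrative choice.
newx = data.frame(x = seq(20, 80, by=1))
conf.int = predict(fit, newdata=newx, interval="confidence")
pred.int = predict(fit, newdata=newx, interval="prediction")
plot(x, y, xlab='dens', ylab='hardness', pch=16, cex=0.8)
abline(fit, col='red', lwd=2)
lines(newx$x, conf.int[,'lwr'], col='blue', lty=2)
lines(newx$x, conf.int[,'upr'], col='blue', lty=2)
lines(newx$x, pred.int[,'lwr'], col='darkgreen', lty=3)
lines(newx$x, pred.int[,'upr'], col='darkgreen', lty=3)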
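
# --- Extra: a common response to heteroscedasticity --------------------
# A sketch, not part of the original lesson: one common remedy for the
# funnel-shaped residuals seen above is transforming the response, e.g.
# fitting log(hardness). Whether this transform is appropriate for the
# janka data is left as an exercise; the code only shows the mechanics.
fit.log = lm(log(y) ~ x)
plot(fitted(fit.log), resid(fit.log), xlab="fitted", ylab="residuals")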
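
# --- Extra: flagging high-leverage points ------------------------------
# A sketch, not part of the original lesson: since the hat values sum to
# the number of parameters k, their average is k/n, and a common rule of
# thumb flags points with leverage above twice that average.
# cooks.distance() combines leverage with residual size.
lev = hatvalues(fit4)
which(lev > 2*2/length(x4))            # threshold 2*k/n with k = 2 parameters
cooks.distance(fit4)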