A machine learning approach to standard linear regression

An introduction to linear regression with gradient descent

This tutorial is a rough introduction to using gradient descent algorithms to estimate the parameters (slope and intercept) of a standard linear regression, as an alternative to ordinary least squares (OLS) regression, whose estimates coincide with maximum likelihood under normally distributed errors. To begin, I simulate data and perform a standard OLS regression using sums of squares. Once that approach is explained, I demonstrate how simple it is to substitute gradient descent in its place and how to interpret the results.

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1

## Warning: package 'readr' was built under R version 4.1.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --

Ordinary Least Squares Regression

Simulate data

Generate random data in which y is a noisy function of x

# Fix the RNG state so the simulation is reproducible.
set.seed(123)

# Draw 1000 predictor values uniformly on [-5, 5], then build a noisy
# linear response with true slope 1 and true intercept 3.
n_obs <- 1000
x <- runif(n_obs, min = -5, max = 5)
y <- x + rnorm(n_obs) + 3

Fit a linear model

# Ordinary Least Squares regression with the General Linear Model.
# Use a name other than `lm` so we do not shadow stats::lm(), and skip the
# explicit print() (the original `mod <- print(lm)` printed the model here
# AND again when `mod` was echoed below, duplicating the output).
fit <- lm(y ~ x)
mod <- fit

##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept)            x
##      3.0118       0.9942

mod  # auto-print the fitted model: the call and its two coefficients

##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept)            x
##      3.0118       0.9942

Plot the data and the model

### Plot the simulated data with the fitted regression line.
plot(x, y, col = "grey80", main = "Regression using lm()",
     xlim = c(-2, 5), ylim = c(0, 10))
### mod$coefficients is the vector c(intercept, slope); index it so each
### label shows a single rounded number rather than the whole vector.
text(0, 8, paste0("Intercept = ", round(mod$coefficients[1], 2)))
text(4, 2, paste0("Slope = ", round(mod$coefficients[2], 2)))
abline(v = 0, col = "grey80")  # vertical guide line at x = 0 (the y-intercept)
### Use the intercept (a) and slope (b) separately to draw the best-fit line;
### the original passed the full coefficient vector to both arguments.
abline(a = mod$coefficients[1], b = mod$coefficients[2], col = "blue", lwd = 2)

### Calculate intercept and slope using sums of squares.
### NOTE(review): `df` is never created in this document — presumably a
### body-size dataset loaded in an earlier, unshown step; confirm before running.
x <- df$Body_size_mm              # independent variable
y <- df$Size_at_maturity_max_mm   # dependent variable
x_bar <- mean(x)  # mean of the independent variable
y_bar <- mean(y)  # mean of the dependent variable

# OLS slope: the sum of cross-deviations of x and y divided by the
# sum of squared deviations of x (i.e. cov(x, y) / var(x) in raw form).
x_dev <- x - x_bar
y_dev <- y - y_bar
slope <- sum(x_dev * y_dev) / sum(x_dev^2)
slope

##  0.7264703

# The OLS line always passes through (x_bar, y_bar), so the intercept
# is the mean response minus the slope applied to the mean predictor.
intercept <- y_bar - slope * x_bar
intercept

##  0.6237047

### Plot the data and overlay the line built from the manually calculated
### slope and intercept.
plot(x, y,
     col  = "grey80",
     main = "Linear Regression using Ordinary Least Squares",
     xlab = "Body size (mm)",
     ylab = "Max size at maturity (mm)")
abline(a = intercept, b = slope, col = "blue", lwd = 2)

Squared error cost function (a way to quantify how wrong a given guess of the slope and intercept is)

### Learning rate and iteration limit.
alpha <- 0.001
num_iters <- 1000

### Squared-error cost: J(theta) = sum((X theta - y)^2) / (2m).
### The original document called cost() in the loop but never defined it;
### this is the standard mean-squared-error form for linear regression.
cost <- function(X, y, theta) {
  m <- length(y)
  sum((X %*% theta - y)^2) / (2 * m)
}

### Keep a history of the cost and the coefficients at every iteration.
cost_history <- double(num_iters)
### Preallocate a list of length num_iters (list(num_iters) made a
### one-element list containing the number 1000, not an empty history).
theta_history <- vector("list", num_iters)

### Initialize the coefficients (intercept, slope) at zero.
theta <- matrix(c(0, 0), nrow = 2)

### Design matrix: a column of 1's for the intercept plus the predictor.
X <- cbind(1, matrix(x))

### Batch gradient descent.
for (i in seq_len(num_iters)) {
  error <- X %*% theta - y             # residuals under the current theta
  delta <- t(X) %*% error / length(y)  # gradient of the cost w.r.t. theta
  theta <- theta - alpha * delta       # step downhill by the learning rate
  cost_history[i] <- cost(X, y, theta)
  theta_history[[i]] <- theta
}

print(theta)

print(theta)

##           [,1]
## [1,] 0.1816407
## [2,] 0.8175962

Plot data and converging fit

### Plot the raw data, then overlay the fitted line at a quadratically
### spaced sample of iterations (red) to show gradient descent converging
### toward the final estimate (blue).
plot(x, y, col = "grey80",
     main = 'Linear regression using Gradient Descent',
     xlab = "Body size (mm)",
     ylab = "Max size at maturity (mm)")
snapshot_iters <- c(seq_len(31)^2, 1000)
for (iter in snapshot_iters) {
  abline(coef = theta_history[[iter]], col = "red")
}
abline(coef = theta, col = "blue", lwd = 2)