Box-Cox Transformation and QQ-Plot

library(MASS)
## Warning: package 'MASS' was built under R version 4.0.3
#create data

y=c(1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 6, 7, 8)
x=c(7, 7, 8, 3, 2, 4, 4, 6, 6, 7, 5, 3, 3, 5, 8)

#fit linear regression model
model <- lm(y~x)

#find optimal lambda for Box-Cox transformation 
bc <- boxcox(y ~ x)

lambda <- bc$x[which.max(bc$y)]

lambda
## [1] -0.4242424
#fit new linear regression model using the Box-Cox transformation
new_model <- lm(((y^lambda-1)/lambda) ~ x)

#define plotting area
op <- par(pty = "s", mfrow = c(1, 2))

#Q-Q plot for the residuals of the original model
qqnorm(model$residuals)
qqline(model$residuals)

#Q-Q plot for Box-Cox transformed model
qqnorm(new_model$residuals)
qqline(new_model$residuals)

#display both Q-Q plots
par(op)

Residuals plot

load("EXEXSAL2.Rdata")

names(EXEXSAL2)
##  [1] "ID"  "Y"   "X1"  "X2"  "X3"  "X4"  "X5"  "X6"  "X7"  "X8"  "X9"  "X10"
EXEXSAL2 <-EXEXSAL2[,-1]

names(EXEXSAL2)
##  [1] "Y"   "X1"  "X2"  "X3"  "X4"  "X5"  "X6"  "X7"  "X8"  "X9"  "X10"
reg_full<-lm(Y~., data=EXEXSAL2)

plot(reg_full)

Example: mtcars dataset

The dataset describes the attibutes of various cars and how these relate to the dependent variable mpg i.e. how to things like weight, no of cylinders and no of gears affect miles per gallon (mpg). For this example we will use mpg (mpg) vs weight (wt).

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.3
data("mtcars"); head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# Fitting the Regression Line and its Residuals 

d <- mtcars
fit <- lm(mpg ~ wt, data = d) # fit the model
d$predicted <- predict(fit)   # Save the predicted values
d$residuals <- residuals(fit) # Save the residual values
ggplot(d, aes(x = wt, y = mpg)) +
  geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +     # regression line  
  geom_segment(aes(xend = wt, yend = predicted), alpha = .2) +      # draw line from point to line
  geom_point(aes(color = abs(residuals), size = abs(residuals))) +  # size of the points
  scale_color_continuous(low = "green", high = "red") +             # colour of the points mapped to residual size - green smaller, red larger
  guides(color = FALSE, size = FALSE) +                             # Size legend removed
  geom_point(aes(y = predicted), shape = 1) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

  summary(fit) 
## 
## Call:
## lm(formula = mpg ~ wt, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5432 -2.3647 -0.1252  1.4096  6.8727 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.2851     1.8776  19.858  < 2e-16 ***
## wt           -5.3445     0.5591  -9.559 1.29e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared:  0.7528, Adjusted R-squared:  0.7446 
## F-statistic: 91.38 on 1 and 30 DF,  p-value: 1.294e-10
  # Residuals vs Fitted Plot
  
  plot(fit, which=1, col=c("blue")) # Residuals vs Fitted Plot

  plot(fit, which=2, col=c("red"))  # Q-Q Plot

  plot(fit, which=3, col=c("blue"))  # Scale-Location Plot

  plot(fit, which=5, col=c("blue"))  # Residuals vs Leverage