# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(ggplot2)
# Define column names
column_names <- c(
"symboling", "normalized_losses", "make", "fuel_type", "aspiration",
"num_doors", "body_style", "drive_wheels", "engine_location",
"wheel_base", "length", "width", "height", "curb_weight",
"engine_type", "num_cylinders", "engine_size", "fuel_system",
"bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
"city_mpg", "highway_mpg", "price"
)
# Load the dataset using from the UCI URL
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
auto_data <- read_csv(url, col_names = column_names, na = "?")
## Rows: 205 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): make, fuel_type, aspiration, num_doors, body_style, drive_wheels, ...
## dbl (16): symboling, normalized_losses, wheel_base, length, width, height, c...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert appropriate columns to numeric
numeric_cols <- c("normalized_losses", "bore", "stroke", "horsepower",
"peak_rpm", "price")
auto_data[numeric_cols] <- lapply(auto_data[numeric_cols], as.numeric)
# Drop rows with missing price
auto_data_clean <- auto_data %>% drop_na(price)
# Convert categorical variables to factors
auto_data_clean <- auto_data_clean %>%
mutate(across(c(make, fuel_type, aspiration, num_doors, body_style,
drive_wheels, engine_location, engine_type,
num_cylinders, fuel_system), as.factor))
hist(auto_data_clean$price,
breaks = 20,
col = "skyblue",
border = "white",
main = "Distribution of Car Prices",
xlab = "Price (USD)",
ylab = "Frequency")
boxplot(auto_data_clean$price,
main = "Boxplot of Vehicle Prices",
ylab = "Price (USD)",
col = "skyblue",
border = "black",
horizontal = FALSE,
cex.main = 1.2,
cex.lab = 1.1)
# Select numeric columns
numeric_data <- auto_data_clean %>% select(where(is.numeric))
# Calculate correlations with 'price' only
cor_with_price <- cor(numeric_data, use = "complete.obs")[, "price"]
# View correlations (excluding price-to-price which is always 1)
cor_with_price <- cor_with_price[names(cor_with_price) != "price"]
# Display result
print(cor_with_price)
## symboling normalized_losses wheel_base length
## -0.1633293 0.1999239 0.7347888 0.7603228
## width height curb_weight engine_size
## 0.8433157 0.2475002 0.8938095 0.8417248
## bore stroke compression_ratio horsepower
## 0.5348908 0.1587982 0.2109484 0.7585823
## peak_rpm city_mpg highway_mpg
## -0.1739701 -0.6900998 -0.7183144
# Separate numeric and categorical predictors
numeric_vars <- auto_data_clean %>%
select(where(is.numeric)) %>%
select(-price) %>%
names()
categorical_vars <- auto_data_clean %>%
select(where(is.factor)) %>%
names()
# Loop over each numeric variable to create scatter plots
for (var in numeric_vars) {
p <- ggplot(auto_data_clean, aes_string(x = var, y = "price")) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = paste("Scatter Plot: Price vs", var),
x = var, y = "Price") +
theme_minimal()
print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 37 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 37 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Ensure categorical variables are treated as factors
auto_data_clean <- auto_data_clean %>%
mutate(across(where(is.character), as.factor))
# Define quantitative variables to include
quant_vars <- c("wheel_base", "length", "width", "curb_weight",
"engine_size", "bore", "horsepower", "city_mpg",
"highway_mpg", "peak_rpm")
# Identify qualitative variables (factors)
qual_vars <- auto_data_clean %>%
select(where(is.factor)) %>%
names()
# Identify qualitative variables (factors) and remove 'make' and 'fuel system'
qual_vars <- auto_data_clean %>%
select(where(is.factor)) %>%
names() %>%
setdiff(c("make", "fuel_system")) # exclude 'make'
# Build formula as a string
formula_str <- paste("price ~", paste(c(quant_vars, qual_vars), collapse = " + "))
# Fit the linear model
reg_model <- lm(as.formula(formula_str), data = auto_data_clean)
# View summary of the model
summary(reg_model)
##
## Call:
## lm(formula = as.formula(formula_str), data = auto_data_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6256.9 -1057.8 -3.2 1303.3 11243.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.378e+04 1.431e+04 -3.059 0.002599 **
## wheel_base 1.121e+02 9.076e+01 1.235 0.218547
## length -1.264e+01 5.323e+01 -0.237 0.812630
## width 5.686e+02 2.554e+02 2.227 0.027333 *
## curb_weight 3.909e+00 1.822e+00 2.145 0.033401 *
## engine_size 5.805e+01 2.310e+01 2.513 0.012948 *
## bore -1.027e+03 1.702e+03 -0.603 0.547204
## horsepower 2.173e+01 2.286e+01 0.951 0.343198
## city_mpg -2.821e+02 1.566e+02 -1.802 0.073455 .
## highway_mpg 3.188e+02 1.463e+02 2.179 0.030731 *
## peak_rpm 1.764e+00 6.108e-01 2.889 0.004391 **
## fuel_typegas -1.062e+03 1.174e+03 -0.904 0.367167
## aspirationturbo 8.668e+02 8.759e+02 0.990 0.323821
## num_doorstwo 3.239e+02 6.058e+02 0.535 0.593611
## body_stylehardtop -4.278e+03 1.473e+03 -2.905 0.004186 **
## body_stylehatchback -4.111e+03 1.289e+03 -3.190 0.001709 **
## body_stylesedan -3.220e+03 1.429e+03 -2.254 0.025517 *
## body_stylewagon -4.277e+03 1.544e+03 -2.770 0.006255 **
## drive_wheelsfwd -1.243e+03 1.168e+03 -1.064 0.288737
## drive_wheelsrwd 5.084e+02 1.281e+03 0.397 0.691954
## engine_locationrear 7.521e+03 2.560e+03 2.937 0.003789 **
## engine_typel 1.261e+02 1.485e+03 0.085 0.932431
## engine_typeohc 2.964e+03 9.516e+02 3.115 0.002175 **
## engine_typeohcf 3.712e+03 1.543e+03 2.406 0.017233 *
## engine_typeohcv -4.544e+03 1.315e+03 -3.455 0.000703 ***
## num_cylindersfive -1.352e+04 2.821e+03 -4.793 3.68e-06 ***
## num_cylindersfour -1.584e+04 3.180e+03 -4.981 1.60e-06 ***
## num_cylinderssix -1.104e+04 2.258e+03 -4.887 2.43e-06 ***
## num_cylindersthree -7.567e+03 4.814e+03 -1.572 0.117915
## num_cylinderstwelve -8.380e+03 3.391e+03 -2.471 0.014488 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2410 on 163 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.9246, Adjusted R-squared: 0.9112
## F-statistic: 68.94 on 29 and 163 DF, p-value: < 2.2e-16
# Residual Standard Error (RSE) from the model summary
rse <- summary(reg_model)$sigma
# Mean of the actual response variable
mean_price <- mean(auto_data_clean$price, na.rm = TRUE)
# Coefficient of Variation (as a percentage)
cv <- (rse / mean_price) * 100
# Print the result
cat("Coefficient of Variation (CV):", round(cv, 2), "%\n")
## Coefficient of Variation (CV): 18.25 %
plot(reg_model, 1)
plot(reg_model, 2)
## Warning: not plotting observations with leverage one:
## 18, 46
plot(reg_model, 3)
## Warning: not plotting observations with leverage one:
## 18, 46
plot(reg_model, 4)
plot(reg_model, 5)
## Warning: not plotting observations with leverage one:
## 18, 46
plot(reg_model, 6)
## Warning: not plotting observations with leverage one:
## 18, 46
par(mfrow = c(2, 3))
plot(reg_model, 1:6)
## Warning: not plotting observations with leverage one:
## 18, 46
## Warning: not plotting observations with leverage one:
## 18, 46
# Create the same dataset used to fit the model
model_data <- model.frame(reg_model)
# Add predictions to this dataset
model_data$predicted_price <- predict(reg_model)
# Plot: Actual vs Predicted Prices
library(ggplot2)
ggplot(model_data, aes(x = predicted_price, y = price)) +
geom_point(color = "steelblue", alpha = 0.7) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(title = "Actual vs Predicted Prices",
x = "Predicted Price",
y = "Actual Price") +
theme_minimal()