The dataset contains a collection of clinical and biometric features used to predict the presence or absence of heart disease in patients. The primary goal of the dataset is to model and classify heart disease risk.
| Feature Name | Data Type | Explanation |
|---|---|---|
| Age | Numeric (Integer) | The age of the patient (in years). |
| Sex | Categorical | The sex of the patient (M: Male, F: Female). |
| ChestPainType | Categorical | The type of chest pain experienced by the patient (ATA, NAP, ASY, TA). |
| RestingBP | Numeric (Integer) | Resting blood pressure (in mm Hg). |
| Cholesterol | Numeric (Integer) | Serum cholesterol level (in mg/dl). |
| FastingBS | Binary | Fasting blood sugar (1 if \(> 120\) mg/dl; 0 otherwise). |
| RestingECG | Categorical | Resting electrocardiogram results (Normal, ST, LVH). |
| MaxHR | Numeric (Integer) | Maximum heart rate achieved during a stress test. |
| ExerciseAngina | Binary | Exercise-induced angina (chest pain) (Y: Yes, N: No). |
| Oldpeak | Numeric (Float) | ST depression induced by exercise relative to rest. |
| ST_Slope | Categorical | The slope of the peak exercise ST segment (Up, Flat, Down). |
# Read the heart-disease data from disk (hard-coded path; update for your machine)
df <- read.csv("/Users/urielulloa/Desktop/Intermediate Stats Final/heart.csv")

# Recode the 0/1 outcome into a labelled factor so that models and plots
# report "Absent"/"Present" instead of raw integers
df$HeartDisease <- factor(df$HeartDisease,
                          levels = c(0, 1),
                          labels = c("Absent", "Present"))

# FastingBS is stored as an integer but is semantically a binary category,
# so represent it as a factor as well
df$FastingBS <- factor(df$FastingBS)

# Inspect the structure of the prepared data frame
# Structure check: 918 rows x 12 columns (knitted output captured inline below)
str(df)## 'data.frame': 918 obs. of 12 variables:
## $ Age : int 40 49 37 48 54 39 45 54 37 48 ...
## $ Sex : chr "M" "F" "M" "F" ...
## $ ChestPainType : chr "ATA" "NAP" "ATA" "ASY" ...
## $ RestingBP : int 140 160 130 138 150 120 130 110 140 120 ...
## $ Cholesterol : int 289 180 283 214 195 339 237 208 207 284 ...
## $ FastingBS : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ RestingECG : chr "Normal" "Normal" "ST" "Normal" ...
## $ MaxHR : int 172 156 98 108 122 170 170 142 130 120 ...
## $ ExerciseAngina: chr "N" "N" "N" "Y" ...
## $ Oldpeak : num 0 1 0 1.5 0 0 0 0 1.5 0 ...
## $ ST_Slope : chr "Up" "Flat" "Up" "Flat" ...
## $ HeartDisease : Factor w/ 2 levels "Absent","Present": 1 2 1 2 1 1 1 1 2 1 ...
set.seed(42) # For reproducibility of the random split
# Create index for a stratified 70/30 split; createDataPartition (caret)
# preserves the HeartDisease class ratio in both subsets
train_index <- createDataPartition(df$HeartDisease, p = 0.7, list = FALSE)
# Create the data subsets
train_df <- df[train_index, ]
test_df <- df[-train_index, ]
# NOTE(review): the "Testing set size" line below has no matching cat() call in
# this chunk; presumably it was lost during extraction -- confirm.
cat("Training set size:", nrow(train_df), "\n")## Training set size: 643
## Testing set size: 275
# Check the balance of the target variable in the training set
target_balance <- train_df %>%
count(HeartDisease) %>%
mutate(Proportion = n / sum(n))
print(target_balance)## HeartDisease n Proportion
## 1 Absent 287 0.4463453
## 2 Present 356 0.5536547
# Visualize the relationship between Age, MaxHR, and Heart Disease;
# stat_ellipse() outlines the spread of each class in the Age/MaxHR plane
train_df %>%
ggplot(aes(x = Age, y = MaxHR, color = HeartDisease)) +
geom_point(alpha = 0.6) +
stat_ellipse() +
labs(title = "Age vs. MaxHR by Heart Disease Status") +
theme_minimal()
# Visualize Cholesterol distribution by disease status
train_df %>%
ggplot(aes(x = HeartDisease, y = Cholesterol)) +
geom_boxplot(aes(fill = HeartDisease)) +
labs(title = "Cholesterol Distribution") +
theme_minimal()
# Visualize the proportion of Heart Disease cases across different Chest Pain Types
train_df %>%
group_by(ChestPainType) %>%
summarise(
Total = n(),
Prop_Present = mean(HeartDisease == "Present"),
.groups = 'drop'
) %>%
# reorder() sorts the bars by prevalence so the ranking is visible at a glance
ggplot(aes(x = reorder(ChestPainType, Prop_Present), y = Prop_Present, fill = ChestPainType)) +
geom_bar(stat = "identity") +
# Label each bar with its percentage, nudged just above the bar top
geom_text(aes(label = scales::percent(Prop_Present, accuracy = 0.1)), vjust = -0.5) +
labs(
title = "Heart Disease Prevalence by Chest Pain Type",
x = "Chest Pain Type",
y = "Proportion with Heart Disease"
) +
theme_minimal() +
theme(legend.position = "none")
# Visualize the distribution of RestingBP by Heart Disease status
# Density of RestingBP for each disease status, with dashed group-mean markers.
# The per-group means are computed once, up front, rather than inline in geom_vline().
group_means <- train_df %>%
  group_by(HeartDisease) %>%
  summarise(Mean_BP = mean(RestingBP), .groups = "drop")

train_df %>%
  ggplot(aes(x = RestingBP, fill = HeartDisease)) +
  geom_density(alpha = 0.6) +
  # `size` is deprecated for line geoms since ggplot2 3.4.0; `linewidth` is the
  # supported aesthetic and draws the same 1-unit dashed line
  geom_vline(data = group_means,
             aes(xintercept = Mean_BP, color = HeartDisease),
             linetype = "dashed", linewidth = 1) +
  labs(
    title = "Distribution of Resting Blood Pressure by Disease Status",
    x = "Resting Blood Pressure (mm Hg)",
    fill = "Heart Disease"
  ) +
  theme_minimal()
# install.packages("corrplot")
library(corrplot)
# Select only the continuous variables
continuous_data <- train_df %>% select(Age, RestingBP, Cholesterol, MaxHR, Oldpeak)
# Calculate the correlation matrix (Pearson by default)
cor_matrix <- cor(continuous_data)
# Create the correlation heatmap
corrplot(cor_matrix,
method = "circle",
type = "upper",
order = "hclust", # Hierarchical clustering to group similar variables
tl.col = "black", # variable-label colour
tl.srt = 45, # rotate variable labels 45 degrees
addCoef.col = "black", # Add correlation coefficients to the plot
diag = FALSE,
title = "Correlation Matrix of Continuous Biometric Features")
# --- Define Predictor Types Robustly ---
# Identify Continuous features, named explicitly to match the dataset structure
continuous_cols <- c("Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak")
# Categorical features = every column that is neither continuous nor the target
categorical_cols <- colnames(train_df)[!colnames(train_df) %in% c(continuous_cols, "HeartDisease")]

# --- Step 1: One-Hot Encode (OHE) Categorical Variables ---
# Fit the encoder on the TRAINING set only. fullRank = TRUE (spelled out; T is a
# reassignable variable, not a keyword) drops one level per factor to avoid the
# dummy-variable trap / perfect multicollinearity.
dmy_model <- dummyVars(~ ., data = train_df[, categorical_cols], fullRank = TRUE)
# Apply the same encoding to both training and test sets
train_ohe <- data.frame(predict(dmy_model, train_df))
test_ohe <- data.frame(predict(dmy_model, test_df))

# --- Step 2: Combine and Standardize Continuous and OHE features ---
# Combine continuous features (from original data) with OHE features
train_predictors <- cbind(train_df[, continuous_cols], train_ohe)
test_predictors <- cbind(test_df[, continuous_cols], test_ohe)

# Learn centering/scaling parameters from the TRAINING predictors only, then
# apply them to both splits (prevents test-set leakage).
preproc_model <- preProcess(train_predictors, method = c("center", "scale"))
train_processed_predictors <- predict(preproc_model, train_predictors)
test_processed_predictors <- predict(preproc_model, test_predictors)

# Final processed dataframes (target re-attached as the first column)
train_processed <- data.frame(HeartDisease = train_df$HeartDisease, train_processed_predictors)
test_processed <- data.frame(HeartDisease = test_df$HeartDisease, test_processed_predictors)
cat("Corrected Processed Training Features (ready for modeling):\n")## Corrected Processed Training Features (ready for modeling):
## 'data.frame': 643 obs. of 16 variables:
## $ HeartDisease : Factor w/ 2 levels "Absent","Present": 1 1 2 1 1 1 1 2 1 1 ...
## $ Age : num -1.4239 -1.7443 -0.5694 0.0714 -1.5307 ...
## $ RestingBP : num 0.399 -0.133 0.293 0.932 -0.666 ...
## $ Cholesterol : num 0.831 0.77782 0.16627 -0.00212 1.27415 ...
## $ MaxHR : num 1.396 -1.551 -1.153 -0.595 1.316 ...
## $ Oldpeak : num -0.838 -0.838 0.643 -0.838 -0.838 ...
## $ SexM : num 0.503 0.503 -1.985 0.503 0.503 ...
## $ ChestPainTypeATA: num 2.212 2.212 -0.451 -0.451 -0.451 ...
## $ ChestPainTypeNAP: num -0.549 -0.549 -0.549 1.819 1.819 ...
## $ ChestPainTypeTA : num -0.24 -0.24 -0.24 -0.24 -0.24 ...
## $ FastingBS.1 : num -0.577 -0.577 -0.577 -0.577 -0.577 ...
## $ RestingECGNormal: num 0.802 -1.245 0.802 0.802 0.802 ...
## $ RestingECGST : num -0.493 2.024 -0.493 -0.493 -0.493 ...
## $ ExerciseAnginaY : num -0.807 -0.807 1.237 -0.807 -0.807 ...
## $ ST_SlopeFlat : num -0.998 -0.998 1.001 -0.998 -0.998 ...
## $ ST_SlopeUp : num 1.16 1.16 -0.861 1.16 1.16 ...
# Remove the target variable so PCA sees predictors only
train_predictors_pca <- train_processed %>% select(-HeartDisease)

# Perform PCA; centering and scaling were already applied during preprocessing,
# so both are disabled here to avoid double-standardizing
pr.out <- prcomp(train_predictors_pca, center = FALSE, scale. = FALSE)

# Proportion of variance explained (PVE) per component
pr.var <- pr.out$sdev^2
pve <- pr.var / sum(pr.var)
cumulative_pve <- cumsum(pve)

# Tabulate per-component and cumulative variance explained.
# seq_along(pve) is the safe idiom: 1:length(pve) would yield c(1, 0) on empty input.
variance_table <- data.frame(
  PC = seq_along(pve),
  PVE = pve,
  Cumulative_PVE = cumulative_pve
)
cat("\n--- Variance Explained Table ---\n")##
## --- Variance Explained Table ---
# Print the table, rounding values to 4 decimal places for readability
print(round(variance_table, 4))## PC PVE Cumulative_PVE
## 1 1 0.2246 0.2246
## 2 2 0.1113 0.3359
## 3 3 0.0893 0.4252
## 4 4 0.0834 0.5086
## 5 5 0.0754 0.5839
## 6 6 0.0712 0.6551
## 7 7 0.0638 0.7189
## 8 8 0.0565 0.7754
## 9 9 0.0515 0.8270
## 10 10 0.0454 0.8723
## 11 11 0.0388 0.9112
## 12 12 0.0315 0.9427
## 13 13 0.0284 0.9710
## 14 14 0.0216 0.9926
## 15 15 0.0074 1.0000
# Determine k = the smallest number of PCs explaining >= 80% of total variance;
# which() returns indices of TRUE and [1] takes the first crossing point
k <- which(cumulative_pve >= 0.8)[1]
cat(paste("Optimal number of PCs to explain >= 80% variance:", k, "\n"))## Optimal number of PCs to explain >= 80% variance: 9
# Visualize PVE (scree plot) and cumulative PVE in two side-by-side panels
par(mfrow = c(1, 2))
plot(pve, xlab = "Principal Component", ylab = "Proportion of Variance Explained",
type = "b", main = "Scree Plot")
plot(cumulative_pve, xlab = "Principal Component", ylab = "Cumulative PVE",
type = "b", main = "Cumulative PVE")par(mfrow = c(1, 1)) # restore single-panel layout
# --- NEW CODE ADDITIONS START HERE ---
cat("\n--- Loadings (Feature Weights) for Top", k, "PCs ---\n")##
## --- Loadings (Feature Weights) for Top 9 PCs ---
# Display the rotation matrix for the top k PCs.
# Each column holds the weights (loadings) of the original features on one PC;
# large absolute values mark the features that define that component.
print(pr.out$rotation[, 1:k])## PC1 PC2 PC3 PC4
## Age 0.293172714 -0.18109997 -0.16886993 0.0409816437
## RestingBP 0.126448601 -0.09981633 -0.44874276 0.0007100293
## Cholesterol -0.154618553 0.16389636 -0.59826971 0.1687627343
## MaxHR -0.342580068 -0.06114498 -0.08773179 0.1894851551
## Oldpeak 0.305840943 0.12472792 -0.21981248 0.0795940442
## SexM 0.169527695 -0.08481208 0.35834028 -0.1882127875
## ChestPainTypeATA -0.250660760 -0.01853062 -0.23386706 -0.5358749630
## ChestPainTypeNAP -0.107365934 -0.04759085 0.17955872 0.7510968317
## ChestPainTypeTA -0.002726991 -0.10650334 -0.13903416 -0.0269840943
## FastingBS.1 0.200312212 -0.22228523 0.25129421 -0.1099416712
## RestingECGNormal -0.157175123 0.58493615 0.21697542 -0.1235344768
## RestingECGST 0.177616426 -0.57526203 -0.04435246 -0.0142811950
## ExerciseAnginaY 0.351144962 0.16051562 -0.08251304 -0.0546723660
## ST_SlopeFlat 0.382196216 0.27330943 -0.03707859 0.0686419469
## ST_SlopeUp -0.440902669 -0.25881781 0.03348088 -0.0698999861
## PC5 PC6 PC7 PC8 PC9
## Age -0.222354083 0.39270969 0.038708779 0.311257235 -0.03570549
## RestingBP -0.286359454 0.46924785 -0.008919593 -0.285019567 -0.30986906
## Cholesterol 0.119385366 -0.11583202 -0.064193292 -0.107268554 -0.20992870
## MaxHR -0.087562943 -0.23757927 0.060496633 -0.542235876 0.15171129
## Oldpeak 0.043287301 -0.01517564 -0.282736544 -0.303385801 0.57001539
## SexM -0.005788211 -0.04094093 -0.521507783 -0.425156670 -0.51774513
## ChestPainTypeATA 0.189039398 0.03524818 0.278977820 -0.151813513 -0.07140259
## ChestPainTypeNAP 0.055523937 0.18436972 0.074462489 -0.083630611 -0.13346167
## ChestPainTypeTA -0.711544728 -0.46494968 -0.231282753 0.184389803 0.03515821
## FastingBS.1 -0.313602414 0.15122075 0.460731777 -0.408829433 0.24706838
## RestingECGNormal -0.184184526 0.23891187 -0.009694788 0.002553818 0.02612969
## RestingECGST 0.318620193 -0.19053650 0.031168870 0.048122379 -0.01143622
## ExerciseAnginaY 0.254790008 0.12511133 -0.290561931 -0.050343759 0.21827402
## ST_SlopeFlat 0.025508036 -0.31811730 0.371098891 -0.032204562 -0.30878210
## ST_SlopeUp -0.016029516 0.26325477 -0.257655294 0.080491701 0.11683170
# Optional: Biplot Visualization (Shows feature direction and observation scores)
# Note: A biplot often works best with the first two PCs
biplot(
pr.out,
choices = 1:2, # Focus on PC1 and PC2
cex = c(0.7, 1.0), # Size of text/arrows
col = c("gray40", "indianred3"), # Color for points and arrows
main = "Biplot of PC1 and PC2"
)
# Extract and Print Top Contributing Features for PC1 and PC2
# Return the n_features loadings with the largest magnitude for one PC,
# signs preserved, ordered from most positive to most negative.
# pc_num may be a column index or a column name of loadings_matrix.
get_top_loadings <- function(pc_num, loadings_matrix, n_features = 3) {
  weights <- loadings_matrix[, pc_num]
  # Rank features by |loading| and keep the strongest n_features
  ranked <- order(abs(weights), decreasing = TRUE)
  strongest <- weights[head(ranked, n_features)]
  # Present the signed loadings in descending order
  sort(strongest, decreasing = TRUE)
}
cat("\n--- Top 3 Features Defining PC1 ---\n")##
## --- Top 3 Features Defining PC1 ---
## ST_SlopeFlat ExerciseAnginaY ST_SlopeUp
## 0.3821962 0.3511450 -0.4409027
##
## --- Top 3 Features Defining PC2 ---
## RestingECGNormal ST_SlopeFlat RestingECGST
## 0.5849362 0.2733094 -0.5752620
# Extract the top k PC scores for the train and test sets
train_PC_scores <- data.frame(pr.out$x[, 1:k])
test_PC_scores <- predict(pr.out, newdata = test_processed %>% select(-HeartDisease))[, 1:k]
# Add PC scores back to the processed dataframes for modeling
train_pca_model_df <- data.frame(HeartDisease = train_processed$HeartDisease, train_PC_scores)
test_pca_model_df <- data.frame(HeartDisease = test_processed$HeartDisease, test_PC_scores)The loadings tell us what clinical concepts these new factors represent:
# Benchmark model: logistic regression on all processed features; the "~ ."
# formula uses every column of train_processed except the response.
# A plain formula literal replaces the redundant as.formula(paste(..., sep = ""))
# -- it builds the identical formula object without the string round-trip.
full_formula <- HeartDisease ~ .
outglm_benchmark <- glm(full_formula,
                        data = train_processed,
                        family = binomial(link = "logit"))
# NOTE(review): the VIF output further below has no matching call in this chunk;
# presumably car::vif(outglm_benchmark) was lost during extraction -- confirm.
summary(outglm_benchmark)##
## Call:
## glm(formula = full_formula, family = binomial(link = "logit"),
## data = train_processed)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.36095 0.12650 2.853 0.004326 **
## Age 0.03905 0.14343 0.272 0.785410
## RestingBP 0.13316 0.12926 1.030 0.302913
## Cholesterol -0.46605 0.13961 -3.338 0.000843 ***
## MaxHR -0.08710 0.14599 -0.597 0.550755
## Oldpeak 0.30460 0.14796 2.059 0.039523 *
## SexM 0.62411 0.13578 4.596 4.30e-06 ***
## ChestPainTypeATA -0.65475 0.13924 -4.702 2.57e-06 ***
## ChestPainTypeNAP -0.84360 0.13418 -6.287 3.23e-10 ***
## ChestPainTypeTA -0.39503 0.11366 -3.476 0.000510 ***
## FastingBS.1 0.49687 0.13771 3.608 0.000308 ***
## RestingECGNormal -0.12647 0.15432 -0.820 0.412477
## RestingECGST -0.14691 0.16261 -0.903 0.366305
## ExerciseAnginaY 0.46787 0.14146 3.307 0.000942 ***
## ST_SlopeFlat 0.81807 0.24230 3.376 0.000735 ***
## ST_SlopeUp -0.34165 0.25641 -1.332 0.182718
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 883.97 on 642 degrees of freedom
## Residual deviance: 435.59 on 627 degrees of freedom
## AIC: 467.59
##
## Number of Fisher Scoring iterations: 5
##
## --- VIF Results for Benchmark Model ---
## Age RestingBP Cholesterol MaxHR
## 1.345931 1.157429 1.219408 1.350533
## Oldpeak SexM ChestPainTypeATA ChestPainTypeNAP
## 1.343939 1.097106 1.165829 1.223630
## ChestPainTypeTA FastingBS.1 RestingECGNormal RestingECGST
## 1.204989 1.108859 1.614899 1.643553
## ExerciseAnginaY ST_SlopeFlat ST_SlopeUp
## 1.259304 3.917375 4.418239
The model confirmed that the risk of heart disease is primarily driven by three groups of factors (p < 0.001):
# PCA model: logistic regression on only the top k PC scores; train_pca_model_df
# contains just HeartDisease plus PC1..PCk, so "~ ." selects exactly those scores.
# A plain formula literal replaces the redundant as.formula(paste(..., sep = "")).
pca_formula <- HeartDisease ~ .
outglm_pca <- glm(pca_formula,
                  data = train_pca_model_df,
                  family = binomial(link = "logit"))
# NOTE(review): the VIF output further below has no matching call in this chunk;
# presumably car::vif(outglm_pca) was lost during extraction -- confirm.
summary(outglm_pca)##
## Call:
## glm(formula = pca_formula, family = binomial(link = "logit"),
## data = train_pca_model_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.37508 0.12062 3.110 0.00187 **
## PC1 1.21969 0.08861 13.764 < 2e-16 ***
## PC2 0.27253 0.09190 2.966 0.00302 **
## PC3 0.42491 0.10579 4.017 5.91e-05 ***
## PC4 -0.43763 0.10981 -3.985 6.74e-05 ***
## PC5 0.02717 0.10090 0.269 0.78773
## PC6 -0.06280 0.10710 -0.586 0.55760
## PC7 -0.09264 0.11594 -0.799 0.42426
## PC8 -0.41055 0.12763 -3.217 0.00130 **
## PC9 0.01113 0.12812 0.087 0.93078
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 883.97 on 642 degrees of freedom
## Residual deviance: 466.04 on 633 degrees of freedom
## AIC: 486.04
##
## Number of Fisher Scoring iterations: 5
##
## --- VIF Results for PCA-Enhanced Model (Should be ~1.0) ---
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## 1.146277 1.066685 1.068337 1.076519 1.030764 1.036434 1.053649 1.047644
## PC9
## 1.010517
# Fit the full classification tree (cp = 0 grows the maximal tree so that
# pruning can later choose any subtree)
tree.heart <- rpart(full_formula,
                    data = train_processed,
                    method = "class",
                    cp = 0)
# Pruning: pick the complexity parameter that minimises cross-validated error
CPtable <- tree.heart$cptable
cp.opt <- CPtable[which.min(CPtable[, "xerror"]), "CP"]
# Prune the tree using the optimal cp
pruned.tree.heart.opt <- prune(tree.heart, cp = cp.opt)
# Visualize the Optimal Tree (requires RColorBrewer and rattle).
# paste0() replaces the equivalent but wordier paste(..., sep = "").
fancyRpartPlot(pruned.tree.heart.opt,
               main = paste0("Pruned Decision Tree (CP=", round(cp.opt, 4), ")"),
               sub = paste("Final nodes:", sum(pruned.tree.heart.opt$frame$var == "<leaf>")))
# NOTE(review): the "Top Split Rule" output below has no matching print/split
# extraction call in this chunk; presumably lost during extraction -- confirm.
##
## Top Split Rule (Most Important Predictor):
## count ncat improve index adj
## 643.0000000 1.0000000 109.3041428 0.1492682 0.0000000
# Ensure factoextra is loaded for visualization
library(factoextra)
# --- Define Data for Clustering (Full Processed Predictors) ---
# Use the predictors already standardized and OHE (excluding the target)
data_for_clustering <- train_processed %>% select(-HeartDisease)
# Step 1: Determine optimal number of clusters (k) using the Elbow Method
# (within-cluster sum of squares across k = 1..10 on the standardized predictors)
set.seed(42)
fviz_nbclust(data_for_clustering, kmeans, method = "wss", k.max = 10) +
labs(title = "Elbow Method for K-Means Clustering (Full Profile)")
# Based on the plot (you will determine k here), let's select k=4 for demonstration
# NOTE: The optimal k might change when using the full feature set!
k_optimal <- 4
# Step 2: Apply K-Means Clustering
# nstart = 25 runs 25 random initialisations and keeps the best, for stability
set.seed(42)
kmeans_out <- kmeans(data_for_clustering, centers = k_optimal, nstart = 25)
# Step 3: Visualize the Clusters
# The visualization will project the high-dimensional clusters onto PC1 and PC2 for plotting.
fviz_cluster(kmeans_out, data = data_for_clustering,
geom = "point",
ellipse.type = "convex",
palette = "jco",
main = paste("Patient Clusters (k=", k_optimal, ") on Full Standardized Profile"))
# Step 4: Attach Cluster Labels and Analyze Prevalence
# Attach the derived cluster labels to the original (unscaled) training data
train_df_clustered <- train_df %>%
mutate(Cluster = as.factor(kmeans_out$cluster))
# Characterize the clusters by the MEAN of the key RAW (unscaled) features
# This provides the clinical interpretation of each segment.
cluster_analysis <- train_df_clustered %>%
group_by(Cluster) %>%
summarise(
N = n(),
# Calculate prevalence rate for each cluster
Prevalence_Rate = mean(HeartDisease == "Present"),
# Analyze the raw biometric means to interpret the cluster profile
Mean_Age = mean(Age),
Mean_RestingBP = mean(RestingBP),
Mean_Cholesterol = mean(Cholesterol),
Mean_MaxHR = mean(MaxHR),
# Add key categorical feature breakdown for interpretation (e.g., proportion of Males)
Prop_Male = mean(Sex == "M"),
.groups = 'drop'
)
# NOTE(review): the tibble printed below implies a print(cluster_analysis) call
# that is not visible in this chunk -- presumably lost during extraction.
cat("\n--- Cluster Analysis: Demographic Profile and Disease Prevalence ---\n")##
## --- Cluster Analysis: Demographic Profile and Disease Prevalence ---
## # A tibble: 4 × 8
## Cluster N Prevalence_Rate Mean_Age Mean_RestingBP Mean_Cholesterol
## <fct> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 245 0.155 49.1 130. 229.
## 2 2 262 0.840 54.9 133. 183.
## 3 3 35 0.4 54.9 137. 201.
## 4 4 101 0.832 58.8 136. 143.
## # ℹ 2 more variables: Mean_MaxHR <dbl>, Prop_Male <dbl>
library(pROC)
library(caret)
# NOTE: Ensure HeartDisease is the same factor/numeric in both test sets.
# We define the single response variable for both AUC calculations.
test_response <- test_processed$HeartDisease
# Predict probabilities on the test set (type = "response" gives P(Present))
prob_benchmark <- predict(outglm_benchmark, newdata = test_processed, type = "response")
prob_pca <- predict(outglm_pca, newdata = test_pca_model_df, type = "response")
# 1. Calculate ROC objects (plot = FALSE suppresses pROC's automatic plotting)
auc_benchmark <- roc(response = test_response, predictor = prob_benchmark, plot = FALSE)
auc_pca <- roc(response = test_response, predictor = prob_pca, plot = FALSE)
# 2. Plot the first ROC curve (Benchmark)
# Set up plot area and plot first curve. We will not print AUC here.
plot(auc_benchmark, main = "ROC Curve Comparison", col = 'black', lwd = 2,
print.auc = FALSE # <-- Suppress AUC print for clean plotting
)
# 3. Add the second ROC curve (PCA-Enhanced)
# Add second curve to the existing plot.
plot(auc_pca, add = TRUE, col = 'blue', lwd = 2,
print.auc = FALSE # <-- Suppress AUC print
)
# 4. Add Custom Legend with AUC values so both models are clearly labelled
legend_labels <- c(
paste("Benchmark (AUC =", round(auc_benchmark$auc, 3), ")"),
paste("PCA-Enhanced (AUC =", round(auc_pca$auc, 3), ")")
)
legend("bottomright",
legend = legend_labels,
col = c("black", "blue"),
lwd = 2,
bty = "n") # bty="n" removes the box around the legend for a cleaner look
# --- Confusion Matrices (using 0.5 threshold) ---
# Pin the level order so caret treats "Present" as the positive class
class_levels <- c("Absent", "Present")
test_response_factor <- factor(test_response, levels = class_levels)
# A hard 0.5 cut-off converts predicted probabilities into class labels
benchmark_labels <- ifelse(prob_benchmark > 0.5, "Present", "Absent")
pred_benchmark <- factor(benchmark_labels, levels = class_levels)
conf_matrix_benchmark <- confusionMatrix(pred_benchmark, test_response_factor, positive = "Present")
cat("\n--- Benchmark Model Confusion Matrix (Raw Features) ---\n")##
## --- Benchmark Model Confusion Matrix (Raw Features) ---
## Confusion Matrix and Statistics
##
## Reference
## Prediction Absent Present
## Absent 107 15
## Present 16 137
##
## Accuracy : 0.8873
## 95% CI : (0.8438, 0.9221)
## No Information Rate : 0.5527
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7718
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9013
## Specificity : 0.8699
## Pos Pred Value : 0.8954
## Neg Pred Value : 0.8770
## Prevalence : 0.5527
## Detection Rate : 0.4982
## Detection Prevalence : 0.5564
## Balanced Accuracy : 0.8856
##
## 'Positive' Class : Present
##
##
## Benchmark Model AUC: 0.94
#### The Final Choice: We concluded that the PCA-Enhanced model is the superior choice for implementation. Why? Because it achieved that high performance (AUC ≈ 0.94–0.95) using a smaller set of uncorrelated principal components. This makes the model inherently more stable, more efficient, and less susceptible to the multicollinearity issues often found in raw medical features. It is the cleaner, smarter version of the classifier.
# Predict classes on the test set
pred_tree <- predict(pruned.tree.heart.opt, newdata = test_processed, type = "class")
# Confusion Matrix against held-out labels ("Present" is the positive class)
conf_matrix_tree <- confusionMatrix(pred_tree, test_processed$HeartDisease, positive = "Present")
# NOTE(review): the confusion-matrix output below implies a print(conf_matrix_tree)
# call not visible in this chunk -- presumably lost during extraction.
cat("\n--- Pruned Decision Tree Confusion Matrix ---\n")##
## --- Pruned Decision Tree Confusion Matrix ---
## Confusion Matrix and Statistics
##
## Reference
## Prediction Absent Present
## Absent 102 24
## Present 21 128
##
## Accuracy : 0.8364
## 95% CI : (0.7872, 0.8781)
## No Information Rate : 0.5527
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6698
##
## Mcnemar's Test P-Value : 0.7656
##
## Sensitivity : 0.8421
## Specificity : 0.8293
## Pos Pred Value : 0.8591
## Neg Pred Value : 0.8095
## Prevalence : 0.5527
## Detection Rate : 0.4655
## Detection Prevalence : 0.5418
## Balanced Accuracy : 0.8357
##
## 'Positive' Class : Present
##
# Calculate AUC for the tree using the predicted probability of "Present"
prob_tree <- predict(pruned.tree.heart.opt, newdata = test_processed, type = "prob")[, "Present"]
auc_tree <- roc(test_processed$HeartDisease, prob_tree, plot = FALSE)
cat("\nDecision Tree AUC:", round(auc_tree$auc, 3), "\n")##
## Decision Tree AUC: 0.902