MA 707 - Project Final

Giulio Barnuevo, Kenan Batu, Uriel Ulloa, Rober DiMaggio

This entire project revolves around creating a systematic, data-driven advantage in the sports betting market. The business model is built on the concept of arbitrage and mispricing in the sports betting ecosystem.

Casual bettors often overpay for bets due to the Vigorish (VIG) or "juice" the bookmakers charge, and they often follow public sentiment ("betting the square").

Our app identifies situations where the expected value (EV) of a bet is positive, meaning the bettor is projected to make money over the long run. We specifically look for the rare instances where a pre-game "edge" (a favorable market price detected by our data collection) actually holds up and gets paid out.
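The arithmetic behind the vig and a positive-EV bet can be sketched in a few lines. This is our own illustration of the standard odds formulas, not project code; `implied_prob` and `expected_value` are hypothetical helper names.

```python
# A minimal sketch of standard American-odds arithmetic (illustrative, not project code):
# implied probability, the bookmaker's vig, and a bet's expected value (EV).

def implied_prob(american_odds: float) -> float:
    """Win probability implied by American odds."""
    if american_odds < 0:
        return -american_odds / (-american_odds + 100)
    return 100 / (american_odds + 100)

def expected_value(stake: float, american_odds: float, true_prob: float) -> float:
    """EV of a bet given our estimate of the true win probability."""
    if american_odds < 0:
        profit = stake * (100 / -american_odds)
    else:
        profit = stake * (american_odds / 100)
    return true_prob * profit - (1 - true_prob) * stake

# A standard -110/-110 line: the two implied probabilities sum past 1.0;
# the excess is the vig the casual bettor pays.
overround = implied_prob(-110) + implied_prob(-110) - 1.0
print(f"vig on a -110/-110 line: {overround:.4f}")   # ~0.0476, about 4.8%

# If we believe the true win probability is 55%, a $100 bet at -110 is +EV.
print(f"EV: ${expected_value(100, -110, 0.55):.2f}")  # $5.00
```

The bettor only profits long-run when the model's estimate of the true probability beats the vig-inflated implied probability, which is exactly the gap the app hunts for.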

Our goal is to move the user from being a casual "square" bettor to a professional "sharp" bettor by providing high-confidence, statistically sound recommendations. Our value proposition is the sophisticated use of data to find the hidden margin.

Our machine learning model is the engine that drives the platform's recommendations.

  • What we're Predicting: The binary target variable, edge_paid_out (Y=1).

We are building a sophisticated filter. The model's job isn't just to identify all possible edges, but to predict which detected edges are statistically likely to materialize and pay out.

By analyzing features like implied probability (away_dk_pct), difference in expert opinion (espn_favor_mag), and pricing costs (away_vig), the model isolates the variables that consistently lead to profitable outcomes.

  • The Output: The model doesn't just give a "Yes/No" prediction; it gives a probability score. This score allows SafeBet to rank all available games and tell the user, "This specific game has an 85% chance of being a legitimate, profitable edge," allowing them to manage their bankroll and prioritize the highest-value opportunities.

1. Data Preparation

This is the simple stuff. We’re just pulling our cleaned ML_clean_data.xlsx file into a pandas DataFrame because that’s the universal starting point for any analysis.

  • Defining the Target: Y = df['edge_paid_out']

This is the core of it. The whole SafeBet premise hinges on predicting Y=1 (the paid edge). Since it's a Yes/No outcome, we’re doing binary classification.

Feature Selection (X)

We focused on the money-making variables:

  • Numerical: away_dk_pct, espn_favor_mag, away_vig, etc. These are the magnitudes of the probabilities and juice.

  • Categorical: espn_favors and dk_favors. These tell us who is favored, which is a categorical distinction the model needs to learn.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

#   File Path  
file_path = '/Users/urielulloa/Desktop/Machine_Learning_Final/ML_clean_data.xlsx'

#   1. Load Data  
df = pd.read_excel(file_path)


#   2. Define Feature Lists (Verified)  
numerical_features = [
    'away_dk_pct', 'home_dk_pct', 'espn_favor_mag', 'away_spread',
    'home_spread', 'away_vig', 'home_vig'
]
categorical_features = [
    'espn_favors', 'dk_favors'
]
all_features = numerical_features + categorical_features

# 3. Define Y and X (raw)  
Y = df['edge_paid_out']
X = df[all_features]


#   Output Summary  
print("  Data Preparation Summary  ")
print(f"Dataset Shape: {df.shape}")
print(f"Target Variable (Y) Value Counts (Edge Paid Out):")
print(Y.value_counts())
print("\nFeature Set (X) Head:")
print(X.head())

print("\n  Feature Grouping for Pipeline  ")
print(f"Numerical Features ({len(numerical_features)}): {numerical_features}")
print(f"Categorical Features ({len(categorical_features)}): {categorical_features}")
  Data Preparation Summary  
Dataset Shape: (195, 24)
Target Variable (Y) Value Counts (Edge Paid Out):
edge_paid_out
0    164
1     31
Name: count, dtype: int64

Feature Set (X) Head:
   away_dk_pct  home_dk_pct  espn_favor_mag  away_spread  home_spread  \
0     0.259128     0.740872       -0.126872          7.5         -7.5   
1     0.087558     0.912442       -0.024442         15.5        -15.5   
2     0.482636     0.517364       -0.003364          1.5         -1.5   
3     0.551441     0.448559        0.223441         -1.5          1.5   
4     0.700162     0.299838        0.133162         -6.5          6.5   

   away_vig  home_vig espn_favors dk_favors  
0      -108      -112        away      home  
1      -105      -115        away      home  
2      -118      -102        away      home  
3      -120       100        home      away  
4      -110      -110        home      away  

  Feature Grouping for Pipeline  
Numerical Features (7): ['away_dk_pct', 'home_dk_pct', 'espn_favor_mag', 'away_spread', 'home_spread', 'away_vig', 'home_vig']
Categorical Features (2): ['espn_favors', 'dk_favors']
In [3]:
#   DEBUGGING CODE BLOCK  

# 1. Combine all features we expect
all_expected_features = numerical_features + categorical_features

# 2. Get the actual columns from your raw DataFrame (df)
actual_columns = df.columns.tolist()

# 3. Check for missing columns
missing_columns = [col for col in all_expected_features if col not in actual_columns]

if missing_columns:
    print(f"ERROR: The following required columns are MISSING from your DataFrame:")
    for col in missing_columns:
        print(f"  -> '{col}'")
    print("\nAction: Please check the spelling/casing in your feature lists or your Excel file ('ML_clean_data.xlsx') columns.")
else:
    print("SUCCESS: All feature names match the columns in the DataFrame.")
    print("The error might be due to a recent change in X_train. Proceed with the Model & Tuning code.")

# 4. Optional: Check for white space (a very common mistake)
for col in actual_columns:
    if col != col.strip():
        print(f"WARNING: Column '{col}' has leading/trailing spaces. Consider renaming it.")
SUCCESS: All feature names match the columns in the DataFrame.
The error might be due to a recent change in X_train. Proceed with the Model & Tuning code.

2. Train-Test Split

We reserve 20% of the data for the Test Set (test_size=0.2) and do not let the model touch it until the very end. If a model performs well on the test set, we know it will generalize to new games (i.e., it didn't just memorize the training data).

  • random_state=42: Just a consistent seed so our results are reproducible—makes life easier when comparing different model runs.

  • stratify=Y: This is crucial. Since the "edge paid out" (Y=1) is a rare event, we use stratify to make sure the train set and the test set carry (as nearly as rounding allows) the same proportion of Y=1 cases. Otherwise, our test set could accidentally end up with zero profitable edge cases!

In [4]:
#   4. Perform the Train-Test Split (with stratification)  
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y
)

print("  Train-Test Split Summary  ")
print(f"X_train (Features for Training): {X_train.shape}")
print(f"X_test (Features for Testing): {X_test.shape}")
print(f"Y_train (Target for Training): {Y_train.shape}")
print(f"Y_test (Target for Testing): {Y_test.shape}")
  Train-Test Split Summary  
X_train (Features for Training): (156, 9)
X_test (Features for Testing): (39, 9)
Y_train (Target for Training): (156,)
Y_test (Target for Testing): (39,)
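As a sanity check that stratification behaves as described, here is a self-contained sketch using synthetic labels with the same 31/164 imbalance (not the project data; `X_fake` is a placeholder):

```python
# stratify=y keeps the rare-class proportion (nearly) identical across splits.
import pandas as pd
from sklearn.model_selection import train_test_split

y = pd.Series([1] * 31 + [0] * 164)              # same imbalance as edge_paid_out
X_fake = pd.DataFrame({'dummy': range(len(y))})  # placeholder features

_, _, y_tr, y_te = train_test_split(
    X_fake, y, test_size=0.2, random_state=42, stratify=y
)
print(round(y_tr.mean(), 3), round(y_te.mean(), 3))  # both close to the overall 0.159
```

Without `stratify`, a random 39-row test set drawn from only 31 positives could easily land with one or two positives, or none at all.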

3. EDA

The EDA confirmed that we are tackling a rare, high-value prediction problem, and we tailored our entire modeling approach (preprocessing, metrics, and algorithms) specifically to handle that imbalanced data structure.

In [5]:
print("## 3. Exploratory Data Analysis (EDA)")

# 1. Check Data Types, Missing Values, and Non-Null Counts
print("\n### 3.1 Data Information (Types and Missing Values)")
# This command is essential to quickly check for non-numeric columns that might be missed
# and to confirm if any column has missing (NaN) values that require imputation.
print(df.info())

# 2. Descriptive Statistics for Numerical Features
print("\n### 3.2 Descriptive Statistics")
# Look for:
# - Range (min/max): Are the values reasonable?
# - Mean/Std Dev: Helps understand data spread and if scaling is necessary (it is).
print(df[numerical_features].describe().T)

# 3. Analyze Categorical Feature Distributions
print("\n### 3.3 Categorical Feature Distributions")
for col in categorical_features:
    print(f"\n--- {col} Counts ---")
    # Check for categories that are very rare (which might be grouped or dropped).
    print(df[col].value_counts(normalize=True))

# 4. Critical Check: Target Variable Imbalance
print("\n### 3.4 Target Variable Imbalance Check")
target_counts = df['edge_paid_out'].value_counts(normalize=True)
print(target_counts)

# Actionable Insight: If the proportion of '1's (Edge Paid Out) is small (e.g., <10-20%),
# you have a class imbalance problem. You may need to use techniques like SMOTE or
# stratified cross-validation (which we already planned for the split)
# and focus on metrics like ROC AUC and F1-score instead of just Accuracy.

# Visualization Suggestion (Requires matplotlib/seaborn) 
# Import this for your notebook:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(6, 4))
sns.countplot(x='edge_paid_out', data=df)
plt.title('Target Class Distribution')
plt.show()
## 3. Exploratory Data Analysis (EDA)

### 3.1 Data Information (Types and Missing Values)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   game_id               195 non-null    int64  
 1   game_url              195 non-null    object 
 2   league                195 non-null    object 
 3   game_date             195 non-null    int64  
 4   winner                195 non-null    object 
 5   away_dk_implied_prob  195 non-null    float64
 6   home_dk_implied_prob  195 non-null    float64
 7   away_dk_pct           195 non-null    float64
 8   home_dk_pct           195 non-null    float64
 9   margin_victory        195 non-null    int64  
 10  away_spread_covered   195 non-null    int64  
 11  home_spread_covered   195 non-null    int64  
 12  espn_favors           195 non-null    object 
 13  espn_favor_mag        195 non-null    float64
 14  espn_won_ML           195 non-null    int64  
 15  espn_ML_winnings      195 non-null    float64
 16  espn_spread_winnings  195 non-null    float64
 17  away_spread           195 non-null    float64
 18  home_spread           195 non-null    float64
 19  away_vig              195 non-null    int64  
 20  home_vig              195 non-null    int64  
 21  dk_favors             195 non-null    object 
 22  model_disagreement    195 non-null    int64  
 23  edge_paid_out         195 non-null    int64  
dtypes: float64(9), int64(10), object(5)
memory usage: 36.7+ KB
None

### 3.2 Descriptive Statistics
                count        mean        std         min         25%  \
away_dk_pct     195.0    0.369871   0.218352    0.027158    0.193775   
home_dk_pct     195.0    0.630129   0.218352    0.127907    0.473913   
espn_favor_mag  195.0    0.011625   0.074793   -0.219069   -0.029194   
away_spread     195.0    4.900000   8.490990  -12.500000   -1.500000   
home_spread     195.0   -4.900000   8.490990  -29.500000  -10.500000   
away_vig        195.0 -102.794872  39.793656 -125.000000 -115.000000   
home_vig        195.0 -105.923077  30.433961 -125.000000 -115.000000   

                       50%         75%         max  
away_dk_pct       0.330993    0.526087    0.872093  
home_dk_pct       0.669007    0.806225    0.972842  
espn_favor_mag    0.012158    0.054921    0.223441  
away_spread       5.500000   10.500000   29.500000  
home_spread      -5.500000    1.500000   12.500000  
away_vig       -110.000000 -105.000000  105.000000  
home_vig       -110.000000 -105.000000  105.000000  

### 3.3 Categorical Feature Distributions

--- espn_favors Counts ---
espn_favors
home    0.579487
away    0.420513
Name: proportion, dtype: float64

--- dk_favors Counts ---
dk_favors
home    0.707692
away    0.292308
Name: proportion, dtype: float64

### 3.4 Target Variable Imbalance Check
edge_paid_out
0    0.841026
1    0.158974
Name: proportion, dtype: float64
[Figure: countplot of the target class distribution]

No Edge Paid Out (Y=0): 84.10%

Edge Paid Out (Y=1): 15.90%

This confirms that the successful prediction of a paid edge is a rare event. If 84% of theoretical edges fail, the model's job isn't to find an edge, but to successfully filter the valid 15.9% that will generate profit. The platform must prioritize Precision (making sure the edges it flags actually pay out) to ensure user bankrolls aren't wasted on False Positives. We therefore discard accuracy and focus on the ROC AUC for ranking power and the Precision-Recall Curve (PRC) to visualize performance on the rare positive class.

4. Preprocessing and Pipelining

  • StandardScaler (Numerical)

We use this to normalize all the numerical inputs. The raw scales vary wildly: the spreads span roughly 40 points, the vigs are American odds in the hundreds, and the dk_pct probabilities live between 0 and 1. Without scaling, the larger-magnitude features would unfairly dominate the model's math; StandardScaler puts everything on the same playing field (mean 0, variance 1).

  • OneHotEncoder (Categorical)

Models hate text. We convert our categories (like 'home' or 'away' being favored) into clean binary columns (0s and 1s).

    • drop='first': This is a neat trick to prevent multicollinearity. If we know the favorite is not 'home', we know they must be 'away'. We only need N−1 columns for N categories.

  • ColumnTransformer: This is the tool that directs traffic: "Scale the numerical columns, encode the categorical ones, and leave the others alone." It makes our code clean and reusable.

  • Pipeline: The pipeline is our quality assurance process. It chains the preprocessing and the model together. The biggest reason we use it is to prevent data leakage. When we fit the pipeline, it learns scaling rules only from the training data, and then applies those same rules consistently to the test data.

In [6]:
#  5. Define Preprocessing Pipeline 
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    # drop='first' is essential to avoid multicollinearity
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

5. Model & Tuning - Logistic Regression

  • Baseline Model: LogisticRegression

Every good ML project needs a baseline. Log Reg is simple, fast, and highly interpretable (we can see the coefficients and understand the direction of impact). It tells us if our features have a basic linear predictive power before we move to complex models.

We don't guess hyperparameters. We use GridSearchCV to systematically try every combination we defined (here, regularization strengths C of 0.01 through 100 and L1 vs. L2 penalties) and see what performs best using Cross-Validation (cv=5).

  • scoring='roc_auc': This is the key metric for SafeBet. We don't care about simple accuracy because most games are Y=0. We care about ranking the games. ROC AUC measures how well the model can separate the good opportunities (Y=1) from the bad ones (Y=0). If the AUC is high, we can trust the model to rank our bets correctly.
In [7]:
#  6. Create the Final Model Pipeline (Preprocessor + Classifier) 
# NOTE: We now include class_weight='balanced' in the model definition to help with imbalanced data.
lr = LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced')
final_ml_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lr)
])

#   7. Define Hyperparameter Search Grid 
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2']
}

#  8. Define Grid Search and Fit 
lr_grid_search = GridSearchCV(
    final_ml_pipeline,
    param_grid,
    cv=5,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1
)

print("--- Starting Logistic Regression Grid Search ---")
# Ensure X_train and Y_train are defined from a previous cell.
lr_grid_search.fit(X_train, Y_train)


#  9. Output results and Define Model Variable (FIXED)
# FIX 1: Explicitly define the variable needed for plotting.
best_lr_model = lr_grid_search.best_estimator_

print("\n--- Grid Search Results (Successfully fitted) ---")
print(f"Best Cross-Validation ROC AUC Score: {lr_grid_search.best_score_:.4f}")
print("Best Parameters:")
# FIX 2: Correct the variable name in the printing loop from 'grid_search' to 'lr_grid_search'
for param, value in lr_grid_search.best_params_.items():
    print(f"  {param}: {value}")

print("\nX_test and Y_test objects are ready for Step 6 (Evaluation).")
--- Starting Logistic Regression Grid Search ---
Fitting 5 folds for each of 10 candidates, totalling 50 fits

--- Grid Search Results (Successfully fitted) ---
Best Cross-Validation ROC AUC Score: 0.6869
Best Parameters:
  classifier__C: 1
  classifier__penalty: l1

X_test and Y_test objects are ready for Step 6 (Evaluation).

The ROC AUC of 0.6869 confirms that a simple linear relationship exists between our features (implied probabilities, vigs, disagreement) and the paid edge.

However, this score is low for a production model. It strongly suggests that the actual market inefficiencies that drive a paid edge are non-linear and involve complex interactions between variables (e.g., the away_vig only matters when espn_favor_mag is also high).

This finding perfectly justifies the next step in the modeling pipeline: moving to a high-performance, non-linear ensemble method like the Random Forest Classifier to capture these hidden complexities and significantly boost the predictive power.

6. Model Evaluation - Logistic Regression

The Functions: roc_auc_score() and coef_

  • This is where we check if the simple linear model holds its own. The final ROC AUC score on the test set gives us our benchmark—the minimum level of performance we accept.

Interpretation: Coefficients

  • For the Log Reg, we primarily look at the coefficients. These tell us the simple, linear relationship: is an increase in away_vig making a paid edge more or less likely? This gives us the foundational business insight before we dive into complexity. If the Log Reg is weak, we know the relationships are probably non-linear, which justifies using the Random Forest.
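A handy way to read these coefficients: because the numerical features are standardized, exp(coefficient) is the multiplicative change in the odds of a paid edge per one standard deviation of the feature. A minimal sketch using the fitted away_vig coefficient (≈0.365) from the output below:

```python
# Converting a logistic-regression coefficient on a standardized feature
# into an odds ratio. 0.365 is the fitted away_vig coefficient reported below.
import numpy as np

coef_away_vig = 0.365
odds_ratio = np.exp(coef_away_vig)
print(f"+1 SD in away_vig multiplies the odds of a paid edge by ~{odds_ratio:.2f}")  # ~1.44
```

A positive coefficient (odds ratio above 1) means the feature pushes toward a paid edge; a negative one (ratio below 1) pushes against it.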
In [8]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd # Added import for DataFrame creation
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

# Retrieve the best model from the Grid Search
# NOTE: This assumes lr_grid_search was run and class_weight='balanced' was used.
best_lr_model = lr_grid_search.best_estimator_

#  6.1 Performance Metrics 
print("\n### 6.1 Model Performance on Test Set (Logistic Regression)")

# Make predictions on the unseen Test Set
Y_pred_proba = best_lr_model.predict_proba(X_test)[:, 1]

# Use a standard 0.5 classification threshold for the initial report
Y_pred = (Y_pred_proba >= 0.5).astype(int)

# 1. ROC AUC Score (The primary metric for imbalanced data)
roc_auc = roc_auc_score(Y_test, Y_pred_proba)
print(f"ROC AUC Score: {roc_auc:.4f}")

# 2. Classification Report (Precision, Recall, F1-Score)
print("\nClassification Report:")
print(classification_report(Y_test, Y_pred))

# 3. Confusion Matrix Visualization
cm = confusion_matrix(Y_test, Y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No Edge (0)', 'Edge Paid (1)'],
            yticklabels=['No Edge (0)', 'Edge Paid (1)'])
plt.title(f'Confusion Matrix (ROC AUC: {roc_auc:.4f})')
plt.ylabel('True label')
plt.xlabel('Predicted label')

# Keep the savefig line, but move it before the display
plt.savefig('lr_confusion_matrix.png') # Added 'lr_' prefix for clarity


#  6.2 Feature Interpretation (Logistic Regression Coefficients) 
print("\n### 6.2 Feature Interpretation (Logistic Regression Coefficients)")

# 1. Get feature names (including one-hot encoded ones)
# FIXED: Replaced 'best_model' with 'best_lr_model'
feature_names = best_lr_model.named_steps['preprocessor'].get_feature_names_out()

# 2. Get coefficients from the classifier step
# FIXED: Replaced 'best_model' with 'best_lr_model'
coefficients = best_lr_model.named_steps['classifier'].coef_[0]

# 3. Create a DataFrame for easy viewing and sorting
coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})
coef_df['Abs_Coefficient'] = np.abs(coef_df['Coefficient'])
coef_df = coef_df.sort_values(by='Abs_Coefficient', ascending=False).drop(columns='Abs_Coefficient')

print("\nTop 10 Most Influential Features:")
# NOTE: If your data is too small (like the sample data used earlier), this might fail or be empty.
# Assuming your full ML_clean_data.xlsx file is loaded correctly.
print(coef_df.head(10).to_markdown(index=False))
### 6.1 Model Performance on Test Set (Logistic Regression)
ROC AUC Score: 0.7626

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.70      0.78        33
           1       0.23      0.50      0.32         6

    accuracy                           0.67        39
   macro avg       0.56      0.60      0.55        39
weighted avg       0.78      0.67      0.71        39


### 6.2 Feature Interpretation (Logistic Regression Coefficients)

Top 10 Most Influential Features:
| Feature               |   Coefficient |
|:----------------------|--------------:|
| cat__espn_favors_home |    -1.11484   |
| num__home_spread      |     0.473622  |
| num__away_vig         |     0.365418  |
| num__home_vig         |     0.116031  |
| num__away_spread      |    -0.050906  |
| num__espn_favor_mag   |    -0.0181921 |
| num__away_dk_pct      |     0         |
| num__home_dk_pct      |     0         |
| cat__dk_favors_home   |     0         |
[Figure: confusion matrix heatmap for the Logistic Regression test-set predictions]
  • Performance: The Logistic Regression provides a poor filter for the app. Its low precision of 0.23 would quickly erode user trust and bankroll, as most of its predictions for a paid edge would be incorrect. This model is only useful as a justification for moving to a more advanced, non-linear approach.

  • Features: The linear model reveals that market sentiment (espn_favors) and bookmaker pricing risk (away_vig, home_spread) are the dominant linear drivers of a paid edge. Crucially, the simple odds percentage (dk_pct) was deemed irrelevant in a linear context, which strongly suggests that the information in those odds can only be unlocked through non-linear feature interactions. This further motivates the use of the Random Forest.

7. Random Forest Model & Tuning

Model Choice: RandomForestClassifier

We moved to an ensemble method to capture non-linear relationships that the simple Logistic Regression can't see. Random Forest averages many decision trees, which makes it less prone to overfitting and excellent at picking up complex interactions between features (e.g., how the magnitude of the ESPN favoritism only matters when the home vig is high).

Tuning with GridSearchCV (Advanced)

We again use GridSearchCV, but this time, we tune parameters specific to the ensemble method:

  • n_estimators (number of trees)

  • max_depth (tree complexity)

  • class_weight='balanced': This is critical. We set it on the estimator itself (rather than tuning it) to explicitly tell the Random Forest to pay extra attention to the rare Y=1 (paid edge) cases, ensuring it doesn't ignore the very thing SafeBet is trying to find.

In [9]:
from sklearn.ensemble import RandomForestClassifier
# NOTE: Pipeline and preprocessor are assumed to be defined from prior cells.

#  4. CREATE RANDOM FOREST PIPELINE 
# Using class_weight='balanced' to address the imbalance in the 'edge_paid_out' target.
rf = RandomForestClassifier(random_state=42, class_weight='balanced') 
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf)
])

#  5. DEFINE HYPERPARAMETER SEARCH GRID 
# Tuning the number of trees (n_estimators), depth (max_depth), and split criteria
rf_param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, 15],
    'classifier__min_samples_split': [5, 10]
}

#  6. DEFINE GRID SEARCH AND FIT 
rf_grid_search = GridSearchCV(
    rf_pipeline,
    rf_param_grid,
    cv=5,
    scoring='roc_auc', 
    verbose=2, # Increased verbosity for more output
    n_jobs=-1
)

print("--- Starting Random Forest Grid Search (8. Model & Tuning) ---")
# Ensure X_train and Y_train are defined from a previous cell.
rf_grid_search.fit(X_train, Y_train)

#  7. OUTPUT TUNING RESULTS AND DEFINE MODEL VARIABLE (FIXED)
# FIX: Explicitly define the variable needed for plotting.
best_rf_model = rf_grid_search.best_estimator_

print("\n--- Random Forest Grid Search Results ---")
print(f"Best Cross-Validation ROC AUC Score: {rf_grid_search.best_score_:.4f}")
print("Best Parameters:")
for param, value in rf_grid_search.best_params_.items():
    print(f"  {param}: {value}")
--- Starting Random Forest Grid Search (8. Model & Tuning) ---
Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END classifier__max_depth=5, classifier__min_samples_split=5, classifier__n_estimators=100; total time=   0.1s
[... remaining parallel cross-validation fit logs (90 fits total, each under one second) omitted ...]
[CV] END classifier__max_depth=15, classifier__min_samples_split=10, classifier__n_estimators=100; total time=   0.2s
[CV] END classifier__max_depth=15, classifier__min_samples_split=5, classifier__n_estimators=300; total time=   0.5s
[CV] END classifier__max_depth=15, classifier__min_samples_split=10, classifier__n_estimators=100; total time=   0.2s
[CV] END classifier__max_depth=15, classifier__min_samples_split=5, classifier__n_estimators=300; total time=   0.5s
[CV] END classifier__max_depth=15, classifier__min_samples_split=5, classifier__n_estimators=300; total time=   0.5s
[CV] END classifier__max_depth=15, classifier__min_samples_split=10, classifier__n_estimators=100; total time=   0.2s
[CV] END classifier__max_depth=15, classifier__min_samples_split=10, classifier__n_estimators=100; total time=   0.2s
[CV] END classifier__max_depth=15, classifier__min_samples_split=5, classifier__n_estimators=300; total time=   0.5s
[CV] END classifier__max_depth=15, classifier__min_samples_split=5, classifier__n_estimators=300; total time=   0.5s
[CV] END classifier__max_depth=15, classifier__min_samples_split=10, classifier__n_estimators=200; total time=   0.3s
[CV] END classifier__max_depth=15, classifier__min_samples_split=10, classifier__n_estimators=200; total time=   0.3s
[CV] END classifier__max_depth=15, classifier__min_samples_split=10, classifier__n_estimators=200; total time=   0.3s
[CV] END classifier__max_depth=15, classifier__min_samples_split=10, classifier__n_estimators=200; total time=   0.3s
[CV] END classifier__max_depth=15, classifier__min_samples_split=10, classifier__n_estimators=200; total time=   0.3s
[CV] END classifier__max_depth=15, classifier__min_samples_split=10, classifier__n_estimators=300; total time=   0.5s
[CV] END classifier__max_depth=15, classifier__min_samples_split=10, classifier__n_estimators=300; total time=   0.5s
[CV] END classifier__max_depth=15, classifier__min_samples_split=10, classifier__n_estimators=300; total time=   0.4s
[CV] END classifier__max_depth=15, classifier__min_samples_split=10, classifier__n_estimators=300; total time=   0.4s
[CV] END classifier__max_depth=15, classifier__min_samples_split=10, classifier__n_estimators=300; total time=   0.4s

--- Random Forest Grid Search Results ---
Best Cross-Validation ROC AUC Score: 0.7808
Best Parameters:
  classifier__max_depth: 5
  classifier__min_samples_split: 10
  classifier__n_estimators: 300

The tuning process found a robust Random Forest configuration. The relatively shallow max_depth (5) and higher min_samples_split (10) suggest the model's performance gain is not driven by overfitting to the training data.

This optimized Random Forest configuration is now positioned to serve as the final production model for PocketSense, as it is designed to capture the complex market dynamics needed to isolate the valuable 15.9% of edge opportunities that actually pay out.

8. Random Forest Evaluation and Interpretation¶

Evaluation: The Non-Linear Payoff¶

  • The Function: roc_auc_score() and feature_importances_

    • We use the same roc_auc_score() function to see whether the Random Forest model's performance significantly surpasses the baseline. If its AUC is materially better than the one obtained from the Logistic Regression, we've justified the move to the more complex model.
  • Interpretation: Feature Importance

      Since the Random Forest is too complex for simple coefficients, we use Gini Importance. This metric tells us which features contributed the most to improving the prediction across all the trees. The features ranked highest here represent the ultimate predictive factors driving the SafeBet edge, giving us the most powerful, non-linear business intelligence.
    
In [10]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd  # needed below for the feature-importance DataFrame
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

#  1. RETRIEVE BEST MODEL 
# NOTE: This assumes the previous cell (Random Forest Grid Search) was run successfully 
# and defined 'rf_grid_search' and 'X_test'/'Y_test'.
best_rf_model = rf_grid_search.best_estimator_

#  2. PERFORMANCE METRICS 
print("\n### 9.1 Random Forest Performance on Test Set")

# Predict probabilities on the unseen Test Set
Y_pred_proba_rf = best_rf_model.predict_proba(X_test)[:, 1]

# Use a standard 0.5 classification threshold for the report
Y_pred_rf = (Y_pred_proba_rf >= 0.5).astype(int)

# Calculate ROC AUC Score
roc_auc_rf = roc_auc_score(Y_test, Y_pred_proba_rf)
print(f"Random Forest Test ROC AUC Score: {roc_auc_rf:.4f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(Y_test, Y_pred_rf))

#  3. CONFUSION MATRIX VISUALIZATION 
cm_rf = confusion_matrix(Y_test, Y_pred_rf)
plt.figure(figsize=(6, 5))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Reds',
            xticklabels=['No Edge (0)', 'Edge Paid (1)'],
            yticklabels=['No Edge (0)', 'Edge Paid (1)'])
plt.title(f'RF Confusion Matrix (ROC AUC: {roc_auc_rf:.4f})')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.savefig('rf_confusion_matrix.png')


#  4. FEATURE IMPORTANCE ANALYSIS 
print("\n### 9.2 Random Forest Feature Importance")

# Get feature names (including one-hot encoded ones)
feature_names_rf = best_rf_model.named_steps['preprocessor'].get_feature_names_out()

# Get feature importances (Gini importance)
importances = best_rf_model.named_steps['classifier'].feature_importances_

# Create a DataFrame for easy viewing and sorting
feature_importance_df = pd.DataFrame({
    'Feature': feature_names_rf,
    'Importance': importances
})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nTop 10 Most Influential Features (Random Forest):")
print(feature_importance_df.head(10).to_markdown(index=False))

#  5. FEATURE IMPORTANCE PLOT 
plt.figure(figsize=(8, 6))
# Top 10 features; assigning hue and disabling the legend avoids
# the seaborn palette deprecation warning
sns.barplot(
    x='Importance', 
    y='Feature', 
    hue='Feature',
    data=feature_importance_df.head(10), 
    palette='viridis',
    legend=False
)
plt.title('Top 10 Random Forest Feature Importances')
plt.tight_layout()
plt.savefig('rf_feature_importance.png')

print("\nRandom Forest evaluation complete: Results printed and plots saved.")

# Comparison to baseline:
# assumes the Logistic Regression evaluation saved its AUC as 'lr_roc_auc'.
try:
    print(f"\nBaseline Logistic Regression ROC AUC: {lr_roc_auc:.4f}")
except NameError:
    print("\nNOTE: The Logistic Regression AUC (lr_roc_auc) variable was not found for comparison.")
    print("Please ensure the LR evaluation step saved its ROC AUC score to a variable named 'lr_roc_auc'.")
print(f"Random Forest ROC AUC: {roc_auc_rf:.4f}")
### 9.1 Random Forest Performance on Test Set
Random Forest Test ROC AUC Score: 0.9242

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.94      0.93        33
           1       0.60      0.50      0.55         6

    accuracy                           0.87        39
   macro avg       0.76      0.72      0.74        39
weighted avg       0.86      0.87      0.87        39


### 9.2 Random Forest Feature Importance

Top 10 Most Influential Features (Random Forest):
| Feature               |   Importance |
|:----------------------|-------------:|
| num__espn_favor_mag   |    0.278002  |
| num__home_dk_pct      |    0.180664  |
| num__away_dk_pct      |    0.158661  |
| num__away_spread      |    0.0932464 |
| num__home_spread      |    0.0852537 |
| cat__espn_favors_home |    0.057238  |
| cat__dk_favors_home   |    0.0522554 |
| num__away_vig         |    0.0498274 |
| num__home_vig         |    0.044852  |

Random Forest evaluation complete: Results printed and plots saved.

NOTE: The Logistic Regression AUC (lr_roc_auc) variable was not found for comparison.
Please ensure the LR evaluation step saved its ROC AUC score to a variable named 'lr_roc_auc'.
Random Forest ROC AUC: 0.9242
[Figure: RF confusion matrix heatmap]

[Figure: Top 10 Random Forest feature importances]
  • Performance: The Random Forest model delivers a statistically and commercially superior solution. The 0.9242 AUC lets the app confidently rank and prioritize games for its users, and the 0.60 precision on the positive class suggests the recommendations are reliable enough to build user trust and grow bankrolls safely.

  • The Non-Linear Payoff

The feature analysis provides the final validation for our modeling choice:

  • Non-Linearity Confirmed: The significant predictive power of features that were completely zeroed out by the L1-regularized Logistic Regression (like the implied probabilities) confirms that the market efficiency is locked within non-linear feature interactions, which the Random Forest successfully captured.

  • Market Disagreement is Key: The fact that the magnitude of disagreement (espn_favor_mag) is the number one feature highlights that the PocketSense value proposition—identifying discrepancies between bookmaker pricing and perceived value—is precisely where the statistical edge lies.

9. Model Comparison¶

9.1. The Combined ROC Curve¶

To visually compare the Logistic Regression baseline against the Random Forest model, we plot both ROC curves on the same axes. The curve closest to the upper-left corner (high True Positive Rate, low False Positive Rate) is the superior classifier, and the difference in area under the curves (AUC) quantifies the performance gain offered by the Random Forest's non-linear approach.

In [11]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# --- 1. Get Probability Predictions from both models ---
# LR Baseline Probabilities
Y_proba_lr = best_lr_model.predict_proba(X_test)[:, 1]
# RF Advanced Probabilities
Y_proba_rf = best_rf_model.predict_proba(X_test)[:, 1]

# --- 2. Calculate AUC Scores ---
auc_lr = roc_auc_score(Y_test, Y_proba_lr)
auc_rf = roc_auc_score(Y_test, Y_proba_rf)

# --- 3. Calculate ROC Curve Points (TPR vs. FPR) ---
fpr_lr, tpr_lr, _ = roc_curve(Y_test, Y_proba_lr)
fpr_rf, tpr_rf, _ = roc_curve(Y_test, Y_proba_rf)

# --- 4. Plotting ---
plt.figure(figsize=(8, 8))

# Plot the Random Forest curve first (should be better)
plt.plot(fpr_rf, tpr_rf, color='darkred', lw=2,
         label=f'Random Forest (AUC = {auc_rf:.4f})')

# Plot the Logistic Regression curve
plt.plot(fpr_lr, tpr_lr, color='blue', lw=2,
         label=f'Logistic Regression (AUC = {auc_lr:.4f})')

# Plot the random chance line (diagonal)
plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity/Recall)')
plt.title('ROC Curve Comparison: Baseline vs. Random Forest')
plt.legend(loc='lower right')
plt.grid(True)
plt.savefig('combined_roc_curve.png')

print("Combined ROC Curve plot saved to 'combined_roc_curve.png'.")
Combined ROC Curve plot saved to 'combined_roc_curve.png'.
[Figure: ROC curve comparison, baseline vs. Random Forest]

We achieved a 16.16-point jump in ROC AUC (0.7626 to 0.9242).

What the 0.7626 Told Us: our Logistic Regression was only okay. It was a decent, but often wrong, first-pass filter. If we launched PocketSense with it, we'd be sending users too many "false alarms," hurting their bankrolls and their trust in the app. The message was clear: the problem is non-linear.

What the 0.9242 Achieved: this is a strong, production-ready score. What that number actually buys us is confidence.

    It means that if you take one game that will pay out and one random game that won't, the Random Forest model is 92.42% likely to score the profitable one higher.

    This is the difference between guessing and having a significant, strategic advantage in the market.
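The "92.42% likely" reading is the standard pairwise interpretation of ROC AUC: it equals the probability that a randomly chosen positive case is scored higher than a randomly chosen negative one. A minimal sketch with toy labels and scores (illustrative values, not our model's predictions) confirms the two views agree:

```python
import numpy as np
from sklearn.metrics import roc_auc_score

# Toy labels and scores (illustrative only)
y_true = np.array([0, 0, 0, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.3])

# Pairwise interpretation: fraction of (positive, negative) pairs
# where the positive outscores the negative (ties count half)
pos = y_score[y_true == 1]
neg = y_score[y_true == 0]
pairs = [(p > n) + 0.5 * (p == n) for p in pos for n in neg]
manual_auc = np.mean(pairs)

# Matches sklearn's trapezoidal ROC AUC exactly
assert np.isclose(manual_auc, roc_auc_score(y_true, y_score))
print(manual_auc)  # 4 of 6 pairs ranked correctly
```

Here 4 of the 6 positive-negative pairs are ranked correctly, so both computations give 0.6667.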

Insights:¶

The Market is Complicated: The Logistic Regression failed because the betting market isn't simple. It’s not just "high vig equals profit." It's more like, "A high vig only equals profit when the ESPN expert disagreement is also at a certain level, and the spread is favoring the underdog."

The Random Forest as the Solution: Our Random Forest is essentially a "committee of 300 experts" (one per tree) looking for those exact, secret handshakes between the features. It found the hidden, non-linear patterns that the simple, linear model couldn't even see.

9.2. Precision-Recall Curve (PR Curve)¶

For imbalanced data like our edge_paid_out target, the PR Curve is often considered more informative than the ROC Curve. The ROC curve can be misleading when true negatives (the vast majority of Y=0 cases) dominate the data. The PR curve focuses only on the positive class (Y=1).

Precision vs. Recall: This curve shows the trade-off between:

  • Precision: Out of all the games the model predicted would be a paid edge, how many actually were? (Avoids false positives)

  • Recall: Out of all the actual paid edge games, how many did the model find? (Avoids false negatives)

In a betting application, high Precision is vital—we don't want to tell users to bet on non-existent edges. The PR Curve helps us select the optimal probability threshold to maximize precision while maintaining reasonable recall.
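Plugging in the counts implied by the Random Forest's classification report above (6 actual paid edges with recall 0.50, so 3 caught and 3 missed, plus 2 false alarms) makes the two definitions concrete:

```python
# Counts implied by the classification report above
# (class 1: support=6, recall=0.50, precision=0.60)
tp = 3   # paid edges the model flagged
fn = 3   # paid edges the model missed
fp = 2   # flagged games that did not pay out

precision = tp / (tp + fp)   # 3 / 5 = 0.60
recall = tp / (tp + fn)      # 3 / 6 = 0.50

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}")
```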

In [12]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc

# --- 1. Get Probability Predictions (Focus on the best model) ---
Y_proba_rf = best_rf_model.predict_proba(X_test)[:, 1]
Y_test_binary = Y_test.astype(int)

# --- 2. Calculate Precision, Recall, and Thresholds ---
precision, recall, thresholds = precision_recall_curve(Y_test_binary, Y_proba_rf)

# Calculate the Area Under the Precision-Recall Curve (AUPRC)
auprc = auc(recall, precision)

# --- 3. Plotting ---
plt.figure(figsize=(8, 6))

# Plot the Precision-Recall curve
plt.plot(recall, precision, color='darkorange', lw=2,
         label=f'Random Forest AUPRC = {auprc:.4f}')

# Plot the no-skill baseline (the fraction of positive cases in the data)
no_skill = Y_test_binary.sum() / len(Y_test_binary)
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', color='gray', label='No Skill Baseline')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall (True Positive Rate)')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve (Random Forest)')
plt.legend(loc='lower left')
plt.grid(True)
plt.savefig('rf_precision_recall_curve.png')

print("Random Forest Precision-Recall Curve plot saved to 'rf_precision_recall_curve.png'.")
Random Forest Precision-Recall Curve plot saved to 'rf_precision_recall_curve.png'.
[Figure: Precision-Recall curve (Random Forest)]

Focus on the Signal: The PR Curve ignores all the games where nothing happened and focuses entirely on the games where we had a chance to make money (Y=1). The fact that our Random Forest curve sits so far above the no-skill baseline (the bare positive rate in the test set) is the final proof that our non-linear modeling strategy worked.

  • Achieving Confidence (60% Precision): We saw in the evaluation that the Random Forest can deliver 60% Precision. What that means for our app is huge: when we tell a user there's a profitable edge, we're right 60% of the time. That level of confidence is more than enough to beat the bookmaker's "vig" and reliably grow a bankroll. The PR Curve shows we can achieve this without sacrificing too much Recall.

  • Strategic Flexibility: The high, smooth trajectory of our PR Curve gives us strategic options.

    If we want to be extremely conservative for a new user, we can push the prediction threshold up to get 70% or 80% Precision, even if we miss more edges (lower Recall).

    If we want to be aggressive, we know exactly how much our Precision drops if we try to catch 60% or 70% of all available edges.

The PR Curve validates our model's commercial viability. We are not just ranking well (the 0.9242 ROC AUC proves that); we are predicting with high confidence. This is the assurance we need to offer the users of our app. The model is reliable, and that's the most important insight of all.
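The threshold tuning described above can be read straight off the arrays that precision_recall_curve returns: pick the lowest threshold whose precision meets the target, which preserves the most recall. The data below is synthetic (a stand-in for Y_test_binary and Y_proba_rf), since the point is the selection logic, not our results:

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

rng = np.random.default_rng(0)
# Synthetic stand-ins for Y_test_binary and Y_proba_rf (illustrative only)
y_true = rng.integers(0, 2, size=200)
y_score = np.clip(y_true * 0.4 + rng.uniform(0, 0.6, size=200), 0, 1)

precision, recall, thresholds = precision_recall_curve(y_true, y_score)

target_precision = 0.80
# thresholds has one fewer entry than precision/recall;
# drop the final (precision=1, recall=0) point to align them
ok = precision[:-1] >= target_precision
if ok.any():
    idx = np.argmax(ok)  # first True = lowest qualifying threshold
    print(f"Threshold {thresholds[idx]:.3f} -> "
          f"precision {precision[idx]:.2f}, recall {recall[idx]:.2f}")
else:
    print("Target precision not reachable on this data.")
```

The same loop run at target_precision = 0.70 or 0.60 traces out exactly the conservative-to-aggressive spectrum discussed above.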

10. Model Implementation¶

In [ ]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Features list 
numerical_features = [
    'away_dk_pct', 'home_dk_pct', 'espn_favor_mag', 'away_spread',
    'home_spread', 'away_vig', 'home_vig'
]
categorical_features = [
    'espn_favors', 'dk_favors'
]
all_features = numerical_features + categorical_features

#  2. MODEL DEFINITION AND TRAINING (Required for fit/transform) 

# Load training data
df = pd.read_excel('/Users/urielulloa/Desktop/Machine_Learning_Final/ML_clean_data.xlsx')
Y = df['edge_paid_out']
X = df[all_features]

# Split data (using X, Y to define the training set for the model pipeline)
X_train, _, Y_train, _ = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

# Define Preprocessor Pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[('scaler', StandardScaler())]), numerical_features),
        ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))]), categorical_features)
    ],
    remainder='passthrough'
)

# Define and Train Final Model (using best parameters)
rf = RandomForestClassifier(
    random_state=42, 
    class_weight='balanced',
    n_estimators=300, max_depth=5, min_samples_split=10
)
best_rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf)
])

# Fit the entire pipeline to the training data
best_rf_model.fit(X_train, Y_train)
print("Model trained and ready for forecasting.")


#  3. AUTOMATED FORECASTING STEP 

# Load the NEW game data automatically from the Excel file
# (defined here so the forecast printout below can reference the path)
new_game_file_path = '/Users/urielulloa/Desktop/Machine_Learning_Final/new_data.xlsx'
X_new_raw = pd.read_excel(new_game_file_path)

# AUTOMATION: Select and order ONLY the required features
X_new_automated = X_new_raw[all_features]


# Get the probability of the edge paying out (Class 1)
Y_proba_new = best_rf_model.predict_proba(X_new_automated)[:, 1][0]

# Get the predicted class (0 or 1)
Y_pred_new = best_rf_model.predict(X_new_automated)[0]


#  4. OUTPUT INSIGHTS 

print("\n-------------------------------------------------")
print(f"AUTOMATED FORECAST for game from: {new_game_file_path}")
print("-------------------------------------------------")

print(f"Input Features:\n{X_new_automated.to_string()}")
print("\n--- Model Prediction ---")
print(f"Probability of Edge Paying Out (P(Y=1)): {Y_proba_new:.4f}")

if Y_pred_new == 1:
    print("\nPredicted Class: 1 (PAID EDGE LIKELY)")
    print("INSIGHT: PocketSense recommends placing a bet on this game.")
else:
    print("\nPredicted Class: 0 (NO PAID EDGE LIKELY)")
    print("INSIGHT: App recommends avoiding this game.")
Model trained and ready for forecasting.

-------------------------------------------------
AUTOMATED FORECAST for game from: /Users/urielulloa/Desktop/Machine_Learning_Final/new_data.xlx
-------------------------------------------------
Input Features:
   away_dk_pct  home_dk_pct  espn_favor_mag  away_spread  home_spread  away_vig  home_vig espn_favors dk_favors
0     0.259128     0.740872       -0.126872          7.5         -7.5      -108      -112        away      home

--- Model Prediction ---
Probability of Edge Paying Out (P(Y=1)): 0.4060

Predicted Class: 0 (NO PAID EDGE LIKELY)
INSIGHT: App recommends avoiding this game.
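A natural next step is to run this forecast over a batch of games and surface only those above the 0.5 cutoff, ranked by P(Y=1), which is how the app would prioritize opportunities for the user. Sketched here with hypothetical matchups and probabilities (placeholders, e.g. from best_rf_model.predict_proba on a batch of upcoming games):

```python
import pandas as pd

# Hypothetical per-game probabilities (illustrative placeholders)
games = pd.DataFrame({
    "matchup": ["A @ B", "C @ D", "E @ F", "G @ H"],
    "p_edge": [0.41, 0.86, 0.12, 0.63],
})

threshold = 0.5  # same cutoff used for the single-game forecast above
board = (games[games["p_edge"] >= threshold]
         .sort_values("p_edge", ascending=False)
         .reset_index(drop=True))
print(board.to_string(index=False))
```

Only the two games clearing the threshold survive, with the highest-confidence edge listed first.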