MA 707 - Problem Set 1¶

Bank Marketing¶

This notebook illustrates the use of a logistic regression model to help a bank determine which customers are likely to deposit (binary outcome: yes or no).

The following is the problem description:

Financial institutions heavily rely on accurate models to identify potential customers for new offerings, as targeted marketing can save resources and improve conversion rates. A direct marketing campaign is being conducted by a bank in which customers are contacted by phone to promote a term deposit offer (the customer receives a favorable fixed-return rate in exchange for locking their money for a period of time). The objective is to predict whether a customer will purchase the term deposit based on various personal and financial attributes.

Use the customer data collected to-date in the marketing campaign (bank_train.csv) to build a predictive model that can help the bank determine which customers are likely to deposit. Your model should show a command of the concepts discussed in the first four weeks of the semester: data preprocessing, classifying algorithms (use logistic regression), model regularization, and hyperparameter tuning. Then use the model to predict whether customers yet to be contacted (bank_new.csv) will deposit or not. Using tools such as Excel and Tableau to explore the data, rather than writing python code, is extremely ok. Using AI is ok to assist with coding, but you may not use it to attempt to produce a solution. You may not search the internet to figure out where the data came from (it is modified) or to find similar analyses.

1. Data Preparation and Splitting¶

First, let's load the training data and identify the feature types based on the information provided in the Data Dictionary. Then we select the target variable.

In [126]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
import numpy as np  
from sklearn.metrics import roc_curve

# Load the training data from the uploaded file

df_train = pd.read_csv("bank_train.csv")
print(f"Training data loaded. Shape: {df_train.shape}")


# Define Feature types based on the Data Dictionary
quant_vars = ['age', 'balance']
qual_vars = ['job', 'marital', 'education', 'default', 'home_loan', 'personal_loan', 'contact']

# Drop customer_id as it is an index, not a feature
df_train = df_train.drop('customer_id', axis=1)

# Define X and y
X = df_train[quant_vars + qual_vars]
y = df_train['deposit']

# Convert target 'deposit' to numeric (0 and 1) for classification: 'yes' -> 1, 'no' -> 0
y = y.map({'yes': 1, 'no': 0})
print(f"Target variable distribution (1=Deposit): \n{y.value_counts(normalize=True)}")
Training data loaded. Shape: (10000, 11)
Target variable distribution (1=Deposit): 
deposit
0    0.5248
1    0.4752
Name: proportion, dtype: float64

1.2. Train-Test Split¶

Given the classification nature of the problem - predicting whether a customer will purchase the deposit or not - we use stratification to ensure that the deposit classes are represented in the same proportions in both the training and testing sets.

In [127]:
# Split the data into 80% training and 20% testing
# Keep stratify=y to maintain the original class distribution in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f'\nTrain shape: {X_train.shape}')
print(f'Test shape: {X_test.shape}')
Train shape: (8000, 9)
Test shape: (2000, 9)
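As a quick sanity check (not part of the original assignment), we can confirm that stratification preserves the class balance in both splits. The sketch below uses a synthetic stand-in for the target with the same 52%/48% balance observed above; `y_demo` and `X_demo` are illustrative names, not the notebook's data.

```python
import numpy as np
from sklearn.model_selection import train_test_split

# Synthetic stand-in for the target, mirroring the ~52/48 balance seen above
rng = np.random.default_rng(42)
y_demo = rng.choice([0, 1], size=10_000, p=[0.5248, 0.4752])
X_demo = np.arange(len(y_demo)).reshape(-1, 1)

X_tr, X_te, y_tr, y_te = train_test_split(
    X_demo, y_demo, test_size=0.2, random_state=42, stratify=y_demo
)

# Stratification keeps the positive-class share nearly identical in each split
print(round(y_tr.mean(), 3), round(y_te.mean(), 3))
```

In the notebook itself, the equivalent check is simply `y_train.value_counts(normalize=True)` versus `y_test.value_counts(normalize=True)`.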

1.3. Exploratory Data Analysis¶

I will perform exploratory data analysis on the training set to uncover preliminary insights, using descriptive statistics and data visualization. Let's explore the relationship between the key features (both quantitative and qualitative) and the target variable (deposit) to derive initial assumptions.

Student and Retired job types have a significantly higher conversion rate, justifying their inclusion as features (via One-Hot Encoding).

In [128]:
# Use a cross-tabulation to see the conversion rate (deposit) by job type
job_deposit_crosstab = pd.crosstab(df_train['job'], y, normalize='index')

print("Conversion Rate by Job Type:")
print(job_deposit_crosstab.sort_values(by=1, ascending=False).round(4)*100)

# Visualize the conversion rate
plt.figure(figsize=(10, 6))
job_deposit_crosstab[1].sort_values().plot(kind='barh', color='skyblue')
plt.title('Deposit Conversion Rate by Job Type')
plt.xlabel('Probability of Deposit Given Job Type')
plt.ylabel('Job Type')
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
Conversion Rate by Job Type:
deposit            0      1
job                        
student        24.92  75.08
retired        33.43  66.57
unemployed     42.50  57.50
management     49.07  50.93
unknown        51.56  48.44
admin.         52.97  47.03
self-employed  53.71  46.29
technician     54.02  45.98
housemaid      59.67  40.33
services       60.24  39.76
entrepreneur   61.86  38.14
blue-collar    63.16  36.84
[Figure: horizontal bar chart of deposit conversion rate by job type]

Quantitative Feature Analysis:¶

Age and Balance Distribution: The positive class (deposit=1) appears to have a slightly higher median age and a notably higher median balance, indicating both features carry signal; their very different ranges also motivate standardizing them (StandardScaler).

In [129]:
# Create a temporary dataframe for plotting
plot_df = X_train.copy()
plot_df['deposit'] = y_train

plt.figure(figsize=(12, 5))

# 1. Box Plot for Age vs. Deposit Status
plt.subplot(1, 2, 1)
sns.boxplot(x='deposit', y='age', data=plot_df)
plt.title('Age Distribution by Deposit Status (0=No, 1=Yes)')
plt.xlabel('Deposit Status')
plt.ylabel('Age')

# 2. Box Plot for Balance vs. Deposit Status
# Balance can be negative, so use a signed log transform to tame the skew
# (plain np.log1p raises divide-by-zero/invalid warnings on balances <= -1)
plt.subplot(1, 2, 2)
signed_log_balance = np.sign(plot_df['balance']) * np.log1p(plot_df['balance'].abs())
sns.boxplot(x='deposit', y=signed_log_balance, data=plot_df)
plt.title('Signed Log(Balance) Distribution by Deposit Status')
plt.xlabel('Deposit Status')
plt.ylabel('sign(Balance) * Log(|Balance| + 1)')

plt.tight_layout()
plt.show()
[Figure: box plots of Age and log-transformed Balance by deposit status]

Target Variable.¶

Now that the relationship between the features and the target variable has been established, it is important to examine the target variable itself to determine which kind of predictive model, hyperparameters, and scoring method would be best suited to the problem.

In [130]:
deposit_counts = y.value_counts()
deposit_proportions = y.value_counts(normalize=True).round(4) * 100

plt.figure(figsize=(6, 4))
deposit_counts.plot(kind='bar', color=['lightcoral', 'mediumseagreen'])
plt.title('Distribution of Target Variable (Deposit)')
plt.xlabel('Deposit Status (0: No, 1: Yes)')
plt.ylabel('Number of Customers')
plt.xticks(ticks=[0, 1], labels=['No Deposit (0)', 'Yes Deposit (1)'], rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
[Figure: bar chart of the target variable distribution, No Deposit (0) vs. Yes Deposit (1)]

Initially, I assumed this would be an imbalanced classification problem, given the nature of the case: most marketing or sales campaigns are imbalanced, since the majority of people respond "no". As a result, the original plan was to use ROC AUC as the metric, since it is robust and measures discrimination ability across all thresholds. However, the distribution of the target variable (deposit) turns out to be nearly balanced (52% vs. 48%), so optimizing the "accuracy" score is a reasonable and interpretable choice here.
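To make accuracy interpretable, it helps to know the majority-class baseline that any useful model must beat. A minimal sketch with synthetic labels mirroring the 52/48 split observed above (`DummyClassifier` is an addition for illustration, not part of the original notebook):

```python
import numpy as np
from sklearn.dummy import DummyClassifier

rng = np.random.default_rng(0)
# Labels mirroring the roughly 52/48 split seen in the training data
y = rng.choice([0, 1], size=10_000, p=[0.5248, 0.4752])
X = np.zeros((len(y), 1))  # features are irrelevant to the dummy model

baseline = DummyClassifier(strategy='most_frequent').fit(X, y)
# A real model must beat roughly 52% accuracy to add any value here
print(round(baseline.score(X, y), 3))
```

On a heavily imbalanced target (say 90/10), this baseline would already sit at 90% accuracy, which is why ROC AUC was the initial plan.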

2. Preprocessing Pipeline¶

Define separate preprocessing for quantitative and qualitative features, then combine them with a ColumnTransformer.

2.1. Define feature pipeline¶

Quantitative: Use StandardScaler, since Logistic Regression is fit by gradient-based optimization and converges better on scaled inputs.

Qualitative: Use OneHotEncoder to convert nominal variables into binary features.

In [131]:
# 1. Quantitative Pipeline: Scaling
pre_quant = Pipeline([
    ('scaler', StandardScaler())
])

# 2. Qualitative Pipeline: One-Hot Encoding
pre_qual = Pipeline([
    ('one_hot_enc', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocess = ColumnTransformer(
    transformers=[
        ('quant', pre_quant, quant_vars),
        ('qual', pre_qual, qual_vars)
    ],
    verbose_feature_names_out=False,
    remainder='passthrough'
)

preprocess.set_output(transform='pandas')

print("Preprocessing ColumnTransformer defined and ready.")
Preprocessing ColumnTransformer defined and ready.

3. Model Pipeline and Hyperparameter Grid¶

Let's construct the final model pipeline and define the hyperparameter grid for regularization

3.1 Full Pipeline and Model Definition¶

The LogisticRegression model uses the saga solver, which supports both l1 and l2 penalties. max_iter is increased to ensure convergence during tuning.

In [132]:
# Full pipeline: Preprocessing -> Model
full_pipe = Pipeline([
    ('preprocess', preprocess),
    ('log_reg', LogisticRegression(solver='saga', random_state=42, max_iter=5000))
])

print("Full Classification Pipeline defined.")
Full Classification Pipeline defined.

3.2 Hyperparameter Grid for Regularization¶

I tune two hyperparameters related to the regularization lessons learned in class:

  • Penalty: The type of regularization - l1 (Lasso) or l2 (Ridge).
  • C: The inverse of the regularization strength. A smaller C implies a stronger penalty.
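The effect of C is easiest to see in the coefficients themselves: with an l1 penalty, shrinking C drives more weights exactly to zero. A small illustration on synthetic data (make_classification with arbitrary settings - not the bank data):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Synthetic problem: 20 features, only 5 of them informative
X, y = make_classification(n_samples=500, n_features=20, n_informative=5,
                           random_state=42)

zero_counts = {}
for C in [10.0, 0.1, 0.01]:
    clf = LogisticRegression(penalty='l1', solver='saga', C=C, max_iter=5000)
    clf.fit(X, y)
    zero_counts[C] = int(np.sum(clf.coef_ == 0))
    print(f"C={C}: {zero_counts[C]} of 20 coefficients driven to zero")
```

Smaller C (stronger penalty) produces a sparser model, which is exactly the feature-selection behavior Lasso-style regularization is known for.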
In [133]:
param_grid = [
    {
        'log_reg__penalty': ['l1', 'l2'],
        'log_reg__C': np.logspace(-2, 1, 10)  
    }
]

# Use accuracy as the primary scoring metric (justified by the balanced target distribution)
grid_search = GridSearchCV(
    full_pipe,
    param_grid,
    cv=5,                 
    scoring='accuracy',    
    refit=True            
)

grid_size = len(param_grid[0]['log_reg__penalty']) * len(param_grid[0]['log_reg__C'])
print(f"Grid Search initialized, testing {grid_size} combinations.")
Grid Search initialized, testing 20 combinations.

4. Hyperparameter Tuning and Model Training¶

Let's run the grid search to find the optimal combination of penalty type and strength.

In [134]:
grid_search.fit(X_train, y_train)

print('\nThese are the best Hyperparameters:')
print(grid_search.best_params_)

best_model = grid_search.best_estimator_
These are the best Hyperparameters:
{'log_reg__C': 0.21544346900318834, 'log_reg__penalty': 'l1'}

5. Model Evaluation¶

It's time to evaluate the performance of the best model on the test set using classification metrics.

In [135]:
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

# Calculate key metrics
roc_auc = roc_auc_score(y_test, y_pred_proba)
f1_scr = f1_score(y_test, y_pred, average='macro')
cm = confusion_matrix(y_test, y_pred)

print(f'\nTest Set Evaluation')
print(f'Test ROC AUC (Best Model): {roc_auc:.4f}')
print(f'Test F1 Score (Macro):     {f1_scr:.4f}')
print(f'Confusion Matrix:\n{cm}')
Test Set Evaluation
Test ROC AUC (Best Model): 0.6837
Test F1 Score (Macro):     0.6205
Confusion Matrix:
[[657 393]
 [365 585]]

5.1. ROC Curve¶

Let's plot the ROC curve to visualize the trade-off between the True Positive Rate and the False Positive Rate across different classification thresholds.

In [136]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier (AUC = 0.50)')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.show()
[Figure: ROC curve (AUC = 0.6837) versus the random-classifier diagonal]

The curve shows how much better the model is than random guessing: the closer the curve is to the top-left corner, the better the model's performance. Based on the test evaluation and the plot, the conclusions are:

  • ROC AUC: At 0.68, clearly above the 0.50 baseline (random guessing). The model has moderate discriminatory power: it learned to rank customers who are likely to deposit above those who are not, so when the bank uses this model it will generally place the actual "yes" customers higher on the priority list than the "no" customers.
  • F1 Score: Given the balanced dataset (see the EDA section), this is a moderate, balanced performance metric. It confirms that the model isn't overly biased toward avoiding false alarms or toward finding every possible "yes". The bank gets a model that avoids wasting too many resources on bad leads while still capturing a decent share of potential deposits.
  • Recall is slightly stronger than Precision. From the confusion matrix, Recall (finding potential customers) is about 61.6% (585/950), while Precision (being correct when predicting "yes") is about 59.8% (585/978).
  • In a nutshell: the model leans slightly toward maximizing revenue potential (accepting a few more False Positives) rather than strictly minimizing contact cost. This is often the preferred strategy in early-stage marketing campaigns.
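The precision and recall figures quoted above can be recomputed directly from the confusion matrix reported in the evaluation cell:

```python
import numpy as np

# Confusion matrix from the test-set evaluation: rows = actual, cols = predicted
cm = np.array([[657, 393],
               [365, 585]])
tn, fp = cm[0]
fn, tp = cm[1]

precision = tp / (tp + fp)   # correct when predicting "yes"
recall = tp / (tp + fn)      # share of actual "yes" customers found
f1_pos = 2 * precision * recall / (precision + recall)

print(f"Precision: {precision:.3f}")  # ≈ 0.598
print(f"Recall:    {recall:.3f}")     # ≈ 0.616
print(f"F1 (pos):  {f1_pos:.3f}")
```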

6. Prediction on New Data¶

Finally, load the bank_new.csv dataset and use the final tuned model (best_model) to make the term deposit predictions.

In [137]:
df_new = pd.read_csv("bank_new.csv")
print(f"\nNew data loaded. Shape: {df_new.shape}")


new_customer_ids = df_new['customer_id']
X_new = df_new.drop('customer_id', axis=1)

# Predict probabilities and final class labels
new_pred_proba = best_model.predict_proba(X_new)[:, 1]
new_predictions = best_model.predict(X_new)

# Map the numeric predictions back to 'yes'/'no'
prediction_labels = pd.Series(new_predictions).map({1: 'yes', 0: 'no'})

# Create the final results DataFrame
results_df = pd.DataFrame({
    'customer_id': new_customer_ids,
    'deposit_prediction': prediction_labels,
    'deposit_probability': new_pred_proba
})

print("\n--- Final Predictions on bank_new.csv ---")
print(results_df.head(10))

results_df.to_csv('bank_new_predictions.csv', index=False)
New data loaded. Shape: (1162, 10)

--- Final Predictions on bank_new.csv ---
   customer_id deposit_prediction  deposit_probability
0        10001                 no             0.489080
1        10002                 no             0.395572
2        10003                yes             0.540228
3        10004                 no             0.358699
4        10005                yes             0.563160
5        10006                yes             0.607738
6        10007                yes             0.784516
7        10008                 no             0.235639
8        10009                yes             0.532671
9        10010                 no             0.369695

Final Thoughts¶

  • Data Discovery: My initial assumption of a highly imbalanced dataset (common in marketing) was corrected by the EDA (Exploratory Data Analysis), which showed a nearly balanced 52%/48% split. This finding justified switching the grid-search optimization metric to accuracy, which is reliable and interpretable in a balanced scenario.
  • Processing: The Pipeline and ColumnTransformer ensured that every incoming dataset (training, test, and final prediction) was prepared consistently: categorical features encoded via One-Hot Encoding and numerical features standardized via StandardScaler.
  • Model Selection: LogisticRegression with the saga solver provided the flexibility to handle both l1 and l2 penalties.
  • Hyperparameter Tuning: GridSearchCV systematically explored a grid of C values and penalty types to find an appropriate level of regularization.
  • Discrimination: A test ROC AUC of 0.6837 confirms a moderate ability to discriminate between "yes" and "no" customers, providing a usable ranking of leads for the marketing team.
  • Balanced Outcome: A macro F1 score of 0.6205 and the confusion-matrix breakdown show a reasonable balance, with slightly higher Recall (finding customers) than Precision (avoiding false alarms). This is a desirable trade-off for a sales-driven initiative where maximizing revenue is often prioritized over strictly minimizing cost.
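If the bank later wants to tilt the trade-off further toward recall (or precision), it need not retrain: raising or lowering the default 0.5 decision threshold applied to predict_proba is enough. A sketch on synthetic scores standing in for `best_model.predict_proba(X_test)[:, 1]` and `y_test` (all names and data here are illustrative):

```python
import numpy as np

rng = np.random.default_rng(7)
# Synthetic true labels and "predicted probabilities" with some signal
y_true = rng.integers(0, 2, size=2_000)
proba = np.clip(y_true * 0.2 + rng.uniform(0, 0.8, size=2_000), 0, 1)

def recall_at(threshold):
    """Recall of the positive class when flagging proba >= threshold."""
    pred = (proba >= threshold).astype(int)
    tp = np.sum((pred == 1) & (y_true == 1))
    fn = np.sum((pred == 0) & (y_true == 1))
    return tp / (tp + fn)

# Lowering the threshold flags more customers as "yes", so recall rises
for t in [0.7, 0.5, 0.3]:
    print(f"threshold={t}: recall={recall_at(t):.3f}")
```

The `deposit_probability` column in the submission file exists precisely so the bank's team can apply a custom threshold like this.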

Adding predictions to the bank_submit.csv file¶

Used AI to accomplish this.

In [138]:
import pandas as pd

# --- Prerequisites ---
# ASSUMPTION: The 'best_model' variable, containing the fitted
# LogisticRegression Pipeline from the GridSearchCV step, is available
# in the notebook environment.

# 1. Load the new, unseen data (bank_new.csv)
try:
    df_new = pd.read_csv("bank_new.csv")
    print(f"Loaded new dataset with {len(df_new)} customers for prediction.")
except FileNotFoundError:
    print("Error: 'bank_new.csv' not found. Please check the file path.")
    raise

# 2. Separate customer IDs and features (X_new)
customer_ids = df_new['customer_id']
# We drop the ID column as it's not a feature for the model
X_new = df_new.drop('customer_id', axis=1)


# a. Predicted probabilities (for detailed lead ranking)
# We take the probability of the positive class (1, which is 'yes' deposit)
y_proba_new = best_model.predict_proba(X_new)[:, 1]

# b. Hard classifications (0 or 1, using the default 0.5 probability threshold)
y_pred_new = best_model.predict(X_new)

# 4. Map the binary predictions back to string labels ('yes'/'no')
deposit_predictions = pd.Series(y_pred_new).map({1: 'yes', 0: 'no'})

# 5. Create the submission DataFrame
df_submit = pd.DataFrame({
    'customer_id': customer_ids,
    'deposit_prediction': deposit_predictions, # The required hard classification
    'deposit_probability': y_proba_new.round(4) # Useful for bank team to set custom thresholds
})

# 6. Save the submission file
submission_filepath = "bank_submit.csv"
df_submit.to_csv(submission_filepath, index=False)

print(f"\n--- Submission Complete ---")
print(f"Successfully generated {submission_filepath} for {len(df_submit)} new customers.")
print(f"Preview of the submission file (first 5 rows):")
print(df_submit.head())
Loaded new dataset with 1162 customers for prediction.

--- Submission Complete ---
Successfully generated bank_submit.csv for 1162 new customers.
Preview of the submission file (first 5 rows):
   customer_id deposit_prediction  deposit_probability
0        10001                 no               0.4891
1        10002                 no               0.3956
2        10003                yes               0.5402
3        10004                 no               0.3587
4        10005                yes               0.5632