DEPLOYMENT READINESS ASSESSMENT¶

What this section does:

A technically strong model can still cause harm if deployed badly. This section examines what responsible deployment of this specific model would require before it could be safely put into production.

We do not have a live deployment environment to audit here. PaySim is a static historical dataset. What we can do is assess threshold sensitivity on the existing predictions and produce a clear prescriptive framework for what any institution deploying this model would need to address.

This section is presented as a deployment readiness checklist and a threshold sensitivity analysis. Both are directly required under BoG CISD 2026 §100.


In [14]:
# ── SECTION 1: LOAD CHECKPOINT ───────────────────────────────────────────────

import pandas as pd
import joblib
import shap
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from scipy.stats import ks_2samp
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference
from sklearn.metrics import matthews_corrcoef, log_loss, confusion_matrix, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
# Proxy bins/labels for originator balance groups (match the balance_group labels in s_test)
bins_proxy   = [-np.inf, 0.0, 50397.0, np.inf]
labels_proxy = ['Low-Balance', 'Mid-Balance', 'High-Balance']

df = pd.read_csv('checkpoint.csv')

print(f'Checkpoint loaded: {df.shape[0]:,} rows x {df.shape[1]} columns')
print(df.columns.tolist())
Checkpoint loaded: 6,362,620 rows x 11 columns
['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']
In [15]:
model = joblib.load("xgb_model.pkl")

X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')
results = pd.read_csv('model_results.csv')

y_pred = results['y_pred']
y_prob = results['y_prob']

s_test = results[['balance_group', 'tx_type_group']].astype(str)

print("Model + data loaded")
print(X_test.shape)
Model + data loaded
(1272524, 6)
In [16]:
FEATURES = [
    'step', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
    'oldbalanceDest', 'newbalanceDest'
]
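The `balance_group` labels in `s_test` match the `labels_proxy` defined in the load cell. As a hedged illustration only (the actual grouping was precomputed into `model_results.csv`, so this derivation is an assumption), the proxy can be applied to raw originator balances with `pd.cut`:

```python
import numpy as np
import pandas as pd

# Same proxy bins/labels as defined in the load cell above.
bins_proxy   = [-np.inf, 0.0, 50397.0, np.inf]
labels_proxy = ['Low-Balance', 'Mid-Balance', 'High-Balance']

# Hypothetical originator balances, for illustration only.
balances = pd.Series([0.0, 12_000.0, 50_397.0, 250_000.0])

# pd.cut uses right-closed intervals by default, so 0.0 lands in
# (-inf, 0] -> 'Low-Balance' and 50,397.0 in (0, 50397] -> 'Mid-Balance'.
groups = pd.cut(balances, bins=bins_proxy, labels=labels_proxy)
print(groups.tolist())
```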
In [18]:
# ── SECTION 11: THRESHOLD SENSITIVITY ─────────────────────────────────────────
# The default threshold for classification is 0.5.
# A bank might want to be more aggressive (lower threshold = catch more fraud, more false alarms)
# or more conservative (higher threshold = fewer false alarms, more fraud missed).
# We test a range of thresholds and show how key metrics change.

print('Threshold Sensitivity Analysis')
print('Showing how performance changes at different decision thresholds.\n')

thresholds_to_test = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
results_thresh = []

for t in thresholds_to_test:
    # Convert probabilities into binary predictions at threshold t
    y_pred_t = (y_prob >= t).astype(int)

    # Skip the degenerate case where no transaction is flagged at this threshold
    if y_pred_t.sum() == 0:
        continue

    # Global performance metrics
    tn_t, fp_t, fn_t, tp_t = confusion_matrix(y_test, y_pred_t).ravel()

    prec_t = tp_t / (tp_t + fp_t) if (tp_t + fp_t) > 0 else 0
    rec_t  = tp_t / (tp_t + fn_t) if (tp_t + fn_t) > 0 else 0
    mcc_t  = matthews_corrcoef(y_test, y_pred_t)

    # ── Low-Balance fairness metric (TPR) ────────────────────────────────
    lb_mask = s_test['balance_group'].astype(str) == 'Low-Balance'

    lb_yt = y_test[lb_mask].to_numpy().flatten()
    lb_yp = y_pred_t[lb_mask].to_numpy().flatten()

    lb_tp = int(((lb_yt == 1) & (lb_yp == 1)).sum())
    lb_fn = int(((lb_yt == 1) & (lb_yp == 0)).sum())

    lb_tpr = lb_tp / (lb_tp + lb_fn) if (lb_tp + lb_fn) > 0 else 0

    # Store results
    results_thresh.append({
        'threshold': t,
        'precision': prec_t,
        'recall': rec_t,
        'mcc': mcc_t,
        'fp': fp_t,
        'fn': fn_t,
        'lb_tpr': lb_tpr
    })

# Convert results to DataFrame
df_thresh = pd.DataFrame(results_thresh)

# Display results
print(f'{"Threshold":>10} {"Precision":>10} {"Recall":>8} {"MCC":>8} '
      f'{"FP":>8} {"FN":>8} {"LB TPR":>10}')
print('-' * 70)

for _, row in df_thresh.iterrows():
    print(f'{row["threshold"]:>10.1f} {row["precision"]:>10.4f} '
          f'{row["recall"]:>8.4f} {row["mcc"]:>8.4f} '
          f'{int(row["fp"]):>8,} {int(row["fn"]):>8,} {row["lb_tpr"]:>10.4f}')

print('\nLB TPR = True Positive Rate for Low-Balance users at each threshold.')
print('Note how lowering the threshold improves Low-Balance protection')
print('and what cost it introduces in false positives.')
Threshold Sensitivity Analysis
Showing how performance changes at different decision thresholds.

 Threshold  Precision   Recall      MCC       FP       FN     LB TPR
----------------------------------------------------------------------
       0.1     0.0843   0.9988   0.2881   17,828        2     0.8333
       0.2     0.1159   0.9976   0.3383   12,506        4     0.6667
       0.3     0.1458   0.9976   0.3799    9,606        4     0.6667
       0.4     0.1714   0.9976   0.4122    7,923        4     0.6667
       0.5     0.1977   0.9927   0.4419    6,617       12     0.6667
       0.6     0.2286   0.9903   0.4748    5,489       16     0.6667
       0.7     0.2716   0.9878   0.5170    4,353       20     0.5000
       0.8     0.3362   0.9830   0.5741    3,189       28     0.5000
       0.9     0.4557   0.9696   0.6641    1,903       50     0.1667

LB TPR = True Positive Rate for Low-Balance users at each threshold.
Note how lowering the threshold improves Low-Balance protection
and what cost it introduces in false positives.
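One way to reduce this table to a single operating point is a cost-weighted comparison. The sketch below is not part of the audit methodology: the unit costs `C_FP` and `C_FN` are invented for illustration, and only the FP/FN counts are transcribed from the table above (so the cell stands alone).

```python
import pandas as pd

# FP/FN counts transcribed from the sensitivity table above.
df_thresh = pd.DataFrame({
    'threshold': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'fp': [17828, 12506, 9606, 7923, 6617, 5489, 4353, 3189, 1903],
    'fn': [2, 4, 4, 4, 12, 16, 20, 28, 50],
})

# ASSUMED unit costs, purely illustrative: a false alarm costs 5 units
# of review effort, a missed fraud costs 500 units of loss.
C_FP, C_FN = 5, 500

# Expected cost at each threshold, and the cheapest operating point.
df_thresh['expected_cost'] = df_thresh['fp'] * C_FP + df_thresh['fn'] * C_FN
best = df_thresh.loc[df_thresh['expected_cost'].idxmin()]
print(f"Lowest-cost threshold under these assumptions: {best['threshold']:.1f}")
```

Under a different cost ratio the optimum moves, which is exactly why the deploying institution, not the auditor, must fix these costs before choosing a threshold.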
In [19]:
# ── SECTION 11: THRESHOLD CHART ───────────────────────────────────────────────
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle(
    'Section 11 — Threshold Sensitivity Analysis\n'
    'ClearBoxAI Audit CBA-2026-002',
    fontsize=12, fontweight='bold'
)

axes[0].plot(df_thresh['threshold'], df_thresh['precision'],
             color='#2196F3', lw=2, label='Precision')
axes[0].plot(df_thresh['threshold'], df_thresh['recall'],
             color='#F44336', lw=2, label='Recall')
axes[0].plot(df_thresh['threshold'], df_thresh['mcc'],
             color='#4CAF50', lw=2, label='MCC')
axes[0].set_xlabel('Decision Threshold')
axes[0].set_ylabel('Metric Value')
axes[0].set_title('Precision, Recall, and MCC vs Threshold', fontweight='bold')
axes[0].axvline(0.5, color='gray', linestyle='--', lw=1, label='Default (0.5)')
axes[0].legend()

axes[1].plot(df_thresh['threshold'], df_thresh['lb_tpr'],
             color='#FF6B6B', lw=2.5, marker='o', label='Low-Balance TPR')
axes[1].axhline(0, color='gray', linestyle='--', lw=1)
axes[1].set_xlabel('Decision Threshold')
axes[1].set_ylabel('True Positive Rate')
axes[1].set_title('Low-Balance Fraud Detection Rate vs Threshold\n'
                  'Does lowering the threshold help Low-Balance users?',
                  fontweight='bold')
axes[1].legend()

plt.tight_layout()
plt.savefig('fig_deploy_01_threshold.png', dpi=150, bbox_inches='tight')
plt.show()

Deployment Readiness Findings¶

Threshold Sensitivity:

The threshold analysis reveals whether the Low-Balance protection gap can be addressed simply by lowering the decision threshold. If Low-Balance TPR improves at lower thresholds, this is a potential short-term mitigation.

The chart reveals something important. Lowering the threshold from 0.5 to 0.1 does improve Low-Balance TPR, from 0.6667 to 0.8333, catching one additional Low-Balance fraud case, so threshold adjustment is a partial short-term mitigation. The cost, however, is severe. False positives jump from 6,617 at threshold 0.5 to 17,828 at threshold 0.1: more than 11,000 additional legitimate customers wrongly blocked in exchange for ten extra fraud catches overall, only one of them Low-Balance. That tradeoff is not acceptable without a more targeted approach. Threshold adjustment alone does not close the protection gap; it redistributes the harm.
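The redistribution claim can be checked with quick arithmetic on the two rows of the sensitivity table involved (all numbers below are taken from the table already reported above):

```python
# Marginal cost of moving from the default threshold (0.5) to the
# aggressive one (0.1), using the FP/FN counts from the table above.
fp_05, fn_05 = 6617, 12
fp_01, fn_01 = 17828, 2

extra_fp = fp_01 - fp_05   # additional legitimate customers flagged
extra_tp = fn_05 - fn_01   # additional fraud cases caught overall
print(f"{extra_fp:,} extra false positives for {extra_tp} extra catches")
print(f"~{extra_fp // extra_tp:,} wrongly flagged customers per extra fraud caught")
```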

Deployment Readiness Checklist:

Before this model can be deployed in any production environment, the following conditions must be met under BoG CISD 2026:

Requirement                                   | BoG CISD Reference    | Current Status
----------------------------------------------|-----------------------|----------------------------------------------------------------
Material bias finding addressed               | Annexure E §l(i)      | NOT MET — EOD = 0.3305 (more than three times the 0.10 threshold)
BoG notification submitted                    | §115(2)(b)            | REQUIRED
Low-Balance fraud coverage remediated         | Annexure E §g(iii)(3) | NOT MET
Model card completed with bias documentation  | Annexure E §j(ii)     | IN PROGRESS
Human oversight mechanism defined             | §100, Annexure E §k   | NOT MET
Kill-switch capability implemented            | Annexure E §k(ii)     | NOT MET
Fairness monitoring plan established          | Annexure E §l(i)      | NOT MET
Dispute resolution mechanism for AI decisions | Annexure E §l(ii)     | NOT MET

Verdict: Model is not deployment-ready in current form.
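To make the "fairness monitoring plan" row concrete, here is a minimal sketch of a per-batch check. It computes the equalized-odds difference directly in NumPy rather than via `fairlearn`, assumes exactly two groups, and uses toy batch data; only the 0.10 bound comes from the checklist above.

```python
import numpy as np

def eod(y_true, y_pred, groups):
    """Equalized-odds difference: the larger of the between-group gaps in
    true-positive rate and false-positive rate (exactly two groups assumed)."""
    rates = []
    for g in np.unique(groups):
        m = groups == g
        yt, yp = y_true[m], y_pred[m]
        tpr = yp[yt == 1].mean() if (yt == 1).any() else 0.0
        fpr = yp[yt == 0].mean() if (yt == 0).any() else 0.0
        rates.append((tpr, fpr))
    (tpr_a, fpr_a), (tpr_b, fpr_b) = rates
    return max(abs(tpr_a - tpr_b), abs(fpr_a - fpr_b))

# Toy scoring batch: group 'A' gets 3/4 of its frauds caught, group 'B' only 1/4.
y_true = np.array([1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0])
y_pred = np.array([1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0])
groups = np.array(['A'] * 6 + ['B'] * 6)

gap = eod(y_true, y_pred, groups)
print(f"Batch EOD = {gap:.2f}")
if gap > 0.10:
    print("ALERT: equalized-odds gap exceeds the 0.10 bound")
```

In production this check would run on each scored batch and feed the human-oversight and kill-switch mechanisms listed above; the alert wiring here is illustrative only.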


In [20]:
df.to_csv('checkpoint_v2.csv', index=False)
print("Checkpoint v2 saved.")
Checkpoint v2 saved.