PERFORMANCE EVALUATION¶
What this section does:
We measure how well the model performs overall. But before we look at the numbers, there is an important warning.
Why accuracy is not our metric: the full explanation
Recall our class of 1,000 students with 1 cheater. A teacher who marks every student as not cheating achieves 99.9% accuracy. They catch zero cheaters.
This is the accuracy paradox. Standard accuracy is almost useless when one outcome is extremely rare. We need tools that penalise the model for missing fraud, not just for being wrong in general.
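A minimal sketch of the paradox (illustrative only, not part of the audit pipeline): an all-clear classifier on a 1,000-student class with a single cheater.
import numpy as np
from sklearn.metrics import accuracy_score, matthews_corrcoef

# 1,000 students, exactly 1 cheater, and a "model" that flags nobody
y_true = np.zeros(1000, dtype=int)
y_true[0] = 1
y_pred = np.zeros(1000, dtype=int)

print(accuracy_score(y_true, y_pred))     # 0.999: looks excellent
print(matthews_corrcoef(y_true, y_pred))  # 0.0:   no predictive power at all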
This is the deep reason why aggregate performance metrics must always be paired with per-group analysis. A model can achieve an outstanding MCC while providing zero protection to an entire segment of users, because those users contribute so few fraud cases to the overall count that their failures barely register in the aggregate number.
We will demonstrate this precisely in the fairness and bias section.
Metric 1: Matthews Correlation Coefficient (MCC)
MCC accounts for all four outcomes: fraud correctly caught, legitimate correctly cleared, fraud missed, and legitimate customers wrongly blocked. Scale: negative 1 (perfectly wrong) to 0 (random guessing) to positive 1 (perfect). Our threshold: MCC above 0.50.
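For reference, MCC can be computed directly from the four confusion-matrix counts. The audit code below relies on sklearn's matthews_corrcoef; this small helper is purely illustrative.
import math

def mcc_from_counts(tp, tn, fp, fn):
    # MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN))
    denom = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return (tp * tn - fp * fn) / denom if denom else 0.0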
Metric 2: Log Loss
Measures both accuracy and confidence. A model that says it is 95% sure something is fraud and is wrong gets penalised far more heavily than one that says 55% sure. In fraud detection, confident wrong predictions are expensive. Our threshold: Log Loss below 0.40.
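To make the penalty concrete, here is the per-prediction log loss for a confident wrong call versus a hesitant one (illustrative numbers, not audit data).
import numpy as np

# True label: legitimate (0). The model calls "fraud" with different confidence.
confident_wrong = -np.log(1 - 0.95)  # about 3.00
hesitant_wrong  = -np.log(1 - 0.55)  # about 0.80
print(f'{confident_wrong:.2f} vs {hesitant_wrong:.2f}')  # the confident mistake costs almost 4x more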
# ── SECTION 1: LOAD CHECKPOINT ───────────────────────────────────────────────
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from scipy.stats import ks_2samp
from sklearn.metrics import matthews_corrcoef, log_loss, confusion_matrix, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
bins_proxy = [-np.inf, 0.0, 50397.0, np.inf]
labels_proxy = ['Low-Balance', 'Mid-Balance', 'High-Balance']
df = pd.read_csv('checkpoint.csv')
print(f'Checkpoint loaded: {df.shape[0]:,} rows x {df.shape[1]} columns')
print(df.columns.tolist())
Checkpoint loaded: 6,362,620 rows x 11 columns
['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']
# Load saved training results (test labels, predictions, probabilities and balance group)
import pandas as pd
results = pd.read_csv('model_results.csv')
y_test = results['y_test']
y_pred = results['y_pred']
y_prob = results['y_prob']
s_test = results[['balance_group']] # Re-creating s_test for the audit
print("All audit variables (including s_test) are now defined.")
All audit variables (including s_test) are now defined.
# Thresholds from SETUP, THRESHOLDS AND ENVIRONMENT Notebook section
THRESHOLDS = {
'SPD': 0.10, # Fairness
'EOD': 0.10, # Fairness
'MCC': 0.50, # Performance
'LOG_LOSS': 0.40, # Performance
'PR_AUC': 0.70, # Performance
'KS': 0.30, # Performance
}
# ── SECTION 7: PERFORMANCE EVALUATION ────────────────────────────────────────
mcc = matthews_corrcoef(y_test, y_pred)
ll = log_loss(y_test, y_prob)
cm_out = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm_out.ravel()
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
print('=' * 60)
print('PERFORMANCE METRICS — MCC AND LOG LOSS')
print('=' * 60)
print(f'\nMCC = {mcc:.4f} | Threshold > {THRESHOLDS["MCC"]} '
f'| {"PASS" if mcc > THRESHOLDS["MCC"] else "FAIL"}')
print(f'Log Loss = {ll:.4f} | Threshold < {THRESHOLDS["LOG_LOSS"]} '
f'| {"PASS" if ll < THRESHOLDS["LOG_LOSS"] else "FAIL"}')
print(f'\nConfusion Matrix Breakdown:')
print(f' Legitimate correctly cleared : {tn:>10,}')
print(f' Legitimate wrongly blocked : {fp:>10,} (False Alarms)')
print(f' Fraud missed completely : {fn:>10,} (Escaped Fraudsters)')
print(f' Fraud correctly caught : {tp:>10,}')
print(f'\n Precision: {precision:.4f}')
print(f' Recall : {recall:.4f}')
print('\nPer-group MCC breakdown (ClearBoxAI Internal):')
for g in ['Low-Balance', 'Mid-Balance', 'High-Balance']:
    mask = s_test['balance_group'].astype(str) == g
    fraud = y_test[mask].sum()
    if fraud > 1:
        g_mcc = matthews_corrcoef(y_test[mask], y_pred[mask])
        print(f' {g:<14}: MCC = {g_mcc:.4f} ({fraud} fraud cases in test)')
    else:
        print(f' {g:<14}: MCC = N/A (insufficient fraud cases: {fraud})')
# Visualization
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
fig.suptitle(
'Section 7 — Performance: MCC and Log Loss\n'
'ClearBoxAI Audit CBA-2026-002',
fontsize=13, fontweight='bold'
)
cm_labels = np.array([
[f'TRUE NEGATIVE\n{tn:,}\nLegitimate cleared',
f'FALSE POSITIVE\n{fp:,}\nWrongly blocked'],
[f'FALSE NEGATIVE\n{fn:,}\nFraud missed',
f'TRUE POSITIVE\n{tp:,}\nFraud caught']
])
sns.heatmap(cm_out, annot=cm_labels, fmt='', ax=axes[0], cmap='Blues',
cbar=False,
xticklabels=['Predicted: Legitimate', 'Predicted: Fraud'],
yticklabels=['Actual: Legitimate', 'Actual: Fraud'],
linewidths=1)
axes[0].set_title(f'Confusion Matrix\nMCC = {mcc:.4f}', fontweight='bold')
metric_vals = [mcc, ll]
metric_thresh = [THRESHOLDS['MCC'], THRESHOLDS['LOG_LOSS']]
metric_pass = [mcc > THRESHOLDS['MCC'], ll < THRESHOLDS['LOG_LOSS']]
bar_colors = ['#4CAF50' if p else '#F44336' for p in metric_pass]
bars = axes[1].bar(['MCC', 'Log Loss'], metric_vals,
color=bar_colors, edgecolor='black', alpha=0.85)
axes[1].plot([-0.4, 0.4], [THRESHOLDS['MCC']] * 2, 'k--', lw=2,
label=f'MCC threshold = {THRESHOLDS["MCC"]}')
axes[1].plot([0.6, 1.4], [THRESHOLDS['LOG_LOSS']] * 2, 'r--', lw=2,
label=f'Log Loss threshold = {THRESHOLDS["LOG_LOSS"]}')
for bar, val, passed in zip(bars, metric_vals, metric_pass):
    axes[1].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                 f'{val:.4f}\n{"PASS" if passed else "FAIL"}',
                 ha='center', va='bottom', fontsize=10, fontweight='bold')
axes[1].set_title('MCC and Log Loss vs Thresholds', fontweight='bold')
axes[1].set_ylabel('Value')
axes[1].legend()
plt.tight_layout()
plt.savefig('fig_perf_01_mcc_logloss.png', dpi=150, bbox_inches='tight')
plt.show()
============================================================
PERFORMANCE METRICS — MCC AND LOG LOSS
============================================================

MCC = 0.4419 | Threshold > 0.5 | FAIL
Log Loss = 0.0135 | Threshold < 0.4 | PASS

Confusion Matrix Breakdown:
  Legitimate correctly cleared : 1,264,264
  Legitimate wrongly blocked   :     6,617 (False Alarms)
  Fraud missed completely      :        12 (Escaped Fraudsters)
  Fraud correctly caught       :     1,631

  Precision: 0.1977
  Recall   : 0.9927

Per-group MCC breakdown (ClearBoxAI Internal):
  Low-Balance   : MCC = 0.1245 (6 fraud cases in test)
  Mid-Balance   : MCC = 0.2192 (225 fraud cases in test)
  High-Balance  : MCC = 0.6155 (1412 fraud cases in test)
# ── SECTION 7: PR-AUC AND KS STATISTIC ───────────────────────────────────────
# These measure discrimination power across all possible thresholds,
# not just at the default 0.5 cutoff.
prec_v, rec_v, _ = precision_recall_curve(y_test, y_prob)
pr_auc = auc(rec_v, prec_v)
fraud_scores = y_prob[y_test.values == 1]
legit_scores = y_prob[y_test.values == 0]
ks_stat, _ = ks_2samp(fraud_scores, legit_scores)
sorted_scores = np.sort(y_prob)
sample_idx = np.linspace(0, len(sorted_scores) - 1, 5000, dtype=int)
s_sample = sorted_scores[sample_idx]
f_cdf = np.array([np.mean(fraud_scores <= s) for s in s_sample])
l_cdf = np.array([np.mean(legit_scores <= s) for s in s_sample])
ks_score = s_sample[np.argmax(np.abs(f_cdf - l_cdf))]
print('=' * 60)
print('DISCRIMINATION POWER — PR-AUC AND KS STATISTIC')
print('=' * 60)
print(f'\nPR-AUC = {pr_auc:.4f} | Threshold > {THRESHOLDS["PR_AUC"]} '
f'| {"PASS" if pr_auc > THRESHOLDS["PR_AUC"] else "FAIL"}')
print(f' Random baseline: {y_test.mean():.4f}')
print(f' Model is {pr_auc / y_test.mean():.0f}x better than a random classifier')
print(f'\nKS Stat = {ks_stat:.4f} | Threshold > {THRESHOLDS["KS"]} '
f'| {"PASS" if ks_stat > THRESHOLDS["KS"] else "FAIL"}')
print(f' Maximum score separation at score = {ks_score:.4f}')
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle(
'Section 7 — Discrimination Power: PR-AUC and KS Statistic\n'
'ClearBoxAI Audit CBA-2026-002',
fontsize=13, fontweight='bold'
)
axes[0].plot(rec_v, prec_v, color='#2196F3', lw=2.5,
label=f'Model PR-AUC = {pr_auc:.4f}')
axes[0].axhline(y_test.mean(), color='gray', linestyle='--',
label=f'Random baseline = {y_test.mean():.4f}')
axes[0].axhline(THRESHOLDS['PR_AUC'], color='red', linestyle=':',
label=f'Pass threshold = {THRESHOLDS["PR_AUC"]}')
axes[0].fill_between(rec_v, prec_v, alpha=0.1, color='#2196F3')
axes[0].set_xlabel('Recall (fraud caught)')
axes[0].set_ylabel('Precision (flags that are real fraud)')
axes[0].set_title('Precision-Recall Curve\nPASS', fontweight='bold')
axes[0].legend(fontsize=9)
axes[1].plot(s_sample, f_cdf, color='#F44336', lw=2.5, label='Fraud score distribution')
axes[1].plot(s_sample, l_cdf, color='#2196F3', lw=2.5, label='Legitimate score distribution')
axes[1].axvline(ks_score, color='green', linestyle='--', lw=2,
label=f'Max gap (KS={ks_stat:.4f})')
axes[1].set_xlabel('Model Score (probability of fraud)')
axes[1].set_ylabel('Cumulative percentage of transactions')
axes[1].set_title('KS Statistic — Score Separation\nPASS', fontweight='bold')
axes[1].legend(fontsize=9)
plt.tight_layout()
plt.savefig('fig_perf_02_prauc_ks.png', dpi=150, bbox_inches='tight')
plt.show()
============================================================
DISCRIMINATION POWER — PR-AUC AND KS STATISTIC
============================================================

PR-AUC = 0.9357 | Threshold > 0.7 | PASS
  Random baseline: 0.0013
  Model is 725x better than a random classifier

KS Stat = 0.9916 | Threshold > 0.3 | PASS
  Maximum score separation at score = 0.4118
[Performance findings]¶
Result: Three of the four performance metrics pass; MCC falls below its threshold. The model is technically strong, but not flawless.
| Metric | Value | Threshold | Result |
|---|---|---|---|
| MCC | 0.4419 | above 0.50 | FAIL |
| Log Loss | 0.0135 | below 0.40 | PASS |
| PR-AUC | 0.9357 | above 0.70 | PASS |
| KS Statistic | 0.9916 | above 0.30 | PASS |
Model Performance Evaluation — MCC Analysis¶
Three of four performance metrics pass. Matthews Correlation Coefficient (MCC) fails.
The reason is visible in the confusion matrix. The model catches 1,631 out of 1,643 actual fraud cases and has a recall rate of 99.27%, which is exceptional. However, it also wrongly blocks 6,617 legitimate customers. Of everything the model flags as fraud, only 19.77% is actually fraud.
MCC accounts for both sides of that tradeoff and penalises the high false positive count.
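As a back-of-envelope check (not part of the audit pipeline), plugging the confusion-matrix counts reported above into the MCC formula reproduces the failing score and shows where the damage comes from.
import math

tp, tn, fp, fn = 1_631, 1_264_264, 6_617, 12
num = tp * tn - fp * fn
den = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
print(f'MCC       = {num / den:.4f}')       # 0.4419, below the 0.50 threshold
print(f'Precision = {tp / (tp + fp):.4f}')  # 0.1977, dragged down by 6,617 false alarms
print(f'Recall    = {tp / (tp + fn):.4f}')  # 0.9927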
This is not a catastrophic finding. High recall with lower precision is a deliberate tradeoff in fraud detection: most operators would rather over-flag and review than miss real fraud. However, it does indicate that the model's decision threshold requires calibration before deployment. The precision issue will be addressed in the Deployment Readiness section.
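A minimal sketch of what that calibration could look like, reusing the precision-recall curve already computed in Section 7; the 50% precision target is an assumption chosen for illustration, not an audit requirement.
import numpy as np
from sklearn.metrics import precision_recall_curve

prec_v, rec_v, thresh_v = precision_recall_curve(y_test, y_prob)
TARGET_PRECISION = 0.50  # assumed operating point, illustrative only

# Lowest decision threshold whose precision meets the target,
# i.e. the qualifying option that gives up the least recall.
candidates = np.where(prec_v[:-1] >= TARGET_PRECISION)[0]
if candidates.size:
    i = candidates[0]
    print(f'Candidate threshold {thresh_v[i]:.4f}: '
          f'precision {prec_v[i]:.3f}, recall {rec_v[i]:.3f}')
else:
    print('No threshold reaches the target precision.')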
Per-Group MCC Breakdown¶
| Group | MCC | Fraud Cases (Test Set) |
|---|---|---|
| Low-Balance | 0.1245 | 6 |
| Mid-Balance | 0.2192 | 225 |
| High-Balance | 0.6155 | 1,412 |
High-Balance users achieve an MCC of 0.6155 — strong performance.
Mid-Balance drops to 0.2192 — noticeably weaker.
Low-Balance sits at 0.1245 — barely above random guessing, based on just 6 fraud cases.
Structural Performance Insight¶
This gradient confirms the narrative established in both the proxy detection and EDA sections:
The more fraud examples a group contributes during training, the better the model performs for that group.
- High-Balance: 5,738 training examples → MCC 0.62
- Low-Balance: 35 training examples → MCC 0.12
The relationship is direct, and the consequence is visible even before formal fairness metrics are applied.
Interpretation¶
This is the accuracy paradox made concrete:
- Aggregate metrics appear strong
- MCC exposes the precision problem
- Per-group MCC reveals structural underperformance
Even MCC understates the severity for Low-Balance users, because 6 test cases is too small a sample to fully capture failure behavior. A dedicated fairness analysis is required to properly quantify the risk.
Regulatory Mapping¶
| Regulation / Standard | Provision | Status |
|---|---|---|
| BoG CISD 2026 | Annexure E §g(iii)(1) — Functional & performance testing | Complete |
| NIST AI RMF 1.0 | MEASURE 2.3 — Performance criteria measured | Complete |
| NIST AI RMF 1.0 | §3.1 — Valid & Reliable characteristic | Complete |
Risk Assessment¶
Risk Level: MEDIUM
- MCC failure driven by high false positive rate
- Threshold calibration required before deployment
- Per-group MCC confirms structural underperformance for Low-Balance users
A full fairness assessment follows in the next section.
Auditor: Kwadwo Amponsah
Organisation: ClearBoxAI
Date: April 2026
df.to_csv('checkpoint_v2.csv', index=False)
print("Checkpoint v2 saved.")
Checkpoint v2 saved.