ERROR ANALYSIS¶


What this section does:

Instead of looking at averages, we inspect the mistakes. Where does the model fail? Who gets false positives (innocent people blocked)? Who gets false negatives (fraud missed)? Are errors concentrated in specific groups?

This is one of the most underrated steps in an AI audit. Aggregate performance metrics tell you how often the model is right. Error analysis tells you who pays the price when it is wrong.

In a fraud detection context:

  • A false positive means a legitimate customer gets blocked. Their transaction is stopped. Their account may be frozen. They have to go through a dispute process to get their money.
  • A false negative means a fraud transaction goes through undetected. The victim loses money. The fraudster escapes.

Neither type of error is acceptable, but their consequences are different for different user groups. That is what we are measuring here.
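Both error types can be read straight off a confusion matrix. A minimal sketch on toy labels (not the audit data), using the same sklearn call the analysis cells below rely on:

```python
# Toy illustration — invented labels, not the audit data. 1 = fraud, 0 = legitimate.
from sklearn.metrics import confusion_matrix

y_true = [0, 0, 0, 1, 1, 0, 1, 0]
y_pred = [0, 1, 0, 1, 0, 0, 1, 0]  # one legit tx blocked, one fraud missed

# ravel() flattens the 2x2 matrix into (tn, fp, fn, tp)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
print(f'False positives (legitimate customer blocked): {fp}')
print(f'False negatives (fraud slipped through):       {fn}')
```

With `labels=[0, 1]`, sklearn fixes the row/column order so the `(tn, fp, fn, tp)` unpacking is stable even if one class is absent from a slice.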


In [4]:
# ── SECTION 1: LOAD CHECKPOINT ───────────────────────────────────────────────

import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from scipy.stats import ks_2samp
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference
from sklearn.metrics import matthews_corrcoef, log_loss, confusion_matrix, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
# Balance-tier bin edges and labels used to build the balance_group proxy
bins_proxy   = [-np.inf, 0.0, 50397.0, np.inf]
labels_proxy = ['Low-Balance', 'Mid-Balance', 'High-Balance']

df = pd.read_csv('checkpoint.csv')

print(f'Checkpoint loaded: {df.shape[0]:,} rows x {df.shape[1]} columns')
print(df.columns.tolist())
Checkpoint loaded: 6,362,620 rows x 11 columns
['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']
In [5]:
import pandas as pd

results = pd.read_csv('model_results.csv')

y_test = results['y_test']
y_pred = results['y_pred']
y_prob = results['y_prob']


# Group-membership columns (fairness proxies) for each test-set row
s_test = results[['balance_group', 'tx_type_group']].astype(str)

print("All audit variables (including s_test) are now defined.")
All audit variables (including s_test) are now defined.
In [6]:
# Per-group confusion-matrix counts for each fairness proxy
fairness_results = {}

for proxy in ['tx_type_group', 'balance_group']:
    sf     = s_test[proxy].astype(str).values
    groups = np.unique(sf)

    fairness_results[proxy] = {'group_stats': {}}

    for g in groups:
        mask = sf == g

        y_true_g = y_test[mask]
        y_pred_g = y_pred[mask]

        # confusion matrix: tn, fp, fn, tp
        tn, fp, fn, tp = confusion_matrix(y_true_g, y_pred_g, labels=[0,1]).ravel()

        fairness_results[proxy]['group_stats'][g] = {
            'tn': tn,
            'fp': fp,
            'fn': fn,
            'tp': tp
        }
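The nested fairness_results dict is convenient for lookups but awkward to eyeball. One option (a helper sketch, not part of the original audit code) is to flatten it into a tidy DataFrame; the demo dict below uses invented counts purely to show the shape, and in the notebook you would pass fairness_results itself:

```python
import pandas as pd

def flatten_group_stats(fairness_results):
    """Turn {proxy: {'group_stats': {group: {tn, fp, fn, tp}}}} into tidy rows."""
    rows = [{'proxy': proxy, 'group': group, **cm}
            for proxy, res in fairness_results.items()
            for group, cm in res['group_stats'].items()]
    return pd.DataFrame(rows, columns=['proxy', 'group', 'tn', 'fp', 'fn', 'tp'])

# Demo with made-up counts to illustrate the output shape.
demo = {'tx_type_group': {'group_stats': {
    'CASH_OUT': {'tn': 95, 'fp': 3, 'fn': 1, 'tp': 1},
    'OTHER':    {'tn': 98, 'fp': 1, 'fn': 0, 'tp': 1},
}}}
print(flatten_group_stats(demo).to_string(index=False))
```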
In [7]:
# ── SECTION 9: ERROR ANALYSIS ─────────────────────────────────────────────────
print('=' * 65)
print('ERROR ANALYSIS — WHERE DOES THE MODEL FAIL?')
print('=' * 65)

for proxy in ['tx_type_group', 'balance_group']:
    sf     = s_test[proxy].astype(str).values
    groups = sorted(np.unique(sf))

    if proxy == 'balance_group':
        order  = ['Low-Balance', 'Mid-Balance', 'High-Balance']
        groups = [g for g in order if g in groups]

    print(f'\nProxy: {proxy}')
    print(f'{"Group":<14} {"False Positives":>16} {"FP Rate":>10} '
          f'{"False Negatives":>16} {"FN Rate":>10} {"Miss Rate":>10}')
    print('-' * 82)

    for g in groups:
        stats_g = fairness_results[proxy]['group_stats'][g]
        n_legit = stats_g['tn'] + stats_g['fp']
        n_fraud = stats_g['tp'] + stats_g['fn']
        fp_rate = stats_g['fp'] / n_legit if n_legit > 0 else 0
        fn_rate = stats_g['fn'] / n_fraud if n_fraud > 0 else 0
        miss_pct = stats_g['fn'] / n_fraud * 100 if n_fraud > 0 else 0

        print(f'{g:<14} {stats_g["fp"]:>16,} {fp_rate:>10.6f} '
              f'{stats_g["fn"]:>16,} {fn_rate:>10.6f} {miss_pct:>9.1f}%')

print('\nKey question: Are false positives and false negatives')
print('concentrated in specific groups, or spread evenly?')
=================================================================
ERROR ANALYSIS — WHERE DOES THE MODEL FAIL?
=================================================================

Proxy: tx_type_group
Group           False Positives    FP Rate  False Negatives    FN Rate  Miss Rate
----------------------------------------------------------------------------------
CASH_OUT                  2,866   0.006420                8   0.010025       1.0%
OTHER                     3,751   0.004550                4   0.004734       0.5%

Proxy: balance_group
Group           False Positives    FP Rate  False Negatives    FN Rate  Miss Rate
----------------------------------------------------------------------------------
Low-Balance                 168   0.000399                2   0.333333      33.3%
Mid-Balance               4,171   0.009958                6   0.026667       2.7%
High-Balance              2,278   0.005279                4   0.002833       0.3%

Key question: Are false positives and false negatives
concentrated in specific groups, or spread evenly?

Note on miss rates: These percentages measure what fraction of each group's own fraud cases the model failed to catch. Low-Balance users have a 33.3% internal miss rate: one in three fraud cases targeting this group goes completely undetected. High-Balance users have a 0.3% miss rate, meaning the model almost never misses their fraud. This more-than-100-fold difference in miss rates is the clearest expression of the protection gap this audit set out to find. The absolute fraud case counts are small for Low-Balance users, but the direction and magnitude of the disparity are unambiguous and consistent with the EOD finding in the fairness and bias section.
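The quoted miss rates can be re-derived from the raw counts in the findings table below. Note that the Low/High ratio computed from unrounded rates comes out near 118-fold; the rounded percentages (33.3% vs 0.3%) would suggest about 111. A quick arithmetic check:

```python
# Re-derive each group's miss rate (missed / actual fraud) from raw counts.
actual_fraud = {'Low-Balance': 6, 'Mid-Balance': 225, 'High-Balance': 1412}
missed       = {'Low-Balance': 2, 'Mid-Balance': 6,   'High-Balance': 4}

miss_rate = {g: missed[g] / actual_fraud[g] for g in actual_fraud}
for g, r in miss_rate.items():
    print(f'{g:<13} miss rate = {r:.1%}')

ratio = miss_rate['Low-Balance'] / miss_rate['High-Balance']
print(f'Low vs High miss-rate ratio: {ratio:.0f}x')
```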

In [8]:
# ── SECTION 9: ERROR DISTRIBUTION CHARTS ─────────────────────────────────────
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle(
    'Section 9 — Error Analysis: Who Bears the Cost of Model Mistakes?\n'
    'ClearBoxAI Audit CBA-2026-002',
    fontsize=12, fontweight='bold'
)

bg_groups = ['Low-Balance', 'Mid-Balance', 'High-Balance']
bg_stats  = fairness_results['balance_group']['group_stats']
existing  = [g for g in bg_groups if g in bg_stats]

fp_counts = [bg_stats[g]['fp'] for g in existing]
fn_counts = [bg_stats[g]['fn'] for g in existing]
colors_bg = ['#FF6B6B', '#FFC107', '#4CAF50'][:len(existing)]

# False positives (innocent people wrongly blocked)
axes[0].bar(existing, fp_counts, color=colors_bg, edgecolor='black', alpha=0.85)
axes[0].set_title('False Positives by Balance Tier\n(Legitimate customers wrongly blocked)',
                  fontweight='bold')
axes[0].set_ylabel('Number of false positives')
for i, v in enumerate(fp_counts):
    axes[0].text(i, v + 5, f'{v:,}', ha='center', fontsize=10, fontweight='bold')

# False negatives (fraud missed)
axes[1].bar(existing, fn_counts, color=colors_bg, edgecolor='black', alpha=0.85)
axes[1].set_title('False Negatives by Balance Tier\n(Fraud missed — victims unprotected)',
                  fontweight='bold')
axes[1].set_ylabel('Number of false negatives')
for i, v in enumerate(fn_counts):
    pct_of_all = v / sum(fn_counts) * 100 if sum(fn_counts) > 0 else 0
    axes[1].text(i, v + 0.1,
                 f'{v}\n({pct_of_all:.1f}% of all\nmissed fraud)',
                 ha='center', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.savefig('fig_error_01.png', dpi=150, bbox_inches='tight')
plt.show()
[Figure fig_error_01.png — Error Analysis: Who Bears the Cost of Model Mistakes? False positives (left) and false negatives (right) by balance tier]

Note on chart percentages: The percentages shown above represent each group's share of the total missed fraud cases across all groups combined. Mid-Balance accounts for 50% of all missed fraud in absolute terms, High-Balance 33.3%, and Low-Balance 16.7%. These figures reflect how missed fraud is distributed, not how reliable the model is within each group. A group with few fraud cases can have a small share of total misses while still having a very high internal miss rate. The per-group miss rates in the table below tell that more important story.
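The distinction in the note above can be made concrete with this audit's own counts: a group's share of all missed fraud and its within-group miss rate can disagree sharply, and Low-Balance is the extreme case (smallest share of misses, largest miss rate).

```python
# Missed-fraud counts and per-group fraud totals from this audit's test set.
missed = {'Low-Balance': 2, 'Mid-Balance': 6, 'High-Balance': 4}
fraud  = {'Low-Balance': 6, 'Mid-Balance': 225, 'High-Balance': 1412}

total_missed = sum(missed.values())
for g in missed:
    share = missed[g] / total_missed   # slice of all missed fraud (chart labels)
    rate  = missed[g] / fraud[g]       # fraction of this group's own fraud missed
    print(f'{g:<13} share of misses = {share:5.1%}   own miss rate = {rate:5.1%}')
```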

Error Analysis Findings¶

False Positives - Innocent Customers Wrongly Blocked¶

False positives follow the size of each group. Mid-Balance accounts generate the most in absolute terms at 4,171, followed by High-Balance at 2,278, and Low-Balance at 168. On the surface this looks proportionate.

But the false positive rate tells a different story. Mid-Balance users are blocked at a rate of 0.996% of their legitimate transactions. High-Balance users at 0.528%. Low-Balance users at just 0.040%.

This means Low-Balance users are actually the least likely to be wrongly blocked. That sounds like good news until you consider what it reflects: the model barely flags this group at all, for fraud or for anything else. It is not protecting them more carefully. It has simply learned so little about their transaction patterns that it defaults to calling almost everything they do legitimate - including actual fraud.


False Negatives - Fraud Missed, Victims Unprotected¶

This is where the numbers become most serious.

Group          Actual Fraud in Test   Missed   Miss Rate
Low-Balance                       6        2       33.3%
Mid-Balance                     225        6        2.7%
High-Balance                  1,412        4        0.3%

The miss rate climbs sharply as account balance falls. High-Balance users lose 0.3% of their fraud cases to model failure. Mid-Balance users lose 2.7%. Low-Balance users lose 33.3%.

That is more than a 100-fold difference in miss rate between the best-protected and worst-protected groups.

In absolute terms, Mid-Balance has the most missed fraud cases at 6. But miss rate is the right metric here, not absolute count. A group with only 6 fraud cases in the test set is not well protected simply because it has fewer misses than a group with 1,412. The question is what fraction of each group's own fraud the model is failing to catch. For Low-Balance users, that fraction is one in three.


What This Means Practically¶

Think about what a 33.3% miss rate means in a real deployment. For every three low-balance mobile money users who experience fraud, the AI system catches two and lets one go completely undetected. That user has no automated protection at the moment it matters most. If they report it afterwards, a human investigator may follow up. But the first line of defence has already failed them.

Compare that to a high-balance user. For every 350 or so high-balance fraud cases, the model misses one. The protection is not perfect but it is reliable. For low-balance users it is not reliable. One in three fraudsters targeting this group walks away unchallenged by the system.

The market trader in Kumasi with GHS 200 stolen from her account faces a one-in-three chance that the fraud detection AI does not respond at all. That is not a modelling edge case. It is a structural feature of how the system was built and what data it was trained on.


Conclusion¶

This error analysis confirms what the fairness metrics in the fairness and bias section measured statistically. The model is not equally wrong for everyone. It is most wrong for the users who are least equipped to absorb the consequences.


Regulatory Assessment¶

Regulation                         Provision                                                       Status
BoG CISD 2026, Annexure E §l(i)    Material bias confirmed through error distribution analysis    TRIGGERED
NIST AI RMF 1.0, MEASURE 2.11      Fairness and bias evaluated and documented                     Complete
NIST AI RMF 1.0, §3.7              Harmful Bias Managed — trustworthiness characteristic          Breached

Risk Level: HIGH - Error analysis confirms the fairness finding from the fairness and bias section. The miss rate disparity of 33.3% versus 0.3% between the lowest and highest balance tiers is a structural protection gap, not statistical noise.

Auditor: Kwadwo Amponsah, ClearBoxAI - April 2026

In [10]:
df.to_csv('checkpoint_v2.csv', index=False)
print("Checkpoint v2 saved.")
Checkpoint v2 saved.
In [ ]: