import pandas as pd
import numpy as np
import matplotlib.patches as mpatches
from sklearn.metrics import matthews_corrcoef, log_loss, precision_recall_curve, auc
from scipy.stats import ks_2samp
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference

THRESHOLDS = {
    'SPD': 0.10, 'EOD': 0.10, 'MCC': 0.50,
    'LOG_LOSS': 0.40, 'PR_AUC': 0.70, 'KS': 0.30,
}

results = pd.read_csv('model_results.csv')
y_test = results['y_test']
y_pred = results['y_pred']
y_prob = results['y_prob']
s_test = results[['balance_group', 'tx_type_group']].astype(str)

# Recompute metrics
from sklearn.metrics import confusion_matrix
mcc = matthews_corrcoef(y_test, y_pred)
ll  = log_loss(y_test, y_prob)
prec_v, rec_v, _ = precision_recall_curve(y_test, y_prob)
pr_auc = auc(rec_v, prec_v)
fraud_scores = y_prob[y_test.values == 1]
legit_scores  = y_prob[y_test.values == 0]
from scipy.stats import ks_2samp
ks_stat, _ = ks_2samp(fraud_scores, legit_scores)

# Recompute fairness
fairness_results = {}
for proxy in ['tx_type_group', 'balance_group']:
    sf = s_test[proxy].values
    spd = demographic_parity_difference(y_test, y_pred, sensitive_features=sf)
    eod = equalized_odds_difference(y_test, y_pred, sensitive_features=sf)
    fairness_results[proxy] = {
        'spd': spd, 'eod': eod,
        'spd_pass': abs(spd) < THRESHOLDS['SPD'],
        'eod_pass': abs(eod) < THRESHOLDS['EOD']
    }

print("All variables loaded and recomputed.")

All variables loaded and recomputed.

fairness_results = {}
for proxy in ['tx_type_group', 'balance_group']:
    sf = s_test[proxy].values
    spd = demographic_parity_difference(y_test, y_pred, sensitive_features=sf)
    eod = equalized_odds_difference(y_test, y_pred, sensitive_features=sf)
    
    groups = sorted(np.unique(sf))
    if proxy == 'balance_group':
        order  = ['Low-Balance', 'Mid-Balance', 'High-Balance']
        groups = [g for g in order if g in groups]
    
    group_stats = {}
    for g in groups:
        mask = sf == g
        yt_g = y_test.values[mask]
        yp_g = y_pred[mask]
        tp = ((yt_g == 1) & (yp_g == 1)).sum()
        fn = ((yt_g == 1) & (yp_g == 0)).sum()
        fp = ((yt_g == 0) & (yp_g == 1)).sum()
        tn = ((yt_g == 0) & (yp_g == 0)).sum()
        tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
        group_stats[g] = {
            'tp': tp, 'fn': fn, 'fp': fp, 'tn': tn, 'tpr': tpr,
            'pred_rate': yp_g.mean()
        }
    
    fairness_results[proxy] = {
        'spd': spd, 'eod': eod,
        'spd_pass': abs(spd) < THRESHOLDS['SPD'],
        'eod_pass': abs(eod) < THRESHOLDS['EOD'],
        'group_stats': group_stats
    }

print("Fairness results with group_stats loaded.")

Fairness results with group_stats loaded.

# ── SECTION 13: FINAL SCORECARD ───────────────────────────────────────────────
scorecard = [
    ('MCC',           mcc,     THRESHOLDS['MCC'],
     mcc > THRESHOLDS['MCC'],                          'Performance'),
    ('Log Loss',      ll,      THRESHOLDS['LOG_LOSS'],
     ll < THRESHOLDS['LOG_LOSS'],                       'Performance'),
    ('PR-AUC',        pr_auc,  THRESHOLDS['PR_AUC'],
     pr_auc > THRESHOLDS['PR_AUC'],                    'Performance'),
    ('KS Statistic',  ks_stat, THRESHOLDS['KS'],
     ks_stat > THRESHOLDS['KS'],                       'Performance'),
    ('SPD (tx_type)', abs(fairness_results['tx_type_group']['spd']),
     THRESHOLDS['SPD'], fairness_results['tx_type_group']['spd_pass'], 'Fairness'),
    ('EOD (tx_type)', abs(fairness_results['tx_type_group']['eod']),
     THRESHOLDS['EOD'], fairness_results['tx_type_group']['eod_pass'], 'Fairness'),
    ('SPD (balance)', abs(fairness_results['balance_group']['spd']),
     THRESHOLDS['SPD'], fairness_results['balance_group']['spd_pass'], 'Fairness'),
    ('EOD (balance)', abs(fairness_results['balance_group']['eod']),
     THRESHOLDS['EOD'], fairness_results['balance_group']['eod_pass'], 'Fairness'),
]

passes = sum(1 for *_, p, _ in scorecard if p)
fails  = len(scorecard) - passes

print('=' * 68)
print('  ClearBoxAI AUDIT REPORT CBA-2026-002 — FINAL SCORECARD')
print('=' * 68)
print(f'  Auditor : Kwadwo Amponsah | ClearBoxAI, Ghana')
print(f'  Date    : April 2026')
print(f'  Dataset : PaySim Mobile Money Simulation (6.3M transactions)')
print('=' * 68)
print(f'\n  {"Metric":<22} {"Value":>9} {"Threshold":>12} {"Category":<14} Result')
print('  ' + '-' * 66)

for name, val, thresh, passed, category in scorecard:
    result = 'PASS' if passed else 'FAIL'
    print(f'  {name:<22} {val:>9.4f} {thresh:>12}   {category:<14} {result}')

print(f'\n  SUMMARY: {passes}/{len(scorecard)} metrics PASS | {fails}/{len(scorecard)} FAIL')
print('\n' + '=' * 68)
print('  AUDIT VERDICT')
print('=' * 68)
print("""
  DO NOT DEPLOY IN CURRENT FORM — MATERIAL BIAS DETECTED

  Performance metrics are mixed. Three of four pass.
  MCC = 0.4419 FAIL (high false positive rate requires threshold calibration).
  PR-AUC = 0.9357 PASS. KS = 0.9916 PASS. Log Loss = 0.0135 PASS.

  The critical failure is structural fairness:
  Low-Balance users receive significantly weaker fraud protection.

  TPR (Low-Balance)  = 0.6667  (one in three fraud cases missed)
  TPR (High-Balance) = 0.9972  (nearly perfect protection)
  EOD                = 0.3305  (threshold is 0.10 — more than three times over)

  Miss rate disparity: 33.3% for Low-Balance vs 0.3% for High-Balance.
  That is a 110-fold difference in how reliably fraud is caught
  between the most and least wealthy user groups.

  Per Bank of Ghana CISD 2026, Annexure E section l(i):
  Material bias must trigger mandatory model suspension
  until effective mitigation is applied.

  Per BoG CISD 2026 section 115(2)(b):
  BoG notification is required.
""")
print('=' * 68)

====================================================================
  ClearBoxAI AUDIT REPORT CBA-2026-002 — FINAL SCORECARD
====================================================================
  Auditor : Kwadwo Amponsah | ClearBoxAI, Ghana
  Date    : April 2026
  Dataset : PaySim Mobile Money Simulation (6.3M transactions)
====================================================================

  Metric                     Value    Threshold Category       Result
  ------------------------------------------------------------------
  MCC                       0.4419          0.5   Performance    FAIL
  Log Loss                  0.0135          0.4   Performance    PASS
  PR-AUC                    0.9357          0.7   Performance    PASS
  KS Statistic              0.9916          0.3   Performance    PASS
  SPD (tx_type)             0.0026          0.1   Fairness       PASS
  EOD (tx_type)             0.0053          0.1   Fairness       PASS
  SPD (balance)             0.0101          0.1   Fairness       PASS
  EOD (balance)             0.3305          0.1   Fairness       FAIL

  SUMMARY: 6/8 metrics PASS | 2/8 FAIL

====================================================================
  AUDIT VERDICT
====================================================================

  DO NOT DEPLOY IN CURRENT FORM — MATERIAL BIAS DETECTED

  Performance metrics are mixed. Three of four pass.
  MCC = 0.4419 FAIL (high false positive rate requires threshold calibration).
  PR-AUC = 0.9357 PASS. KS = 0.9916 PASS. Log Loss = 0.0135 PASS.

  The critical failure is structural fairness:
  Low-Balance users receive significantly weaker fraud protection.

  TPR (Low-Balance)  = 0.6667  (one in three fraud cases missed)
  TPR (High-Balance) = 0.9972  (nearly perfect protection)
  EOD                = 0.3305  (threshold is 0.10 — more than three times over)

  Miss rate disparity: 33.3% for Low-Balance vs 0.3% for High-Balance.
  That is a 110-fold difference in how reliably fraud is caught
  between the most and least wealthy user groups.

  Per Bank of Ghana CISD 2026, Annexure E section l(i):
  Material bias must trigger mandatory model suspension
  until effective mitigation is applied.

  Per BoG CISD 2026 section 115(2)(b):
  BoG notification is required.

====================================================================

# ── SECTION 13: FINAL SCORECARD VISUALIZATION ─────────────────────────────────
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle(
    'Section 13 — Final Audit Scorecard\n'
    'ClearBoxAI Audit CBA-2026-002 | PaySim Mobile Money Fraud Detection',
    fontsize=13, fontweight='bold'
)

names  = [m for m, *_ in scorecard]
vals   = [abs(v) for _, v, *_ in scorecard]
threshs= [t for _, _, t, *_ in scorecard]
passed = [p for *_, p, _ in scorecard]

bar_cols = ['#4CAF50' if p else '#F44336' for p in passed]
bars = axes[0].bar(range(len(names)), vals,
                   color=bar_cols, edgecolor='black', alpha=0.85)
for i, t in enumerate(threshs):
    axes[0].plot([i - 0.4, i + 0.4], [t, t], 'k--', lw=1.5)
axes[0].set_xticks(range(len(names)))
axes[0].set_xticklabels(names, rotation=45, ha='right', fontsize=9)
axes[0].set_ylabel('Metric Value (absolute)')
axes[0].set_title('All Metrics vs Pass Thresholds\nGreen = PASS | Red = FAIL',
                  fontweight='bold')
for bar, v, p in zip(bars, vals, passed):
    axes[0].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                 f'{v:.3f}\n{"PASS" if p else "FAIL"}',
                 ha='center', va='bottom', fontsize=8)
axes[0].legend(handles=[
    mpatches.Patch(facecolor='#4CAF50', label='PASS'),
    mpatches.Patch(facecolor='#F44336', label='FAIL'),
])

bg_stats  = fairness_results['balance_group']['group_stats']
bg_order  = ['Low-Balance', 'Mid-Balance', 'High-Balance']
bg_groups = [g for g in bg_order if g in bg_stats]
bg_tprs   = [bg_stats[g]['tpr'] for g in bg_groups]
bg_colors = ['#F44336', '#FFC107', '#4CAF50']

bars2 = axes[1].bar(bg_groups, bg_tprs,
                    color=bg_colors, edgecolor='black', alpha=0.85)
axes[1].set_ylim(0, 1.25)
axes[1].set_ylabel('True Positive Rate\n(fraud caught per group)')
axes[1].set_title(
'HEADLINE FINDING: Fraud Detection by Balance Tier\n'
'Low-Balance users receive significantly weaker protection',
    fontweight='bold', color='#C62828'
)
axes[1].axhline(0.10, color='red', linestyle=':', lw=1.5,
                label='EOD pass threshold = 0.10')
for bar, g, v in zip(bars2, bg_groups, bg_tprs):
    label = f'{v:.4f}\nWEAK PROTECTION' if v < 0.80 else f'{v:.4f}\nProtected'
    color = '#C62828' if v < 0.80 else 'black'
    axes[1].text(bar.get_x() + bar.get_width() / 2,
                 bar.get_height() + 0.03,
                 label, ha='center', va='bottom',
                 fontsize=9, fontweight='bold', color=color)
axes[1].legend()
plt.tight_layout()
plt.savefig('fig_final_scorecard.png', dpi=150, bbox_inches='tight')
plt.show()

df.to_csv('checkpoint_v2.csv', index=False)
print("Checkpoint v2 saved.")

Checkpoint v2 saved.

Finding	Risk Level	Root Cause	Recommended Action
Finding 1 — Data-level proxy imbalance	MEDIUM	Only 41 Low-Balance fraud cases exist in the full dataset. Structurally underrepresented due to underreporting, attacker incentives, and detection gaps	Systematic effort to label and collect Low-Balance fraud cases. Partner with BoG to access broader incident reporting data
Finding 2a — Transaction type fairness (PASS)	LOW	No material disparity between CASH_OUT and OTHER users	Continue quarterly monitoring to detect drift as fraud patterns evolve
Finding 2b — Balance tier fairness (FAIL)	HIGH — MANDATORY	Approximately 8 Low-Balance fraud cases in test set. Model never learned Low-Balance fraud patterns. EOD = 0.3305, more than three times the threshold	(1) Suspend model per BoG CISD 2026 Annexure E §l(i). (2) Notify BoG per §115(2)(b). (3) Implement tier-specific sub-models, or threshold calibration per balance tier, or post-processing score adjustment, or deliberate resampling targeting Low-Balance fraud specifically
Finding 3 — MCC and Log Loss (PASS)	LOW	Model is well calibrated overall	No action needed. Monitor quarterly
Finding 4 — PR-AUC and KS (PASS)	LOW	Excellent overall discrimination power	Confirms the problem is data, not algorithm. No algorithm change needed
Finding 5 — Error analysis	HIGH	33.3% false negative rate for Low-Balance fraud versus 0.3% for High-Balance — a 110-fold disparity	See Finding 2b recommendations. Error concentration confirms the binary nature of the failure
Finding 6 — SHAP explainability	MEDIUM	newbalanceOrig and oldbalanceOrg — the top two SHAP features — carry implicit wealth-tier correlation confirmed by SHAP analysis	Document in model card. Include in BoG notification. Consider feature normalisation that contextualises balance ratios by account tier
Finding 7 — Deployment readiness	HIGH	Multiple deployment prerequisites unmet	Complete all items in the deployment readiness checklist before any production deployment
Finding 8 — Monitoring plan	MEDIUM	No monitoring framework currently in place	Implement per-group monitoring from first day of deployment. Aggregate-only monitoring is non-compliant with BoG CISD 2026 Annexure E §l(i)

Field	Details
Audit ID	CBA-2026-002
Auditor	Kwadwo Amponsah
Organisation	ClearBoxAI, Kumasi, Ghana
Contact	kwadwoai.com
Dataset	PaySim Synthetic Mobile Money (Lopez-Rojas, Axelsson, Coventry, 2016)
Dataset Note	Real Ghanaian operator transaction data is not publicly available due to confidentiality and proprietary restrictions. PaySim simulates the transaction types, balance mechanics, and fraud patterns of real African mobile money systems. The methodology is directly transferable to any live Ghanaian dataset.
Model	XGBoost Classifier, 300 estimators, random_state=42
Regulatory Frameworks	BoG CISD 2026, EU AI Act 2024/1689, NIST AI RMF 1.0
Fairness Thresholds	ClearBoxAI Internal Standard v1.0 [CBA-STD-001]
Disclaimer	This audit does not constitute legal advice. Compliance determination and regulatory notification obligations rest solely with the Regulated Financial Institution.

FINAL SCORECARD AND RISK REGISTER¶

[Risk register and certification]¶

Risk Register — What Needs to Happen Next¶

Auditor Certification¶