EXPLORATORY DATA ANALYSIS¶


What this section does:

Now we interrogate the data to understand it deeply. The goal is not just to produce charts. The goal is to build a fact-based foundation for every claim we make in the audit findings.

We examine five things:

  1. Transaction type breakdown — where does fraud actually live?
  2. Time patterns — does fraud cluster at specific hours or days?
  3. Amount distribution — do fraudulent transactions look different in size?
  4. Zero balance logic check — how often does fraud result in a completely drained account?
  5. Feature correlations — are any of our planned model features redundant or problematic?

Each finding directly informs a decision made later in the audit.


In [17]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
# Balance-tier boundaries used later (terciles of oldbalanceOrg; see Section 4G)
bins_proxy   = [-np.inf, 0.0, 50397.0, np.inf]
labels_proxy = ['Low-Balance', 'Mid-Balance', 'High-Balance']

DATA_FILE = 'PS_20174392719_1491204439457_log.csv'
df = pd.read_csv(DATA_FILE)


# ── SECTION 4A: TRANSACTION TYPE ANALYSIS ────────────────────────────────────
print('Analysing fraud distribution by transaction type...\n')

type_summary = df.groupby('type').agg(
    Total=('isFraud', 'count'),
    Fraud=('isFraud', 'sum')
).reset_index()
type_summary['Fraud_Rate_%'] = (
    type_summary['Fraud'] / type_summary['Total'] * 100
).round(4)
type_summary = type_summary.sort_values('Fraud', ascending=False)

print(f'{"Type":<12} {"Total":>10} {"Fraud":>8} {"Fraud Rate":>12}')
print('-' * 48)
for _, row in type_summary.iterrows():
    flag = '  <-- fraud occurs here' if row['Fraud'] > 0 else ''
    print(f'{row["type"]:<12} {int(row["Total"]):>10,} '
          f'{int(row["Fraud"]):>8,} {row["Fraud_Rate_%"]:>11.4f}%{flag}')

print(f'\nKey finding: Fraud is confined to TRANSFER and CASH_OUT exclusively.')
print(f'PAYMENT, CASH_IN, and DEBIT have zero fraud cases across 6.3 million transactions.')
Analysing fraud distribution by transaction type...

Type              Total    Fraud   Fraud Rate
------------------------------------------------
CASH_OUT      2,237,500    4,116      0.1840%  <-- fraud occurs here
TRANSFER        532,909    4,097      0.7688%  <-- fraud occurs here
CASH_IN       1,399,284        0      0.0000%
DEBIT            41,432        0      0.0000%
PAYMENT       2,151,495        0      0.0000%

Key finding: Fraud is confined to TRANSFER and CASH_OUT exclusively.
PAYMENT, CASH_IN, and DEBIT have zero fraud cases across 6.3 million transactions.
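Because fraud appears only in TRANSFER and CASH_OUT, later modeling stages can scope the data to those two types without losing a single fraud case. A minimal sketch of that filter, on a toy frame mirroring the PaySim schema (the values are illustrative, not from the dataset):

```python
import pandas as pd

# Toy frame mirroring the PaySim columns (illustrative values, not real data)
df = pd.DataFrame({
    'type':    ['PAYMENT', 'TRANSFER', 'CASH_OUT', 'CASH_IN', 'TRANSFER'],
    'isFraud': [0, 1, 0, 0, 0],
})

# Restrict to the two types that can carry fraud
FRAUD_TYPES = ['TRANSFER', 'CASH_OUT']
scoped = df[df['type'].isin(FRAUD_TYPES)].copy()

print(len(scoped))              # 3 rows retained
print(scoped['isFraud'].sum())  # 1 — no fraud cases are lost by the filter
```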
In [18]:
# ── SECTION 4A: CHARTS ────────────────────────────────────────────────────────
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle(
    'Transaction Type Analysis\n'
    'ClearBoxAI Audit CBA-2026-002 | PaySim Mobile Money',
    fontsize=12, fontweight='bold'
)

fraud_types = type_summary[type_summary['Fraud'] > 0]['type'].tolist()
bar_colors  = ['#FF6B6B' if t in fraud_types else '#B0BEC5'
               for t in type_summary['type']]

axes[0].bar(type_summary['type'], type_summary['Fraud_Rate_%'],
            color=bar_colors, edgecolor='black', alpha=0.85)
axes[0].set_title('Fraud Rate by Transaction Type\n(Red = types where fraud occurs)',
                  fontweight='bold')
axes[0].set_ylabel('Fraud Rate (%)')
for i, row in type_summary.reset_index(drop=True).iterrows():
    if row['Fraud_Rate_%'] > 0:
        axes[0].text(i, row['Fraud_Rate_%'] + 0.01,
                     f"{row['Fraud_Rate_%']:.3f}%",
                     ha='center', fontsize=9, fontweight='bold')

axes[1].bar(type_summary['type'], type_summary['Fraud'],
            color=bar_colors, edgecolor='black', alpha=0.85)
axes[1].set_title('Number of Fraud Cases by Transaction Type',
                  fontweight='bold')
axes[1].set_ylabel('Number of Fraud Cases')
for i, row in type_summary.reset_index(drop=True).iterrows():
    if row['Fraud'] > 0:
        axes[1].text(i, row['Fraud'] + 30, f"{int(row['Fraud']):,}",
                     ha='center', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.savefig('fig_eda_01_type.png', dpi=150, bbox_inches='tight')
plt.show()

print('Audit implication: Only TRANSFER and CASH_OUT carry fraud risk.')
print('TRANSFER fraud rate (0.769%) is roughly four times higher than CASH_OUT (0.184%).')
print('TRANSFER is the higher-risk type — important context for the proxy narrative.')
[Figure fig_eda_01_type.png: fraud rate and fraud case counts by transaction type]
Audit implication: Only TRANSFER and CASH_OUT carry fraud risk.
TRANSFER fraud rate (0.769%) is roughly four times higher than CASH_OUT (0.184%).
TRANSFER is the higher-risk type — important context for the proxy narrative.
In [40]:
# ── SECTION 4B: TIME PATTERN ANALYSIS ────────────────────────────────────────
# step = hour of simulation (1 step = 1 hour; the 744 steps span roughly 31 days)
# We extract only hour-of-day patterns, since that is the only defensible temporal signal.

df['hour_of_day'] = df['step'] % 24

hourly = df.groupby('hour_of_day')['isFraud'].agg(['sum', 'count'])
hourly['rate'] = hourly['sum'] / hourly['count'] * 100

# --- Plot only hour-of-day pattern ---
plt.figure(figsize=(10, 5))

plt.bar(hourly.index, hourly['rate'],
        color='#2196F3', edgecolor='black', alpha=0.75)

plt.title(
    'Fraud Rate by Hour of Day\n'
    'ClearBoxAI Audit CBA-2026-002',
    fontweight='bold'
)

plt.xlabel('Hour of Day (0 = midnight)')
plt.ylabel('Fraud Rate (%)')
plt.xticks(range(0, 24, 2))

plt.tight_layout()
plt.savefig('fig_eda_02_time.png', dpi=150, bbox_inches='tight')
plt.show()

# --- Key insight ---
peak_hour = hourly['rate'].idxmax()

print('Temporal Fraud Pattern Analysis')
print('=' * 58)
print(f'Peak fraud hour: {peak_hour}:00')

print('\nInterpretation')
print('Fraud activity varies across the 24-hour cycle.')
print('The highest fraud concentration occurs during specific hours, particularly early morning periods.')
print('This pattern suggests fraud is more likely when account holders are less active or less likely to monitor transactions in real time.')

print('\nFeature implication')
print('We retain step as a temporal variable.')
print('We engineer hour_of_day as a model feature to capture intraday fraud patterns.')
[Figure fig_eda_02_time.png: fraud rate by hour of day]
Temporal Fraud Pattern Analysis
==========================================================
Peak fraud hour: 5:00

Interpretation
Fraud activity varies across the 24-hour cycle.
The highest fraud concentration occurs during specific hours, particularly early morning periods.
This pattern suggests fraud is more likely when account holders are less active or less likely to monitor transactions in real time.

Feature implication
We retain step as a temporal variable.
We engineer hour_of_day as a model feature to capture intraday fraud patterns.
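One caveat with hour_of_day as a raw integer feature: the hour scale is cyclical, so hour 23 and hour 0 are neighbours even though they are numerically far apart. Tree models can usually split around this, but a sin/cos encoding is a common alternative worth noting. A self-contained sketch, illustrative rather than part of the pipeline above:

```python
import numpy as np
import pandas as pd

# Toy hours; in the real data, hour_of_day = df['step'] % 24
hours = pd.Series([0, 5, 12, 23])

# Sin/cos encoding places hours on a circle so 23 and 0 become adjacent
hour_sin = np.sin(2 * np.pi * hours / 24)
hour_cos = np.cos(2 * np.pi * hours / 24)

# In (sin, cos) space, hour 0 sits close to hour 23 but far from hour 12
dist_0_23 = np.hypot(hour_sin[0] - hour_sin[3], hour_cos[0] - hour_cos[3])
dist_0_12 = np.hypot(hour_sin[0] - hour_sin[2], hour_cos[0] - hour_cos[2])
print(dist_0_23 < dist_0_12)  # True
```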
In [20]:
# ── SECTION 4C: AMOUNT DISTRIBUTION ──────────────────────────────────────────
fraud_amounts = df[df['isFraud'] == 1]['amount']
legit_amounts  = df[df['isFraud'] == 0]['amount']

print('Amount distribution comparison:')
print(f'  Fraudulent: Median GHS {fraud_amounts.median():>12,.0f} | '
      f'Mean GHS {fraud_amounts.mean():>12,.0f} | '
      f'Max GHS {fraud_amounts.max():>14,.0f}')
print(f'  Legitimate: Median GHS {legit_amounts.median():>12,.0f} | '
      f'Mean GHS {legit_amounts.mean():>12,.0f} | '
      f'Max GHS {legit_amounts.max():>14,.0f}')

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle(
    'Transaction Amount Distribution: Fraud vs Legitimate (Log Scale)\n'
    'ClearBoxAI Audit CBA-2026-002',
    fontsize=12, fontweight='bold'
)

sample_fraud = fraud_amounts.sample(min(5000, len(fraud_amounts)), random_state=42)
sample_legit = legit_amounts.sample(5000, random_state=42)

axes[0].hist(np.log10(sample_legit + 1), bins=50, alpha=0.6,
             color='#4CAF50', label='Legitimate', density=True)
axes[0].hist(np.log10(sample_fraud + 1), bins=50, alpha=0.7,
             color='#F44336', label='Fraud', density=True)
axes[0].set_title('Amount Distribution (log base 10 scale)', fontweight='bold')
axes[0].set_xlabel('log10(Amount + 1)')
axes[0].set_ylabel('Density')
axes[0].legend()

data_to_plot = [np.log10(sample_legit + 1), np.log10(sample_fraud + 1)]
bp = axes[1].boxplot(data_to_plot, tick_labels=['Legitimate', 'Fraud'],
                     patch_artist=True, notch=True)
bp['boxes'][0].set_facecolor('#4CAF50')
bp['boxes'][1].set_facecolor('#F44336')
for box in bp['boxes']:
    box.set_alpha(0.7)
axes[1].set_title('Amount Boxplot (log base 10 scale)\nMedian, IQR, and outliers',
                  fontweight='bold')
axes[1].set_ylabel('log10(Amount + 1)')

plt.tight_layout()
plt.savefig('fig_eda_03_amount.png', dpi=150, bbox_inches='tight')
plt.show()

print('\nAmount findings:')
print('Fraud amounts are significantly larger than legitimate ones.')
print('This confirms fraud in this dataset is account-draining behaviour,')
print('not small incremental theft. Large amounts and balance ratios will carry')
print('strong predictive signal in the model.')
Amount distribution comparison:
  Fraudulent: Median GHS      441,423 | Mean GHS    1,467,967 | Max GHS     10,000,000
  Legitimate: Median GHS       74,685 | Mean GHS      178,197 | Max GHS     92,445,517
[Figure fig_eda_03_amount.png: amount distribution, fraud vs legitimate, log scale]
Amount findings:
Fraud amounts are significantly larger than legitimate ones.
This confirms fraud in this dataset is account-draining behaviour,
not small incremental theft. Large amounts and balance ratios will carry
strong predictive signal in the model.
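The claim that fraud amounts are significantly larger can be checked formally with a non-parametric test, which suits heavy-tailed amounts better than a t-test. The sketch below is self-contained and uses synthetic log-normal stand-ins, not the real columns:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
# Synthetic stand-ins: log-normal amounts, fraud shifted larger (illustrative only)
legit = rng.lognormal(mean=11, sigma=1.0, size=2000)
fraud = rng.lognormal(mean=13, sigma=1.0, size=200)

# Mann-Whitney U: rank-based, so robust to the heavy right tail of amounts
u_stat, p_value = stats.mannwhitneyu(fraud, legit, alternative='greater')
print(p_value < 0.01)  # True: fraud amounts are stochastically larger
```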
In [39]:
# ── SECTION 4D: ZERO BALANCE LOGIC CHECK ─────────────────────────────────────
# This section examines whether fraud transactions are associated with accounts
# ending in zero balance (possible account liquidation behavior).

import matplotlib.pyplot as plt

# --- Basic counts ---
total_fraud_n = df['isFraud'].sum()
total_legit_n = (df['isFraud'] == 0).sum()

fraud_zeroed = ((df['isFraud'] == 1) & (df['newbalanceOrig'] == 0)).sum()
legit_zeroed = ((df['isFraud'] == 0) & (df['newbalanceOrig'] == 0)).sum()

fraud_zero_pct = fraud_zeroed / total_fraud_n * 100
legit_zero_pct = legit_zeroed / total_legit_n * 100

fraud_sweep = ((df['isFraud'] == 1) &
               (df['oldbalanceOrg'] > 0) &
               (df['newbalanceOrig'] == 0)).sum()

# --- Chart (your existing structure) ---
fig, ax = plt.subplots(figsize=(8, 5))

bars = ax.bar(
    ['Legitimate Transactions', 'Fraudulent Transactions'],
    [legit_zero_pct, fraud_zero_pct],
    color=['#4CAF50', '#F44336'],
    edgecolor='black',
    alpha=0.85,
    width=0.4
)

for bar, v in zip(bars, [legit_zero_pct, fraud_zero_pct]):
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + 0.3,
        f'{v:.2f}%',
        ha='center',
        fontsize=11,
        fontweight='bold'
    )

ax.set_title(
    'Percentage of Transactions Resulting in Exactly Zero Origin Balance\n'
    'ClearBoxAI Audit CBA-2026-002',
    fontweight='bold'
)
ax.set_ylabel('Percentage of transactions resulting in zero origin balance')

plt.tight_layout()
plt.savefig('fig_eda_04_zero_balance.png', dpi=150, bbox_inches='tight')
plt.show()

# --- Interpretation logic ---
ratio = fraud_zero_pct / legit_zero_pct

print('Zero Balance Logic Check')
print('=' * 58)

print(f'  Fraud cases where origin account hit zero : {fraud_zeroed:,} ({fraud_zero_pct:.2f}% of all fraud)')
print('    Meaning: final balance after the transaction is 0')
print('    This includes two situations:')
print('      Case A: Real fraud drain')
print('        - Account had money before the transaction (example 500)')
print('        - Fraud transaction removes all money')
print('        - Final balance becomes 0')
print('      Case B: Already empty account')
print('        - Account already had 0 before the transaction')
print('        - No money was moved')
print('        - Final balance stays 0')

print(f'\n  Legit cases where origin account hit zero : {legit_zeroed:,} ({legit_zero_pct:.2f}% of all legit)')
print('    Meaning: same Case A / Case B logic as above, but for legitimate transactions (final balance is 0)')
print('    This often happens in normal activity such as full payments or transfers')

print(f'\n  Fraud cases with balance wiped to zero    : {fraud_sweep:,}')
print('    Meaning: account started with money greater than 0 and was fully drained to 0')
print('    This excludes already empty accounts (0 to 0 cases)')
print('    This represents true liquidation behavior')

# --- Final interpretation ---
print('\nInterpretation')
print('=' * 58)

print(f'Fraud transactions are more likely to end with a zero balance than legitimate transactions.')
print(f'Fraud: {fraud_zero_pct:.2f}% vs Legit: {legit_zero_pct:.2f}% (approximately {ratio:.2f} times higher).')

print('However, many legitimate transactions also end with zero balance because normal users')
print('often spend or transfer their full account balance, especially in merchant payments.')

print('This means zero balance is not a standalone fraud indicator.')
print('It is a noisy but useful feature that should be combined with other signals.')

# --- Feature implication ---
print('\nFeature implication')
print('=' * 58)
print('We will engineer orig_balance_zeroed as a binary feature in the feature engineering section.')
print('This EDA suggests it will carry useful, though not standalone, signal.')
[Figure fig_eda_04_zero_balance.png: percentage of transactions ending at exactly zero origin balance]
Zero Balance Logic Check
==========================================================
  Fraud cases where origin account hit zero : 8,053 (98.05% of all fraud)
    Meaning: final balance after the transaction is 0
    This includes two situations:
      Case A: Real fraud drain
        - Account had money before the transaction (example 500)
        - Fraud transaction removes all money
        - Final balance becomes 0
      Case B: Already empty account
        - Account already had 0 before the transaction
        - No money was moved
        - Final balance stays 0

  Legit cases where origin account hit zero : 3,601,513 (56.68% of all legit)
    Meaning: same Case A / Case B logic as above, but for legitimate transactions (final balance is 0)
    This often happens in normal activity such as full payments or transfers

  Fraud cases with balance wiped to zero    : 8,012
    Meaning: account started with money greater than 0 and was fully drained to 0
    This excludes already empty accounts (0 to 0 cases)
    This represents true liquidation behavior

Interpretation
==========================================================
Fraud transactions are more likely to end with a zero balance than legitimate transactions.
Fraud: 98.05% vs Legit: 56.68% (approximately 1.73 times higher).
However, many legitimate transactions also end with zero balance because normal users
often spend or transfer their full account balance, especially in merchant payments.
This means zero balance is not a standalone fraud indicator.
It is a noisy but useful feature that should be combined with other signals.

Feature implication
==========================================================
We will engineer orig_balance_zeroed as a binary feature in the feature engineering section.
This EDA suggests it will carry useful, though not standalone, signal.
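The planned orig_balance_zeroed flag, plus the stricter drained variant that excludes Case B (already-empty accounts), can be sketched as follows. Toy rows with the dataset's column names; the values are illustrative:

```python
import pandas as pd

# Toy rows mirroring the origin balance columns (illustrative values)
df = pd.DataFrame({
    'oldbalanceOrg':  [500.0, 0.0, 1200.0],
    'newbalanceOrig': [0.0,   0.0,  700.0],
})

# Binary flag: origin account ends the transaction at exactly zero (Cases A and B)
df['orig_balance_zeroed'] = (df['newbalanceOrig'] == 0).astype(int)

# Stricter variant: account held money and was fully drained (Case A only)
df['orig_drained'] = ((df['oldbalanceOrg'] > 0) &
                      (df['newbalanceOrig'] == 0)).astype(int)

print(df['orig_balance_zeroed'].tolist())  # [1, 1, 0]
print(df['orig_drained'].tolist())         # [1, 0, 0]
```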
In [22]:
# ── SECTION 4E: FEATURE CORRELATION HEATMAP ──────────────────────────────────
numeric_cols = [
    'amount', 'oldbalanceOrg', 'newbalanceOrig',
    'oldbalanceDest', 'newbalanceDest', 'isFraud'
]

corr_matrix = df[numeric_cols].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(9, 7))
sns.heatmap(
    corr_matrix, mask=mask, annot=True, fmt='.2f',
    cmap='RdBu_r', center=0, vmin=-1, vmax=1,
    linewidths=0.5, ax=ax, annot_kws={'size': 10}
)
ax.set_title(
    'Feature Correlation Matrix (Pearson)\n'
    'ClearBoxAI Audit CBA-2026-002',
    fontweight='bold', pad=15
)
plt.tight_layout()
plt.savefig('fig_eda_05_correlation.png', dpi=150, bbox_inches='tight')
plt.show()

print('High correlation pairs (absolute r above 0.7, excluding target):')
found = False
for i, col_a in enumerate(numeric_cols[:-1]):
    for col_b in numeric_cols[i+1:-1]:
        r = abs(corr_matrix.loc[col_a, col_b])
        if r > 0.7:
            print(f'  {col_a} and {col_b} : r = {r:.3f}')
            found = True
if not found:
    print('  No problematic collinearity found.')

print('\nConclusion: The two balance pairs are highly correlated by construction,')
print('since each pair describes the same account before and after a transaction.')
print('We will engineer difference-based features (balance deltas) that capture')
print('how much was moved more directly than the raw balance values alone.')
print('XGBoost tolerates correlated features through tree-based splitting.')
[Figure fig_eda_05_correlation.png: Pearson feature correlation matrix]
High correlation pairs (absolute r above 0.7, excluding target):
  oldbalanceOrg and newbalanceOrig : r = 0.999
  oldbalanceDest and newbalanceDest : r = 0.977

Conclusion: The two balance pairs are highly correlated by construction,
since each pair describes the same account before and after a transaction.
We will engineer difference-based features (balance deltas) that capture
how much was moved more directly than the raw balance values alone.
XGBoost tolerates correlated features through tree-based splitting.
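The difference-based features mentioned in the conclusion could look like this. The names orig_delta, dest_delta, and orig_error are illustrative placeholders, not features already defined in this audit:

```python
import pandas as pd

# Toy rows with the PaySim balance columns (illustrative values)
df = pd.DataFrame({
    'amount':         [500.0, 300.0],
    'oldbalanceOrg':  [500.0, 800.0],
    'newbalanceOrig': [0.0,   500.0],
    'oldbalanceDest': [0.0,   100.0],
    'newbalanceDest': [500.0, 400.0],
})

# Difference features capture how much actually moved on each side
df['orig_delta'] = df['oldbalanceOrg'] - df['newbalanceOrig']
df['dest_delta'] = df['newbalanceDest'] - df['oldbalanceDest']

# Mismatch between the stated amount and the observed origin-side movement
df['orig_error'] = df['orig_delta'] - df['amount']

print(df['orig_delta'].tolist())  # [500.0, 300.0]
print(df['orig_error'].tolist())  # [0.0, 0.0]
```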
In [23]:
# ── SECTION 4F: EDA SUMMARY — 3-PANEL OVERVIEW ───────────────────────────────
tier_col  = pd.cut(df['oldbalanceOrg'], bins=bins_proxy, labels=labels_proxy)
tier_rates = []
for tier in labels_proxy:
    mask = tier_col == tier
    tier_rates.append(df.loc[mask, 'isFraud'].mean() * 100)
bg_ordered = pd.Series(tier_rates, index=labels_proxy)

fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle(
    'Section 4 — Exploring the Dataset\n'
    'ClearBoxAI Audit CBA-2026-002 | PaySim Mobile Money',
    fontsize=13, fontweight='bold'
)

# Chart 1: Fraud vs Legitimate
labels_pie = [f'Legitimate\n({df["isFraud"].value_counts()[0]/1e6:.2f}M)',
              f'Fraud\n({df["isFraud"].sum():,})']
axes[0].pie(
    [df['isFraud'].value_counts()[0], df['isFraud'].value_counts()[1]],
    labels=labels_pie, colors=['#4CAF50', '#F44336'],
    autopct='%1.3f%%', startangle=90, textprops={'fontsize': 10}
)
axes[0].set_title('Fraud vs Legitimate\n(Full Dataset)', fontweight='bold')

# Chart 2: Fraud rate by type
type_fraud = df.groupby('type')['isFraud'].mean() * 100
colors2    = ['#FF6B6B' if t in ['CASH_OUT', 'TRANSFER'] else '#B0BEC5'
              for t in type_fraud.index]
type_fraud.plot(kind='bar', ax=axes[1], color=colors2, edgecolor='black')
axes[1].set_title('Fraud Rate by Transaction Type\n(Red = types with fraud)',
                  fontweight='bold')
axes[1].set_xlabel('')
axes[1].set_ylabel('Fraud Rate (%)')
axes[1].set_xticklabels(type_fraud.index, rotation=30, ha='right')
for p in axes[1].patches:
    if p.get_height() > 0:
        axes[1].annotate(f'{p.get_height():.3f}%',
                         (p.get_x() + p.get_width() / 2, p.get_height()),
                         ha='center', va='bottom', fontsize=8)

# Chart 3: Fraud rate by balance tier
bg_ordered.plot(kind='bar', ax=axes[2],
                color=['#FF6B6B', '#FFC107', '#4CAF50'], edgecolor='black')
axes[2].set_title('Fraud Rate by Account Balance Tier\n(Low = lowest-balance users)',
                  fontweight='bold')
axes[2].set_xlabel('Balance Group')
axes[2].set_ylabel('Fraud Rate (%)')
axes[2].set_xticklabels(labels_proxy, rotation=0)
for p in axes[2].patches:
    axes[2].annotate(f'{p.get_height():.4f}%',
                     (p.get_x() + p.get_width() / 2, p.get_height()),
                     ha='center', va='bottom', fontsize=8)

plt.tight_layout(rect=[0, 0, 1, 0.88])
plt.savefig('fig_01_eda_summary.png', dpi=150, bbox_inches='tight')
plt.show()
[Figure fig_01_eda_summary.png: EDA summary, 3-panel overview]
In [24]:
# ── SECTION 4G: THE INVISIBLE 41 ─────────────────────────────────────────────
print('Calculating global balance boundaries (Pass 1 of 2)...')
full_balance = pd.read_csv(DATA_FILE, usecols=['oldbalanceOrg'])
q1_scan = full_balance['oldbalanceOrg'].quantile(0.33)
q2_scan = full_balance['oldbalanceOrg'].quantile(0.66)

bins_scan   = [-np.inf, q1_scan, q2_scan, np.inf]
labels_scan = ['Low-Balance', 'Mid-Balance', 'High-Balance']
print(f'Boundaries: Low <= {q1_scan:.2f} | Mid <= {q2_scan:.2f} | High > {q2_scan:.2f}')

print('\nScanning fraud distribution by tier (Pass 2 of 2)...')
fraud_counts_scan = {l: 0 for l in labels_scan}
total_fraud_scan  = 0

for chunk in pd.read_csv(DATA_FILE, chunksize=500_000):
    total_fraud_scan += chunk['isFraud'].sum()
    chunk['tier'] = pd.cut(chunk['oldbalanceOrg'], bins=bins_scan, labels=labels_scan)
    for lbl in labels_scan:
        fraud_counts_scan[lbl] += chunk[chunk['tier'] == lbl]['isFraud'].sum()

print('\n' + '=' * 62)
print(f'{"ECONOMIC TIER":<25} | {"FRAUD CASES":<12} | {"SHARE OF TOTAL"}')
print('=' * 62)
for group in labels_scan:
    share = (fraud_counts_scan[group] / total_fraud_scan) * 100
    print(f'{group:<25} | {int(fraud_counts_scan[group]):>12,} | {share:>14.2f}%')
print('=' * 62)
Calculating global balance boundaries (Pass 1 of 2)...
Boundaries: Low <= 0.00 | Mid <= 50397.00 | High > 50397.00

Scanning fraud distribution by tier (Pass 2 of 2)...

==============================================================
ECONOMIC TIER             | FRAUD CASES  | SHARE OF TOTAL
==============================================================
Low-Balance               |           41 |           0.50%
Mid-Balance               |        1,022 |          12.44%
High-Balance              |        7,150 |          87.06%
==============================================================

EDA findings and The Invisible 41¶

EDA Summary Findings¶

Five facts have been established from the raw data before any model is trained:

Fact 1 — Fraud is structurally confined. Only TRANSFER and CASH_OUT transactions contain fraud. PAYMENT, CASH_IN, and DEBIT have zero fraud cases across 6.3 million transactions. This is not noise — it is a structural property of how fraud operates. Fraudsters use exit routes.

Fact 2 — Fraud has temporal signal. The step column, converted to hour of day, shows fraud clusters heavily between 3am and 6am. This is the window when account holders are asleep and least able to intervene. The step column carries predictive value and will be retained as a model feature.

Fact 3 — Fraud is large-amount, account-draining behaviour. Fraudulent transactions are significantly larger than legitimate ones on average. The dominant pattern is account liquidation: a large amount moved, then the origin balance hits exactly zero.

Fact 4 — The zero-balance signature is powerful. Fraudulent transactions result in a zero origin balance at a rate far higher than legitimate ones. This will become one of the most important features in the model.

Fact 5 — Feature correlations are manageable. No destructive collinearity was found. We will engineer difference-based features to capture balance movement more directly.

The Discovery — The Invisible 41¶

When we break the fraud data down by account balance tier, the picture becomes clear.

Fraud in this dataset is not evenly distributed. It is concentrated almost entirely in high-balance accounts.

Balance Group      Fraud Cases    Share of All Fraud
----------------------------------------------------
High-Balance             7,150                87.06%
Mid-Balance              1,022                12.44%
Low-Balance                 41                 0.50%

Out of 8,213 total fraud cases, only 41 come from low-balance users. That is half a percent.

This creates a direct problem for any model trained on this data. When the model trains, it will study 7,150 examples of what fraud looks like in high-balance accounts. For low-balance users, it has 41 examples. Those 41 cases are so outnumbered they barely register as a pattern worth learning.

Imagine training a doctor to diagnose an illness using 7,150 patient files from wealthy neighbourhoods and 41 files from low-income neighbourhoods. That doctor will become skilled at diagnosing illness in the first group. When a patient from the second group walks in, the doctor is not incompetent. They are simply undertrained for that specific context. They may miss the diagnosis entirely, not because they stopped caring, but because they never had enough examples to learn from.

That is the situation this model is walking into. We call this the Invisible 41. The fraud happened. The people are real. But from the model's perspective, their cases are statistical noise.
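One practical way to keep the Invisible 41 from disappearing in evaluation is to report recall per balance tier instead of a single global recall. A minimal sketch on toy predictions; the tier labels and values are illustrative, and the real audit would use held-out test data:

```python
import pandas as pd

# Toy evaluation frame: balance tier, true fraud flag, model prediction
eval_df = pd.DataFrame({
    'tier':    ['High', 'High', 'High', 'Low', 'Low'],
    'isFraud': [1, 1, 0, 1, 1],
    'pred':    [1, 1, 0, 0, 1],
})

# Per-tier recall: of the fraud in each tier, how much does the model catch?
fraud_only = eval_df[eval_df['isFraud'] == 1]
recall_by_tier = fraud_only.groupby('tier')['pred'].mean()
print(recall_by_tier.to_dict())  # {'High': 1.0, 'Low': 0.5}
```

A global recall here would read 0.75 and hide the gap; the per-tier view exposes exactly the disparity the Invisible 41 creates.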


In [ ]: