PRE-TRAINING PROXY DETECTION¶
Goal: Before training any model, check whether the data itself already treats our proxy groups differently. If the data is biased going in, the model will be biased coming out.
What tools are we using?
Crosstab — A table that counts fraud cases broken down by group. Shows us the raw numbers.
Chi-Square Test — A statistical test that asks: is the difference between groups real, or could it just be random variation? If p is less than 0.05, the difference is very unlikely to be explained by chance alone.
Cramer's V — A score from 0 to 1 that measures how strong the relationship between group and fraud actually is. A statistically significant result does not automatically mean a practically important one. Cramer's V separates those two things.
V below 0.10 means the relationship is negligible in practical terms. V from 0.10 to 0.20 is weak, 0.20 to 0.40 is moderate, and above 0.40 is strong. (These are the bands the code below uses to label association strength.)
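To see how the two measures fit together before running them on the real data, here is a minimal sketch on a made-up 2×2 crosstab (the counts are hypothetical, not from the dataset below):

```python
import numpy as np
from scipy import stats

# Hypothetical 2x2 crosstab: rows = group, cols = [legitimate, fraud]
toy = np.array([[9900, 100],
                [9800, 200]])

# Chi-square answers "is the difference real?"
chi2, p, dof, _ = stats.chi2_contingency(toy)

# Uncorrected Cramer's V answers "is it strong enough to matter?"
n = toy.sum()
r, k = toy.shape
v = np.sqrt((chi2 / n) / min(r - 1, k - 1))

print(f"chi2={chi2:.2f}, p={p:.3g}, V={v:.4f}")
```

On this toy table the difference is statistically significant (p well below 0.05) yet V lands below 0.10, i.e. in the negligible band — exactly the combination the real results below will show.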
Understanding What These Tests Are Looking For:
The model learns from examples. If 87% of all fraud examples in the training data come from one type of user, the model becomes an expert on that user. Everyone else gets protection built from the remaining 13%.
This is not a model flaw. It is a data problem. And a data problem going in becomes a fairness problem coming out.
The proxy detection step tests whether that unevenness exists before training begins, so we know exactly what we are walking into.
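The "87% of fraud examples from one group" check itself is a one-liner. A sketch on hypothetical toy data (group names and counts are invented for illustration):

```python
import pandas as pd

# Hypothetical toy data: 1,000 fraud rows and 9,000 legitimate rows,
# with fraud examples concentrated in group A
df_toy = pd.DataFrame({
    'group':   ['A'] * 870 + ['B'] * 130 + ['A'] * 4000 + ['B'] * 5000,
    'isFraud': [1] * 1000 + [0] * 9000,
})

# Share of fraud examples each group contributes to training
fraud_share = (df_toy.loc[df_toy['isFraud'] == 1, 'group']
               .value_counts(normalize=True))
print(fraud_share)  # A: 0.87, B: 0.13
```

A model trained on this toy data would see group A's fraud patterns 870 times and group B's only 130 times, which is the unevenness the proxy detection step below quantifies.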
import pandas as pd
import numpy as np
from scipy import stats
DATA_FILE = 'PS_20174392719_1491204439457_log.csv'
df = pd.read_csv(DATA_FILE)
# ── SECTION 3: PROXY VARIABLE ENGINEERING ─────────────────────────────────────
print('Engineering proxy columns...')
# Proxy 1: Binary economic role
# CASH_OUT = informal economy users (market traders, gig workers, smallholder farmers)
# OTHER = broader mix of formal and informal activity
df['tx_type_group'] = df['type'].apply(
    lambda x: 'CASH_OUT' if x == 'CASH_OUT' else 'OTHER'
)
# Proxy 2: Wealth tier using fixed GHS boundaries
# These boundaries are derived from the actual data distribution and held constant
# across all sections to ensure consistency
bins_proxy = [-np.inf, 0.0, 50397.0, np.inf]
labels_proxy = ['Low-Balance', 'Mid-Balance', 'High-Balance']
df['balance_group'] = pd.cut(
    df['oldbalanceOrg'],
    bins=bins_proxy,
    labels=labels_proxy
)
print(f' tx_type_group : {df["tx_type_group"].value_counts().to_dict()}')
print(f' balance_group : {df["balance_group"].value_counts().to_dict()}')
print('Proxy columns created.')
Engineering proxy columns...
tx_type_group : {'OTHER': 4125120, 'CASH_OUT': 2237500}
balance_group : {'High-Balance': 2163289, 'Low-Balance': 2102449, 'Mid-Balance': 2096882}
Proxy columns created.
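One detail worth pinning down is where the tier boundaries fall: `pd.cut` uses right-closed intervals by default, so a balance of exactly 0.0 lands in Low-Balance and exactly 50,397.0 lands in Mid-Balance. A quick check with the same bins (the sample balances here are made up):

```python
import numpy as np
import pandas as pd

bins_proxy = [-np.inf, 0.0, 50397.0, np.inf]
labels_proxy = ['Low-Balance', 'Mid-Balance', 'High-Balance']

# Hypothetical balances straddling each boundary
sample = pd.Series([-5.0, 0.0, 0.01, 50397.0, 50397.01])
print(pd.cut(sample, bins=bins_proxy, labels=labels_proxy))
```

This confirms the three tiers partition every possible balance with no gaps and no overlap, which is what lets the same bins be reused consistently across sections.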
# ── SECTION 3: CHI-SQUARE AND CRAMER'S V ──────────────────────────────────────
def cramers_v(ct):
    """
    Calculates Cramer's V with bias correction.
    Measures association strength between 0 and 1.
    1 = perfect association, 0 = no association.
    """
    chi2 = stats.chi2_contingency(ct)[0]
    n = ct.sum()
    phi2 = chi2 / n
    r, k = ct.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
proxy_results = {}
for proxy, label in [
    ('tx_type_group', 'Transaction Type (Economic Role Proxy)'),
    ('balance_group', 'Account Balance Tier (Wealth Proxy)')
]:
    print(f'\n{"=" * 58}')
    print(f'Proxy: {label}')
    print('=' * 58)

    ct_raw = pd.crosstab(df[proxy], df['isFraud'])
    chi2, p, dof, _ = stats.chi2_contingency(ct_raw.values)
    v = cramers_v(ct_raw.values)

    ct_display = ct_raw.copy()
    ct_display.columns = ['Legitimate', 'Fraud']
    ct_display['Total'] = ct_display.sum(axis=1)
    ct_display['Fraud_Rate_%'] = (
        ct_display['Fraud'] / ct_display['Total'] * 100
    ).round(4)
    print('\nCrosstab results:')
    print(ct_display.to_string())

    strength = ('NEGLIGIBLE' if v < 0.10 else
                'WEAK' if v < 0.20 else
                'MODERATE' if v < 0.40 else 'STRONG')
    sig = p < 0.05

    print('\nStatistical Test Results:')
    print(f'  Chi-square statistic : {chi2:>12,.2f}')
    print(f'  p-value              : {p:>12.2e}')
    print(f"  Cramer's V           : {v:>12.4f}")
    print(f'  Significance         : {"SIGNIFICANT (real, not random)" if sig else "NOT significant"}')
    print(f'  Association strength : {strength}')

    proxy_results[proxy] = {
        'chi2': chi2, 'p': p, 'v': v,
        'strength': strength, 'ct': ct_display
    }
print('\nProxy detection complete.')
==========================================================
Proxy: Transaction Type (Economic Role Proxy)
==========================================================
Crosstab results:
Legitimate Fraud Total Fraud_Rate_%
tx_type_group
CASH_OUT 2233384 4116 2237500 0.1840
OTHER 4121023 4097 4125120 0.0993
Statistical Test Results:
Chi-square statistic : 805.43
p-value : 3.57e-177
Cramer's V : 0.0112
Significance : SIGNIFICANT (real, not random)
Association strength : NEGLIGIBLE
==========================================================
Proxy: Account Balance Tier (Wealth Proxy)
==========================================================
Crosstab results:
Legitimate Fraud Total Fraud_Rate_%
balance_group
Low-Balance 2102408 41 2102449 0.0020
Mid-Balance 2095860 1022 2096882 0.0487
High-Balance 2156139 7150 2163289 0.3305
Statistical Test Results:
Chi-square statistic : 10,494.67
p-value : 0.00e+00
Cramer's V : 0.0406
Significance : SIGNIFICANT (real, not random)
Association strength : NEGLIGIBLE
Proxy detection complete.
[Proxy detection findings]¶
What These Numbers Are Telling Us¶
We just ran pre-training proxy detection. Before handing any data to the model, we checked whether fraud is already unevenly distributed across our proxy groups. If it is, the model will learn that unevenness as if it is the truth.
Proxy 1: Transaction Type — CASH_OUT vs OTHER
CASH_OUT users face a fraud rate almost double that of OTHER users. The chi-square test confirms this difference is real and not random. The p-value is essentially zero.
But look at Cramer's V: it is 0.0112. Association strength: NEGLIGIBLE.
This seems contradictory. How can something be statistically significant and negligible at the same time?
Think of it this way. A hospital-grade scale can detect the weight of a single sheet of paper. If you weigh yourself before and after drinking a glass of water, the scale will show a real measurable difference. But you would not say water makes you gain weight. The difference is real but tiny in practical terms.
The chi-square test is that hospital scale. With 6.3 million transactions, it finds real patterns in almost anything. Cramer's V asks whether that pattern is strong enough to matter. At 0.0112 out of a possible 1.0, the answer is no. Transaction type alone is a negligible signal in practical terms.
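The hospital-scale effect can be demonstrated directly: scale the same proportions up to a larger sample and chi-square grows linearly with n, while Cramer's V never moves. A small sketch with made-up proportions (not the dataset above):

```python
import numpy as np
from scipy import stats

# Hypothetical 2x2 table of proportions: rows = group, cols = [legit, fraud]
base = np.array([[990, 10],
                 [980, 20]])

# Same proportions at three sample sizes: chi-square (and significance)
# scales with n, but the effect size V stays fixed.
for scale in (1, 100, 1000):
    ct = base * scale
    chi2, p = stats.chi2_contingency(ct, correction=False)[:2]
    n = ct.sum()
    v = np.sqrt((chi2 / n) / (min(ct.shape) - 1))
    print(f"n={n:>9,}  chi2={chi2:>10.1f}  p={p:.2e}  V={v:.4f}")
```

At n = 2,000 this pattern is not even significant at the 0.05 level; at n = 2,000,000 the p-value is astronomically small. V is identical in every row, because the underlying relationship never changed — only the resolution of the scale did.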
Proxy 2: Account Balance Tier
Now look at the balance tier results. The chi-square statistic is 10,494.67. The p-value is effectively zero. This pattern is real and structural.
Cramer's V is 0.0406. Still NEGLIGIBLE by our threshold but meaningfully stronger than the transaction type result. And in the context of model training, the practical consequence is severe.
The model will see 7,150 High-Balance fraud examples and just 41 Low-Balance fraud examples. It will learn what High-Balance fraud looks like with high confidence. For Low-Balance fraud, it will have almost nothing to anchor its learning.
This is not a flaw in the algorithm we are about to train. It is a problem with the data the algorithm is being asked to learn from. And it is the exact kind of structural problem that a pre-training audit is designed to catch before it becomes a deployed system affecting real users.