FEATURE ENGINEERING¶

What this section does:

Feature engineering is where raw data gets transformed into the inputs a machine learning model can learn from. This is also where many bias problems get introduced or amplified without anyone noticing.

We document every transformation here, explain why it was made, and flag any risk it carries.

A note on encoding:

You might wonder why we do not one-hot encode the transaction type column. One-hot encoding converts a categorical column with N categories into N separate binary columns. For a 5-category column like type, that means 5 new columns.

For linear models this is necessary because those models treat numeric inputs as ordinal, i.e. the number 3 is "higher" than 2. Encoding TRANSFER as 4 and PAYMENT as 3 would imply TRANSFER is somehow greater than PAYMENT, which is nonsensical.

XGBoost does not have this problem. It builds decision trees by finding the best split point for each feature at each node. It discovers the optimal split empirically from the data without assuming numeric order means anything. One-hot encoding with XGBoost would add 4 extra columns, increase memory use, and slow training with no performance benefit. Label encoding is the correct choice here.
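To make the trade-off concrete, here is a minimal sketch contrasting the two encodings on a toy series (not the real column — just three of the five transaction types):

```python
import pandas as pd
from sklearn.preprocessing import LabelEncoder

types = pd.Series(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'PAYMENT'], name='type')

# Label encoding: one integer column — what we use for XGBoost.
# LabelEncoder assigns codes in alphabetical order of the classes.
le = LabelEncoder()
encoded = le.fit_transform(types)   # CASH_OUT=0, PAYMENT=1, TRANSFER=2

# One-hot encoding: one binary column per category — what a linear model needs.
onehot = pd.get_dummies(types, prefix='type')

print(list(encoded))    # [1, 2, 0, 1]
print(onehot.shape)     # (4, 3) — 3 categories become 3 columns
```

On the full 5-category column, the one-hot version would carry five columns where label encoding carries one, which is the memory and training-time cost described above.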


In [6]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

bins_proxy   = [-np.inf, 0.0, 50397.0, np.inf]
labels_proxy = ['Low-Balance', 'Mid-Balance', 'High-Balance']

DATA_FILE = 'PS_20174392719_1491204439457_log.csv'
df = pd.read_csv(DATA_FILE)


# ── SECTION 5: FEATURE ENGINEERING ───────────────────────────────────────────
print('Step 1: Encoding transaction type...')

le = LabelEncoder()
df['type_encoded'] = le.fit_transform(df['type'])

print(f'  Encoding map: {dict(zip(le.classes_, le.transform(le.classes_)))}')
print(f'  Method: Label Encoding (not one-hot)')
print(f'  Reason: XGBoost uses tree-based splits and does not need ordinal separation.')

print('\nStep 2: Engineering balance-derived features...')
print('  These capture the behavioural signature of fraud.')
print('  Fraud often involves abnormal movement or disappearance of funds.\n')

# Balance movement features
df['balance_diff_orig'] = df['oldbalanceOrg'] - df['newbalanceOrig']
df['balance_diff_dest'] = df['newbalanceDest'] - df['oldbalanceDest']

print('  balance_diff_orig: Drop in sender balance.')
print('  Large positive values may indicate account drainage.')

# Ratio guard: the np.where branch already ensures oldbalanceOrg > 0;
# the 1e-6 epsilon is purely defensive against float edge cases.
df['amount_to_orig_balance'] = np.where(
    df['oldbalanceOrg'] > 0,
    df['amount'] / (df['oldbalanceOrg'] + 1e-6),
    0
)

print('\n  amount_to_orig_balance: Fraction of sender balance being moved.')
print('  Values close to 1.0 indicate full account depletion.')

df['orig_balance_zeroed'] = (df['newbalanceOrig'] == 0).astype(int)

print('\n  orig_balance_zeroed: Did sender account end at zero?')
print('  Captures account depletion behavior seen more often in fraud.')

df['dest_balance_zeroed'] = (df['oldbalanceDest'] == 0).astype(int)

print('\n  dest_balance_zeroed: Was destination empty before receiving funds?')

print('\n  hour_of_day: Already engineered in the EDA section.')
print('  Captures time-based fraud patterns within a 24-hour cycle.')

print('\nStep 3: Defining model features...')

FEATURES = [
    'step', 'type_encoded', 'amount',
    'hour_of_day',
    'oldbalanceOrg', 'newbalanceOrig',
    'oldbalanceDest', 'newbalanceDest',
    'balance_diff_orig', 'balance_diff_dest',
    'amount_to_orig_balance',
    'orig_balance_zeroed', 'dest_balance_zeroed'
]

print(f'  Total features: {len(FEATURES)}')

for f in FEATURES:
    print(f'    {f}')


# EXCLUDED features — document why
EXCLUDED = {
    'isFlaggedFraud': 'Data leakage — another system output on same data',
    'nameOrig':       'Customer ID — overfitting and privacy risk (BoG CISD 2026 §99)',
    'nameDest':       'Customer ID — overfitting and privacy risk (BoG CISD 2026 §99)',
    'type':           'Raw string replaced by type_encoded (numeric version)',
    'tx_type_group':  'Proxy variable — fairness testing only, never a model feature',
    'balance_group':  'Proxy variable — fairness testing only, never a model feature',
}

print('\nExcluded from model (with reasons):')
for col, reason in EXCLUDED.items():
    print(f'  {col:<20} {reason}')

print('\nFeature engineering complete.')
Step 1: Encoding transaction type...
  Encoding map: {'CASH_IN': np.int64(0), 'CASH_OUT': np.int64(1), 'DEBIT': np.int64(2), 'PAYMENT': np.int64(3), 'TRANSFER': np.int64(4)}
  Method: Label Encoding (not one-hot)
  Reason: XGBoost uses tree-based splits and does not need ordinal separation.

Step 2: Engineering balance-derived features...
  These capture the behavioural signature of fraud.
  Fraud often involves abnormal movement or disappearance of funds.

  balance_diff_orig: Drop in sender balance.
  Large positive values may indicate account drainage.

  amount_to_orig_balance: Fraction of sender balance being moved.
  Values close to 1.0 indicate full account depletion.

  orig_balance_zeroed: Did sender account end at zero?
  Captures account depletion behavior seen more often in fraud.

  dest_balance_zeroed: Was destination empty before receiving funds?

  hour_of_day: Already engineered in the EDA section.
  Captures time-based fraud patterns within a 24-hour cycle.

Step 3: Defining model features...
  Total features: 13
    step
    type_encoded
    amount
    hour_of_day
    oldbalanceOrg
    newbalanceOrig
    oldbalanceDest
    newbalanceDest
    balance_diff_orig
    balance_diff_dest
    amount_to_orig_balance
    orig_balance_zeroed
    dest_balance_zeroed

Excluded from model (with reasons):
  isFlaggedFraud       Data leakage — another system output on same data
  nameOrig             Customer ID — overfitting and privacy risk (BoG CISD 2026 §99)
  nameDest             Customer ID — overfitting and privacy risk (BoG CISD 2026 §99)
  type                 Raw string replaced by type_encoded (numeric version)
  tx_type_group        Proxy variable — fairness testing only, never a model feature
  balance_group        Proxy variable — fairness testing only, never a model feature

Feature engineering complete.

Feature Engineering Audit Summary¶

13 features will be used for training. All are derived from legitimate transaction behaviour. None encode protected attributes directly.

Two proxy variables (tx_type_group, balance_group) are deliberately excluded from model features. The model must never see the variables we use to test it. If we trained on them, we would be influencing the model rather than auditing it.

One documented risk: The feature amount_to_orig_balance uses account balance as its denominator. For a high-balance user, a large transaction produces a small ratio. For a low-balance user, the same transaction produces a large ratio that looks like an account drain. This creates an implicit correlation between the feature and our balance tier proxy.
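A tiny numeric sketch of that asymmetry (the balances and amount are illustrative figures, not drawn from the dataset):

```python
# Same transaction amount sent from two different accounts.
high_balance, low_balance, amount = 100_000.0, 5_200.0, 5_000.0

ratio_high = amount / high_balance   # 0.05  — looks routine
ratio_low  = amount / low_balance    # ~0.96 — looks like an account drain

print(ratio_high, ratio_low)
```

The feature value differs by a factor of roughly twenty for identical behaviour, purely because of the account's balance tier.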

This is not a reason to exclude the feature; it is a genuine fraud signal. But it is a reason to watch what SHAP tells us about this feature. If the model weights it heavily, it may be partially driving the fairness failure we expect to find in the fairness and bias section.
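One way to quantify that correlation ahead of the SHAP analysis is to compare the feature's median across balance tiers. A sketch on synthetic data follows — column names mirror the real dataset and the proxy bins match the ones defined in the cell above, but the balances and amounts here are made up:

```python
import numpy as np
import pandas as pd

# Synthetic stand-in for the real frame (illustrative distributions).
rng = np.random.default_rng(42)
df = pd.DataFrame({
    'oldbalanceOrg': rng.lognormal(mean=9, sigma=2, size=1000),
    'amount': rng.lognormal(mean=8, sigma=1.5, size=1000),
})

df['amount_to_orig_balance'] = np.where(
    df['oldbalanceOrg'] > 0,
    df['amount'] / (df['oldbalanceOrg'] + 1e-6),
    0,
)

# Same proxy bins used for fairness testing.
bins_proxy = [-np.inf, 0.0, 50397.0, np.inf]
labels_proxy = ['Low-Balance', 'Mid-Balance', 'High-Balance']
df['balance_group'] = pd.cut(df['oldbalanceOrg'],
                             bins=bins_proxy, labels=labels_proxy)

# If the feature tracks the proxy, the per-tier medians diverge sharply.
print(df.groupby('balance_group', observed=True)
        ['amount_to_orig_balance'].median())
```

If the Mid-Balance median sits far above the High-Balance median, the feature is structurally entangled with the proxy, which is exactly the condition under which heavy SHAP weight on it would warrant concern.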