Model Training

What this section does:

Now we train the model, logging every decision and its rationale as we go. This is the audit record of how the model was built.

Why XGBoost?

XGBoost is one of the most widely deployed fraud detection algorithms in financial services globally, including in Ghanaian banking. It builds hundreds of small decision trees and combines their outputs into one strong prediction. Think of it like consulting 300 specialists and taking a weighted vote rather than relying on a single analyst.

It handles imbalanced data reasonably well, runs fast on large datasets, and produces probability scores that can be calibrated to different risk thresholds.
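To make the threshold point concrete, here is a minimal, self-contained sketch on synthetic data (names like X_demo are illustrative and not from this notebook; the real training happens in the cells below). The same fitted model flags very different numbers of transactions depending on where the probability cutoff is set.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Synthetic, heavily imbalanced data standing in for transactions (~99% legitimate).
X_demo, y_demo = make_classification(n_samples=20_000, n_features=6,
                                     weights=[0.99, 0.01], random_state=42)
X_tr_d, X_te_d, y_tr_d, y_te_d = train_test_split(X_demo, y_demo, test_size=0.2,
                                                  stratify=y_demo, random_state=42)

clf = XGBClassifier(n_estimators=50, max_depth=3, eval_metric='aucpr', verbosity=0)
clf.fit(X_tr_d, y_tr_d)

scores = clf.predict_proba(X_te_d)[:, 1]      # one fraud probability per row
for threshold in (0.9, 0.5, 0.1):             # stricter -> looser cutoffs
    print(f'threshold {threshold:.1f} -> {(scores >= threshold).sum()} rows flagged')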

Why SMOTE — and what it cannot fix

With only 0.13% of transactions being fraud, a naive model would call everything legitimate and achieve 99.87% accuracy. SMOTE (Synthetic Minority Oversampling Technique) creates artificial fraud examples to give the model more fraud patterns to study. It is applied only to the training data, never the test data.

But here is the critical limitation that connects directly to our core finding.

SMOTE creates synthetic examples by interpolating between existing fraud cases. It does not know about balance tiers. It just sees the feature space and creates new fraud-like points between existing ones.

The problem: 87% of fraud examples are from High-Balance accounts. When SMOTE creates synthetic fraud, it overwhelmingly generates more High-Balance fraud patterns because that is what it has to work with. The approximately 35 Low-Balance fraud cases in the training set are so outnumbered that SMOTE barely amplifies them at all. In fact, there are 164 High-Balance fraud examples for every single Low-Balance one.

SMOTE addresses the overall class imbalance. It does not address the within-fraud imbalance between balance tiers. If anything, it subtly amplifies the existing skew by creating thousands of synthetic High-Balance fraud examples while producing almost none for Low-Balance users.
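A minimal sketch of that interpolation step, using hypothetical feature values rather than real transactions, shows why synthetic points inherit the balance profile of their neighbours:

import numpy as np

rng = np.random.default_rng(42)

# Hypothetical High-Balance fraud case and its nearest fraud neighbour
# (feature order: amount, oldbalanceOrg) -- values are illustrative only.
fraud_case     = np.array([2_000_000.0, 1_500_000.0])
fraud_neighbor = np.array([1_800_000.0, 1_200_000.0])

lam = rng.uniform(0.0, 1.0)                   # SMOTE draws lambda uniformly from [0, 1]
synthetic = fraud_case + lam * (fraud_neighbor - fraud_case)
print(f'lambda = {lam:.2f} -> synthetic fraud point = {synthetic}')

# The synthetic point lies on the segment between two High-Balance cases, so it is
# itself High-Balance-looking. With 164 High-Balance fraud cases for every Low-Balance
# one, almost every neighbour pair SMOTE interpolates between looks like this.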

This is not a criticism of SMOTE. It is a structural problem with the data. We document it here so that the results in the fairness and bias section are not a surprise.


In [5]:
# ── SECTION 1: LOAD CHECKPOINT ───────────────────────────────────────────────

import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
# Balance-tier cut points and labels carried over from the EDA section
# (used below to rebuild the 'balance_group' proxy feature)
bins_proxy   = [-np.inf, 0.0, 50397.0, np.inf]
labels_proxy = ['Low-Balance', 'Mid-Balance', 'High-Balance']

df = pd.read_csv('checkpoint.csv')

print(f'Checkpoint loaded: {df.shape[0]:,} rows x {df.shape[1]} columns')
print(df.columns.tolist())
Checkpoint loaded: 6,362,620 rows x 11 columns
['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']
In [6]:
# ── DEFINITIONS & FEATURE ENGINEERING ────────────────────────────────

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

RANDOM_STATE = 42

# 1. Create the 'balance_group' and 'tx_type_group' features used in Section 6
# (based on the bins/labels defined in the checkpoint-load cell above)
df['balance_group'] = pd.cut(df['oldbalanceOrg'], bins=bins_proxy, labels=labels_proxy)

# Simple grouping for tx_type: Fraud usually happens in TRANSFER or CASH_OUT
df['tx_type_group'] = df['type'].apply(lambda x: 'High-Risk' if x in ['TRANSFER', 'CASH_OUT'] else 'Low-Risk')

# 2. Define which columns the model will actually use (FEATURES)
# We exclude names (privacy) and the target variable (isFraud)
FEATURES = [
    'step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 
    'oldbalanceDest', 'newbalanceDest'
]

# Note: XGBoost needs numeric inputs, so including 'type' would require encoding it
# first (see the sketch after this cell). For now, these numeric features are the core.
print("Feature Engineering complete. 'FEATURES' list defined.")
Feature Engineering complete. 'FEATURES' list defined.
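Not part of this audit run, but for reference: if a later iteration wants 'type' as a model input, one-hot encoding it with pandas (assuming the df and FEATURES defined above) could look roughly like this.

# Sketch only, not used in this audit: add 'type' as one-hot encoded columns.
type_dummies = pd.get_dummies(df['type'], prefix='type')        # e.g. type_TRANSFER, type_CASH_OUT, ...
X_with_type  = pd.concat([df[FEATURES], type_dummies], axis=1)  # numeric core + encoded transaction type
print(X_with_type.columns.tolist())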
In [10]:
# ── SECTION 6: MODEL TRAINING AUDIT ──────────────────────────────────────────

X         = df[FEATURES].copy()
y         = df['isFraud'].copy()
sensitive = df[['tx_type_group', 'balance_group']].copy()

print('Step 1: Train/Test Split (80% train, 20% test)')
print('  stratify=y ensures both halves preserve the same fraud ratio.')
X_train, X_test, y_train, y_test, s_train, s_test = train_test_split(
    X, y, sensitive,
    test_size=0.20, random_state=RANDOM_STATE, stratify=y
)
print(f'  Training set : {len(X_train):>10,} rows | Fraud: {y_train.sum():,}')
print(f'  Test set     : {len(X_test):>10,} rows | Fraud: {y_test.sum():,}')

# Count Low-Balance fraud in training set for our narrative
lb_train_fraud = y_train[s_train['balance_group'].astype(str) == 'Low-Balance'].sum()
hb_train_fraud = y_train[s_train['balance_group'].astype(str) == 'High-Balance'].sum()
print(f'\n  Low-Balance fraud cases in training   : {lb_train_fraud}')
print(f'  High-Balance fraud cases in training  : {hb_train_fraud:,}')
print(f'  Ratio: {hb_train_fraud/lb_train_fraud:.0f} High-Balance fraud examples for every 1 Low-Balance.')
print('\n  This confirms the structural problem identified in the EDA Section.')
print('  SMOTE will be applied next but cannot close this gap.')

print('\nStep 2: SMOTE (synthetic fraud examples for training data only)')
print('  sampling_strategy=0.1 resamples fraud to 10% of the legitimate count (~9% of the training set).')
smote = SMOTE(random_state=RANDOM_STATE, sampling_strategy=0.1)
X_tr, y_tr = smote.fit_resample(X_train, y_train)
print(f'  Before SMOTE: {len(X_train):,} training rows | Fraud: {y_train.sum():,}')
print(f'  After SMOTE : {len(X_tr):,} training rows | Fraud: {y_tr.sum():,}')
print('  Note: SMOTE interpolates between existing fraud cases.')
print('  Since 87% of fraud is High-Balance, most synthetic cases are High-Balance patterns.')
print('  Low-Balance fraud remains severely underrepresented even after SMOTE.')

print('\nStep 3: Training XGBoost (300 trees)')
neg = (y_tr == 0).sum()
pos = (y_tr == 1).sum()

model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=neg / pos,
    eval_metric='aucpr',
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbosity=0
)
model.fit(X_tr, y_tr)
print('  Model trained. 300 decision trees.')

print('\nStep 4: Generating predictions on test set')
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(f'  Model flagged : {y_pred.sum():,} transactions as fraud')
print(f'  Actual fraud  : {y_test.sum():,} transactions')
print('\nModel training complete. Ready for audit.')
Step 1: Train/Test Split (80% train, 20% test)
  stratify=y ensures both halves preserve the same fraud ratio.
  Training set :  5,090,096 rows | Fraud: 6,570
  Test set     :  1,272,524 rows | Fraud: 1,643

  Low-Balance fraud cases in training   : 35
  High-Balance fraud cases in training  : 5,738
  Ratio: 164 High-Balance fraud examples for every 1 Low-Balance.

  This confirms the structural problem identified in the EDA Section.
  SMOTE will be applied next but cannot close this gap.

Step 2: SMOTE (synthetic fraud examples for training data only)
  sampling_strategy=0.1 resamples fraud to 10% of the legitimate count (~9% of the training set).
  Before SMOTE: 5,090,096 training rows | Fraud: 6,570
  After SMOTE : 5,591,878 training rows | Fraud: 508,352
  Note: SMOTE interpolates between existing fraud cases.
  Since 87% of fraud is High-Balance, most synthetic cases are High-Balance patterns.
  Low-Balance fraud remains severely underrepresented even after SMOTE.

Step 3: Training XGBoost (300 trees)
  Model trained. 300 decision trees.

Step 4: Generating predictions on test set
  Model flagged : 8,248 transactions as fraud
  Actual fraud  : 1,643 transactions

Model training complete. Ready for audit.
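To quantify the claim that the synthetic fraud is overwhelmingly High-Balance, one could re-derive the balance tier of each fraud row in the resampled training set from its oldbalanceOrg value, using the same bins as above. A sketch, assuming X_tr and y_tr from the training cell (wrapped in pandas objects in case the installed imbalanced-learn version returns NumPy arrays):

# Sketch: balance-tier mix of fraud rows after SMOTE.
X_tr_df = pd.DataFrame(np.asarray(X_tr), columns=FEATURES)
y_tr_s  = pd.Series(np.asarray(y_tr))

fraud_after_smote = X_tr_df.loc[y_tr_s == 1, 'oldbalanceOrg']
tier_counts = pd.cut(fraud_after_smote, bins=bins_proxy, labels=labels_proxy).value_counts()
print(tier_counts)
# Expected result, given the analysis above: High-Balance dominates, because the
# synthetic cases were interpolated from a fraud pool that is 87% High-Balance.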
In [8]:
df.to_csv('checkpoint_v2.csv', index=False)
print("Checkpoint v2 saved.")
Checkpoint v2 saved.