import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import geopy.distance as distance
from geopy import Point

pd.set_option('display.max_column', None)
pd.set_option('display.max_row', None)
import warnings
warnings.filterwarnings('ignore')

from config import TEST_DATA_PATH, TRAIN_DATA_PATH
train = pd.read_csv(TEST_DATA_PATH, parse_dates=['trans_date_trans_time', 'dob'])
test = pd.read_csv(TRAIN_DATA_PATH, parse_dates=['trans_date_trans_time', 'dob'])
data = pd.concat([train, test], ignore_index=False)
data.drop(columns=['Unnamed: 0'], inplace=True)

from IPython.display import display, HTML

# Function to display dataframes side by side
def display_side_by_side(*args):
    html_str = ''
    for df in args:
        html_str += df.to_html() + '\xa0\xa0\xa0'  # '\xa0' is a non-breaking space
    display(HTML('<div style="display: flex;">' + html_str + '</div>'))

print(data.info())
print(f"Number of columns that have missing values: {data.isna().any().sum()}")

<class 'pandas.core.frame.DataFrame'>
Index: 1852394 entries, 0 to 1296674
Data columns (total 22 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   trans_date_trans_time  datetime64[ns]
 1   cc_num                 int64         
 2   merchant               object        
 3   category               object        
 4   amt                    float64       
 5   first                  object        
 6   last                   object        
 7   gender                 object        
 8   street                 object        
 9   city                   object        
 10  state                  object        
 11  zip                    int64         
 12  lat                    float64       
 13  long                   float64       
 14  city_pop               int64         
 15  job                    object        
 16  dob                    datetime64[ns]
 17  trans_num              object        
 18  unix_time              int64         
 19  merch_lat              float64       
 20  merch_long             float64       
 21  is_fraud               int64         
dtypes: datetime64[ns](2), float64(5), int64(5), object(10)
memory usage: 325.1+ MB
None
Number of columns that have missing values: 0

#Getting first and last fraud transaction of each cc_num
first_fraud = data[data['is_fraud'] == 1].groupby("cc_num")["trans_date_trans_time"].min().reset_index().rename(columns={'trans_date_trans_time': 'first_fraud'})
last_fraud = data[data['is_fraud'] == 1].groupby("cc_num")["trans_date_trans_time"].max().reset_index().rename(columns={'trans_date_trans_time': 'last_fraud'})

#Filter records that between first fraud transaction and last fraud transaction of each cc_num 
data = data.merge(first_fraud, on='cc_num', how='left').merge(last_fraud, on='cc_num', how='left')
date_filter = (data['trans_date_trans_time'] >= data['first_fraud']) & (data['trans_date_trans_time'] <= data['last_fraud'])

data[date_filter][data['is_fraud'] == 0]

#Sort values ​​to calculate time since last order
data = data.sort_values(['cc_num', 'trans_date_trans_time'])

#Lag transaction time
data['lag'] = data.groupby('cc_num')['trans_date_trans_time'].shift(1)

#Convert timedelta to hours
data['diff'] = (data['trans_date_trans_time'] - data['lag'])/np.timedelta64(1, 'h')

print(f"Average time between transaction of fraudulent users: {data[data['is_fraud'] == 1]['diff'].mean()}\
        \nAverage time between transaction of non-fraudulent users: {data[data['is_fraud'] == 0]['diff'].mean()}")

Average time between transaction of fraudulent users: 5.868483873779637        
Average time between transaction of non-fraudulent users: 8.608448378021551

import itertools
palette = itertools.cycle(sns.color_palette("flare"))

#Extract Month from transaction time
data['month']=pd.DatetimeIndex(data['trans_date_trans_time']).month

#Extract Day from transaction time
data['day']=pd.DatetimeIndex(data['trans_date_trans_time']).day

#Extract Hour from transaction time
data['hour']=pd.DatetimeIndex(data['trans_date_trans_time']).hour

#Create subplots
fig, ax = plt.subplots(1, 3, figsize=(24, 6), gridspec_kw={'width_ratios': [0.8, 1.2, 1]})
fig.suptitle('Distribution of Fraud Transactions by Month, Day and Hour')

time_list = ['month', 'day', 'hour']
for x in time_list:
    c = next(palette)
    sns.barplot(ax=ax[time_list.index(x)], x=x, y='trans_num', color=c,
                data=data[data["is_fraud"]==1].groupby(x)["trans_num"].count().reset_index()) #Count the number of fraudulent transactions occurring in months, days and hours
    ax[time_list.index(x)].set_title(x)
    ax[time_list.index(x)].set_ylabel('Number of Transactions')
    plt.xlabel(x)

print(data.groupby("cc_num")['is_fraud'].max().reset_index().groupby("is_fraud").count())

          cc_num
is_fraud        
0             23
1            976

#Calculate fraud rate by job and visualize on scatter plot between number of transactions and fraud rate group by job
sns.scatterplot(data=data.groupby('job').agg({'is_fraud':'mean', 'trans_num':'count'}), x="is_fraud", y="trans_num")
print(data.groupby('job').agg({'is_fraud':'mean', 'trans_num':'count'}).sort_values(by="trans_num",ascending=True).head(30))
plt.title("Number of Transactions by Fraud label")
plt.xlabel("Fraud Label")
plt.ylabel("Number of Transactions")
plt.show()

                                   is_fraud  trans_num
job                                                   
Ship broker                        1.000000          7
Warehouse manager                  1.000000          7
Contracting civil engineer         1.000000          7
Armed forces technical officer     1.000000          8
Engineer, water                    1.000000          8
Information officer                1.000000          8
Veterinary surgeon                 1.000000          8
Forest/woodland manager            1.000000          9
Broadcast journalist               1.000000          9
Industrial buyer                   1.000000         10
Solicitor                          1.000000         11
Accountant, chartered              1.000000         11
Software engineer                  1.000000         11
Homeopath                          1.000000         11
Operational investment banker      1.000000         11
Personnel officer                  1.000000         12
Legal secretary                    1.000000         12
Engineer, site                     1.000000         12
Sales promotion account executive  1.000000         14
Careers adviser                    1.000000         15
Air traffic controller             1.000000         17
Dancer                             1.000000         19
Engineer, materials                0.009576        731
Merchandiser, retail               0.008197        732
Local government officer           0.005464        732
Estate manager/land agent          0.008186        733
Investment banker, operational     0.009550        733
Water quality scientist            0.009537        734
Professor Emeritus                 0.005450        734
Engineer, civil (consulting)       0.012262        734

#Generate new column based on jobs that has fraud rate = 100% and other jobs
fraud_rate = data.groupby('job').agg({'is_fraud': 'mean', 'trans_num': 'count'}).reset_index()
high_rate_jobs = list(fraud_rate[fraud_rate["is_fraud"] == 1]["job"])
data["job"] = data["job"].apply(lambda x: 'Other' if x not in high_rate_jobs else x)

#Calculate fraud rate and the average amount of money by product category
print("Average fraud rate of each category:\n", data.groupby('category')['is_fraud'].mean(),
        "\nAverage amount spent in each category for fraudulent group:\n", data[data['is_fraud'] == 1].groupby("category")['amt'].mean(), 
        "\nAverage amount spent in each category for non-fraudulent group:\n", data[data['is_fraud'] == 0].groupby("category")['amt'].mean())

Average fraud rate of each category:
 category
entertainment     0.002177
food_dining       0.001568
gas_transport     0.004106
grocery_net       0.002697
grocery_pos       0.012645
health_fitness    0.001510
home              0.001510
kids_pets         0.001880
misc_net          0.013039
misc_pos          0.002819
personal_care     0.002229
shopping_net      0.015927
shopping_pos      0.006344
travel            0.002692
Name: is_fraud, dtype: float64 
Average amount spent in each category for fraudulent group:
 category
entertainment     504.794384
food_dining       120.678829
gas_transport      12.231256
grocery_net        12.046914
grocery_pos       312.237262
health_fitness     20.277622
home              257.479094
kids_pets          18.485296
misc_net          798.654577
misc_pos          212.716894
personal_care      26.110207
shopping_net      998.128725
shopping_pos      878.913201
travel              8.970962
Name: amt, dtype: float64 
Average amount spent in each category for non-fraudulent group:
 category
entertainment      63.181494
food_dining        50.884652
gas_transport      63.688542
grocery_net        53.801150
grocery_pos       114.135071
health_fitness     54.139079
home               57.886732
kids_pets          57.601378
misc_net           70.689738
misc_pos           62.252334
personal_care      48.096911
shopping_net       72.194496
shopping_pos       73.799294
travel            112.048612
Name: amt, dtype: float64

# Divide between categories with low transaction value in fraud transactions
low_value_cat = ['grocery_net', 'gas_transport', 'kids_pets', 'personal_care', 'health_fitness', 'travel']
high_value_cat = ['entertainment', 'food_dining', 'home', 'grocery_pos', 'misc_net', 'shopping_net', 'shopping_pos', 'misc_pos']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24,10))
fig.suptitle('Distribution of Amounts by Category')

# Visualize low value category on boxplot
sns.boxplot(ax=ax1, data=data[data['category'].isin(low_value_cat)], x="amt", y="category", whis=[0.1, 99.9], width=1, hue="is_fraud", fill=True, gap=.35, fliersize=0)
ax1.set_title("Low value Category")
ax1.set_xlabel('amount')
ax1.set_xlim(-10, 400)

# Visualize high value category on boxplot
sns.boxplot(ax=ax2, data=data[data['category'].isin(high_value_cat)], x="amt", y="category", whis=[0.1, 99.9], width=1, hue="is_fraud", fill=True, gap=.35, fliersize=0)
ax2.set_title("High value Category")
ax2.set_xlabel('amount')
ax2.set_xlim(-50, 2000)
plt.show()

#Calculate fraud rate by state
print(data.groupby("state").agg({'is_fraud':'mean', 'trans_num':'count'}).sort_values(by='is_fraud', ascending=False).head(10))

       is_fraud  trans_num
state                     
DE     1.000000          9
RI     0.020134        745
AK     0.016875       2963
OR     0.007460      26408
NH     0.006737      11727
VA     0.006538      41756
TN     0.006382      24913
NE     0.006275      34425
MN     0.006163      45433
NY     0.006113     119419

#Calculate fraud rate by merchant
print(data.groupby("merchant").agg({'is_fraud':'mean', 'trans_num':'count'}).sort_values(by='is_fraud', ascending=False).head(10))

                                            is_fraud  trans_num
merchant                                                       
fraud_Kozey-Boehm                           0.021755       2758
fraud_Herman, Treutel and Dickens           0.020321       1870
fraud_Terry-Huel                            0.019553       2864
fraud_Kerluke-Abshire                       0.018975       2635
fraud_Mosciski, Ziemann and Farrell         0.018788       2821
fraud_Schmeler, Bashirian and Price         0.018651       2788
fraud_Kuhic LLC                             0.018649       2842
fraud_Jast Ltd                              0.018498       2757
fraud_Langworth, Boehm and Gulgowski        0.018459       2817
fraud_Romaguera, Cruickshank and Greenholt  0.018432       2767

#Credit card type based on number of numbers
data['cc_type'] = data['cc_num'].astype(str).apply(len)

#Calculate Age of users based on dob
data['age'] = ((data.trans_date_trans_time - data.dob)/np.timedelta64(1, 'D')).round(0) / 365

#Calculate the distance from the user's address to the merchant's address based on latitude and longitude
data['user_coords'] = data.apply(lambda row: Point(latitude=row['lat'], longitude=row['long']), axis=1)
data['merch_coords'] = data.apply(lambda row: Point(latitude=row['merch_lat'], longitude=row['merch_long']), axis=1)
data['distance'] = data.apply(lambda row: distance.distance(row['user_coords'], row['merch_coords']).km, axis=1)

#Check the correlation between the 3 newly created columns and the is_fraud column
sns.heatmap(data=data[['distance', 'age', 'cc_type', 'is_fraud']].corr(), annot=True, cmap='crest', fmt=".4f", vmax=0.05)
plt.title("Correlation of 3 features with the target variable")
plt.show()

fit_data = data[['amt', 'hour', 'category', 'job', 'merch_lat', 'merch_long', 'lat', 'long', 
                'is_fraud', 'diff', 'age', 'city_pop', 'month', 'day']]

# One-hot encoding
fit_data = pd.get_dummies(fit_data, columns=['job', 'category'])

#Split X and y
X = fit_data.drop('is_fraud', axis=1).values
y = fit_data['is_fraud'].values

# Train and test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=41)

#Building DecisionTree and calculate recall score
from sklearn.metrics import recall_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()

# Define the KFold cross-validator
kf = KFold(n_splits=5, random_state=3, shuffle=True)

# Perform KFold cross-validation
scores = cross_val_score(tree, X_train, y_train, cv=kf, scoring='f1')

# Print the recall score for each fold
for i, score in enumerate(scores, 1):
    print(f"F1 Score for Fold {i}: {score}")

# Fit the model and make predictions
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

print(f"Recall Score for DecisionTree: {recall_score(y_test, tree_pred)}")
print(f"Confusion Matrix for DecisionTree: \n{confusion_matrix(y_test, tree_pred)}")
print(f"Classification Report for DecisionTree: \n{classification_report(y_test, tree_pred)}")

F1 Score for Fold 1: 0.7910288358846564
F1 Score for Fold 2: 0.8056318681318682
F1 Score for Fold 3: 0.8057029177718833
F1 Score for Fold 4: 0.7888198757763976
F1 Score for Fold 5: 0.775
Recall Score for DecisionTree: 0.814512841418671
Confusion Matrix for DecisionTree: 
[[460143    503]
 [   455   1998]]
Classification Report for DecisionTree: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    460646
           1       0.80      0.81      0.81      2453

    accuracy                           1.00    463099
   macro avg       0.90      0.91      0.90    463099
weighted avg       1.00      1.00      1.00    463099

#Building XGBoost and calculate recall score
from xgboost.sklearn import XGBClassifier

XGB = XGBClassifier()

# Define the KFold cross-validator
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Perform KFold cross-validation
scores = cross_val_score(XGB, X_train, y_train, cv=kf, scoring='f1')

# Print the recall score for each fold
for i, score in enumerate(scores, 1):
    print(f"F1 Score for Fold {i}: {score}")

# Fit the model and make predictions
XGB.fit(X_train, y_train)
xgb_pred = XGB.predict(X_test)

print(f"Recall Score for XGBoost: {recall_score(y_test, xgb_pred)}")
print(f"Confusion Matrix for XGBoost: \n{confusion_matrix(y_test, xgb_pred)}")
print(f"Classification Report for XGBoost: \n{classification_report(y_test, xgb_pred)}")

F1 Score for Fold 1: 0.8851126708533432
F1 Score for Fold 2: 0.8901645618063528
F1 Score for Fold 3: 0.8840792369772561
F1 Score for Fold 4: 0.8916184971098265
F1 Score for Fold 5: 0.8874172185430463
Recall Score for XGBoost: 0.8560945780676722
Confusion Matrix for XGBoost: 
[[460530    116]
 [   353   2100]]
Classification Report for XGBoost: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    460646
           1       0.95      0.86      0.90      2453

    accuracy                           1.00    463099
   macro avg       0.97      0.93      0.95    463099
weighted avg       1.00      1.00      1.00    463099

import optuna
from optuna.samplers import TPESampler

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=3)

def objective(trial):
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1e-6, 500.0, log=True)
    }
    
    model = XGBClassifier(**params, n_jobs=12)
    model.fit(X_train, y_train)
    
    preds = model.predict(X_val)
    recall = recall_score(y_val, preds)
    
    return recall

optuna.logging.set_verbosity(optuna.logging.WARNING)

sampler = TPESampler(seed=1)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=100)

best_params = study.best_params
best_score = study.best_value

print(f"Best parameters: {best_params}")
print(f"Best score: {best_score}")

Best parameters: {'learning_rate': 0.6999200317633218, 'n_estimators': 871, 'max_depth': 10, 'min_child_weight': 1, 'gamma': 0.8162411486929347, 'colsample_bytree': 0.7976580708126682, 'subsample': 0.221314310258453, 'reg_alpha': 3.126862395569087e-07, 'reg_lambda': 3.5472136483599835e-05, 'scale_pos_weight': 469.1362112154397}
Best score: 1.0

XGB_tuned = XGBClassifier(n_estimators=900, 
                        learning_rate=0.05, 
                        max_depth=10, 
                        min_child_weight=1, 
                        gamma=0.8, 
                        colsample_bytree=0.8, 
                        subsample=0.2, 
                        reg_alpha=0, 
                        reg_lambda=0, 
                        sacle_pos_weight=450,
                        n_jobs=12)

proba_tuned = XGB_tuned.fit(X_train, y_train).predict_proba(X_test)
xgb_tuned_pred = XGB_tuned.predict(X_test)

print('Recall Score = {}'.format(recall_score(y_test, xgb_tuned_pred)))
print(confusion_matrix(y_test, xgb_tuned_pred))
print(classification_report(y_test, xgb_tuned_pred))

Recall Score = 0.9641255605381166
[[460626     20]
 [    88   2365]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    460646
           1       0.99      0.96      0.98      2453

    accuracy                           1.00    463099
   macro avg       1.00      0.98      0.99    463099
weighted avg       1.00      1.00      1.00    463099

from sklearn.metrics import ConfusionMatrixDisplay
cm = confusion_matrix(y_test, xgb_tuned_pred, labels=XGB_tuned.classes_)

cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

disp = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=XGB_tuned.classes_)
disp.plot(cmap='viridis')

plt.title("Confusion Matrix - Normalized")
plt.show()

feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(fit_data.drop('is_fraud', axis = 1).columns, XGB_tuned.feature_importances_):
    feats[feature] = importance #add the name/value pair 

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Importance'})
importances = importances.sort_values(by='Importance', ascending=True)

importances = importances.reset_index()

# Create bar chart
plt.figure(figsize=(12, 6))
plt.barh(importances.tail(20)['index'][:20], importances.tail(20)['Importance'])

# Add labels and title
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.title('Feature Important')

# Show plot
plt.show()

Context¶

1. Import packages and checking data¶

1.1. Import packages and data¶

1.2. Checking data type and missing values¶

2. Exploratory Data Analysis¶

2.1. Will fraudulent transactions occur consecutively in a short period of time based on fraud tools?¶

2.2. Do fraudulent transactions occur at specific times of the year, month or day?¶

2.3. Check fraud rate according to categorical features¶

2.4. Create a few new features based on the data and check the distribution¶

3. Feature Engineering, Building and Evaluating Model¶

3.1. One-hot encoding and Dummy encoding¶

3.2. Split Data¶

3.3. Choosing Model¶

3.4. Tuning and Building Final Model¶

Conclusion:¶