BUSINESS BACKGROUND¶
Pizza Hut is the first international pizza brand in Vietnam, present since 2006. With dynamic positioning targeting young customers, it has been the market leader in terms of both revenue and store count, with 130 stores.
During 2019-2023, Covid significantly changed consumer behavior in the F&B industry. Customers now seek not only a quality product but also a richer culinary experience. To maintain its leading position, Pizza Hut has implemented many renovation campaigns to adapt to these changing patterns.
In 2023, Pizza Hut opened two signature stores with new design concepts to improve dine-in quality. It also helps customers reduce queuing time through the Bring Your Own Device (BYOD) feature for ordering at the table, which in turn encourages customers to engage with online ordering on the website or through the in-house app.
In terms of R&D, the brand regularly introduces product innovations such as 'Hot pot pizza', 'Crispy shrimp pancake pizza' or 'Silkworm pizza'. This unique fusion practice has helped a Western-origin dish become more familiar to Vietnamese tastes.
Pizza Hut's communication activities also spark vibrant discussion on social networks thanks to youthful language, a unique approach, and novel ideas, in a context where fast-food brands mostly talk about promotions.
However, the market is becoming increasingly competitive due to the emergence of new competitors and the diversity of culinary options. Beyond the renovation activities mentioned, additional insights from data, such as Customer Lifetime Value (CLV) and customer churn rate, are critical for assessing business growth.
The Business Analyst team is required to report to the Head of Department an accurate evaluation of the company, a forecast of its future performance, and feasible recommendations.
OUTLINE¶
- Import Data, EDA and RFM Segmentation
- Import Data
- EDA
- RFM Segmentation
- Churn and CLV Prediction
- Churn Risk Modeling
- CLV Model
- Final Model for CLV and Churn Probability
- Churn Probability and CLV Model for Cluster 1 and 3
- Churn Probability and CLV Model for Cluster 0 and 2
- Additional Insights
- Retention Rate
- New Customer by Month
- Recommendations
1. Import Data, EDA and RFM Segmentation¶
Import Data¶
import numpy as np
import pandas as pd
from scipy import stats
%matplotlib inline
import datetime as dt
from IPython.display import display, HTML

# Function to display dataframes side by side
def display_side_by_side(*args):
    html_str = ''
    for df in args:
        html_str += df.to_html() + '\xa0\xa0\xa0'
    display(HTML('<div style="display: flex;">' + html_str + '</div>'))

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import warnings
warnings.filterwarnings('ignore')
# Import Data
from config import DATA_PATH
data = pd.read_csv(DATA_PATH, parse_dates=["TransactionDate"])
# Normalize timestamps to midnight (keep the date component only)
data['TransactionDate'] = data['TransactionDate'].dt.normalize()
# Exclude data from July (the data only runs up to 1st July)
data = data[data['TransactionDate'] < '2023-07-01']
EDA¶
data.drop(columns=['Unnamed: 0'], inplace=True)
data.info() # there are no missing values
data[['TransactionDate', 'SalesAmount']].describe()
<class 'pandas.core.frame.DataFrame'>
Index: 1397202 entries, 0 to 1397201
Data columns (total 9 columns):
 #   Column           Non-Null Count    Dtype
---  ------           --------------    -----
 0   BillID           1397202 non-null  int64
 1   Channel          1397202 non-null  object
 2   OrderFrom        1397202 non-null  object
 3   TransactionDate  1397202 non-null  datetime64[ns]
 4   SalesAmount      1397202 non-null  float64
 5   CustomerID       1397202 non-null  int64
 6   CustomerGender   1397202 non-null  object
 7   VoucherStatus    1397202 non-null  object
 8   Province         1397202 non-null  object
dtypes: datetime64[ns](1), float64(1), int64(2), object(5)
memory usage: 106.6+ MB
| | TransactionDate | SalesAmount |
|---|---|---|
| count | 1397202 | 1397202.000 |
| mean | 2022-08-01 12:10:43.237555968 | 310330.403 |
| min | 2021-10-01 00:00:00 | -31605.000 |
| 25% | 2022-02-22 00:00:00 | 178089.000 |
| 50% | 2022-07-21 00:00:00 | 261690.000 |
| 75% | 2023-01-09 00:00:00 | 364380.750 |
| max | 2023-06-30 00:00:00 | 57731681.000 |
| std | NaN | 317534.538 |
The minimum value of SalesAmount is negative, so we need to inspect the negative records and their values to decide how to handle them.
data[data['SalesAmount'] <= 0].head()
| | BillID | Channel | OrderFrom | TransactionDate | SalesAmount | CustomerID | CustomerGender | VoucherStatus | Province |
|---|---|---|---|---|---|---|---|---|---|
| 1052712 | 1052712 | Take Away | STORE | 2023-01-12 | -31605.000 | 1996642 | Female | No | Nothern Provinces |
| 1212340 | 1212340 | Take Away | STORE | 2023-03-29 | -1104.000 | 768491 | Male | No | Hanoi |
display_side_by_side(data[data['CustomerID'] == 1996642][['CustomerID', 'SalesAmount']],
data[data['CustomerID'] == 768491][['CustomerID', 'SalesAmount']])
| | CustomerID | SalesAmount |
|---|---|---|
| 289260 | 1996642 | 255165.000 |
| 1048367 | 1996642 | 3821364.000 |
| 1052712 | 1996642 | -31605.000 |
| | CustomerID | SalesAmount |
|---|---|---|
| 14910 | 768491 | 480696.000 |
| 258382 | 768491 | 527982.000 |
| 359281 | 768491 | 395003.000 |
| 515940 | 768491 | 390227.000 |
| 658327 | 768491 | 450617.000 |
| 790788 | 768491 | 483823.000 |
| 1133172 | 768491 | 666475.000 |
| 1186516 | 768491 | 232791.000 |
| 1212096 | 768491 | 279389.000 |
| 1212340 | 768491 | -1104.000 |
| 1233398 | 768491 | 242667.000 |
Based on these customers' other transactions, the two negative records likely contain erroneous sales amounts: even ignoring the minus sign, the values are very low compared with each customer's normal SalesAmount. So we'll drop these two records.
data = data[data['SalesAmount'] > 0]
data.nunique() # There is no duplicate value in billID
BillID             1397200
Channel                  3
OrderFrom                4
TransactionDate        638
SalesAmount         529684
CustomerID          718050
CustomerGender           3
VoucherStatus            2
Province                 4
dtype: int64
import matplotlib.pyplot as plt
import seaborn as sns

group = ["CustomerGender", "VoucherStatus", "Province", "OrderFrom", "Channel"]
for x in group:
    plt.figure(figsize=(7, 4))

    # Plotting the average sales amount
    avg_sales = data.groupby(x)['SalesAmount'].mean()
    plt.subplot(1, 2, 1)
    sns.barplot(x=avg_sales.index, y=avg_sales.values, palette='viridis')
    plt.title('Average Sales Amount per ' + x)
    plt.xticks(rotation=90)

    # Plotting the total sales amount
    total_sales = data.groupby(x)['SalesAmount'].sum()
    plt.subplot(1, 2, 2)
    sns.barplot(x=total_sales.index, y=total_sales.values, palette='viridis')
    plt.title('Total Sales Amount per ' + x)
    plt.xticks(rotation=90)

    plt.tight_layout()
    plt.show()
- Certain customer segments are more likely to buy pizza than others, with a slight majority of male customers; however, nearly 60% of bills do not reveal the customer's gender.
- Two cities provide 63.3% of sales, with Hanoi accounting for the largest share, followed by Ho Chi Minh City.
- Most of the revenue in Hanoi comes from stores, while the website is the "golden source" for Ho Chi Minh City.
- Customers take advantage of vouchers through the app and website, but still mostly order from stores; store orders have the lowest average amount.
- Takeaway and delivery account for nearly 98% of orders, although "Dine In" and "Take away" show opposite tendencies.
- Customers tend to spend more when dining in.
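Claims like "two cities provide 63.3% of sales" come down to a share-of-total computation per group. A minimal, self-contained sketch on toy data (the helper name and numbers are illustrative, not from the real dataset):

```python
import pandas as pd

def sales_share_by(df: pd.DataFrame, col: str) -> pd.Series:
    """Each group's share of total SalesAmount, largest first."""
    share = df.groupby(col)['SalesAmount'].sum() / df['SalesAmount'].sum()
    return share.sort_values(ascending=False)

# Toy data for illustration only (hypothetical values)
toy = pd.DataFrame({
    'Province': ['Hanoi', 'Hanoi', 'Ho Chi Minh City', 'Other'],
    'SalesAmount': [400.0, 200.0, 300.0, 100.0],
})
print(sales_share_by(toy, 'Province'))
```

Applying the same helper to the real `data` with `col='Province'` would reproduce the shares discussed above.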
# Count the number of orders per customer
order_counts = data.groupby('CustomerID')['BillID'].count()
# Filter customers with only one order
customers_with_one_order = order_counts[order_counts == 1]
# Get the count of customers with only one order
number_of_customers_with_one_order = len(customers_with_one_order)
print("Number of customers with only one order:", number_of_customers_with_one_order)
Number of customers with only one order: 495255
from datetime import timedelta
# Find the final date in the 'TransactionDate' column
final_date = data['TransactionDate'].max()
# Calculate the cutoff date for the last 6 months from the final date
cutoff_date = final_date - timedelta(days=180)
# Count customers with at least one order placed before the last-6-months cutoff
# (note: this includes customers who may also have ordered after the cutoff)
customers_before_cutoff = data[data['TransactionDate'] < cutoff_date].groupby('CustomerID').size()
# Get the count of such customers
number_of_customers_before_cutoff = len(customers_before_cutoff)
print("Number of customers with at least one order before the 6-month cutoff:", number_of_customers_before_cutoff)
Number of customers with at least one order before the 6-month cutoff: 575736
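The filter above counts every customer with at least one order before the cutoff, including those who ordered again afterwards. A stricter definition of inactivity looks only at each customer's last order date. A minimal, self-contained sketch on toy data (helper name and values are hypothetical):

```python
import pandas as pd

def customers_inactive_since(df: pd.DataFrame, cutoff) -> int:
    """Count customers whose LAST order falls strictly before `cutoff`."""
    last_order = df.groupby('CustomerID')['TransactionDate'].max()
    return int((last_order < cutoff).sum())

# Toy data: customer 2 is the only one whose last order predates the cutoff
toy = pd.DataFrame({
    'CustomerID': [1, 1, 2, 3],
    'TransactionDate': pd.to_datetime(['2022-01-01', '2023-05-01',
                                       '2022-06-01', '2023-06-01']),
})
print(customers_inactive_since(toy, pd.Timestamp('2023-01-01')))  # → 1
```

Running this helper on the real `data` would give a count no larger than the figure above.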
# Sort values to calculate time since last order
data_sorted = data.sort_values(['CustomerID', 'TransactionDate'])
# Lag transaction time
data_sorted['lag'] = data_sorted.groupby('CustomerID')['TransactionDate'].shift(1)
# Convert timedelta to days
data_sorted['diff'] = (data_sorted['TransactionDate'] - data_sorted['lag'])/np.timedelta64(1, 'D')
# Calculate the average time between orders
average_time_between_orders = data_sorted['diff'].mean()
print("Average time between customer orders:", average_time_between_orders)
Average time between customer orders: 79.64693513951262
# Customers who made at least one order in 2023
active_customers_2023 = data[data['TransactionDate'].dt.year == 2023].groupby('CustomerID').size()
# Get the count of customers active in 2023
number_of_active_customers_2023 = len(active_customers_2023)
print("Number of customers who made an order in 2023:", number_of_active_customers_2023)
Number of customers who made an order in 2023: 240184
sns.histplot(data_sorted['diff'].dropna(), kde=True)  # distplot is deprecated in recent seaborn
plt.show()
- Here we get a glimpse of the distribution of gaps between purchases: the majority of customers have a gap of less than 100 days, with an average of about 80 days.
- The majority of Pizza Hut's customers are one-time customers (495,255 customers).
Other Conclusion (From analysis in PowerBI):¶
- Pizza Hut is a "hot spot" during weekends and holiday seasons: customers mostly buy on Saturday and Sunday, and daily sales during weekends and holidays are higher than on normal days.
- The number of customers choosing Dine In tends to increase, while the number choosing Takeaway decreases.
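The weekend pattern reported from PowerBI could be cross-checked in pandas by averaging daily sales per weekday. A self-contained sketch on toy data (helper name and values are hypothetical):

```python
import pandas as pd

def avg_daily_sales_by_weekday(df: pd.DataFrame) -> pd.Series:
    """Average total SalesAmount per calendar day, grouped by weekday name."""
    daily = df.groupby(df['TransactionDate'].dt.date)['SalesAmount'].sum()
    daily.index = pd.to_datetime(daily.index)
    return daily.groupby(daily.index.day_name()).mean()

# Toy data: 2023-06-03 is a Saturday, 2023-06-05 a Monday
toy = pd.DataFrame({
    'TransactionDate': pd.to_datetime(['2023-06-03', '2023-06-03', '2023-06-05']),
    'SalesAmount': [100.0, 300.0, 150.0],
})
print(avg_daily_sales_by_weekday(toy))
```

On the real `data`, Saturday and Sunday rows would be expected to come out highest if the PowerBI finding holds.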
RFM Segmentation¶
today = max(data['TransactionDate']) + timedelta(days=1)
# Calculate RFM values
RFM = data.groupby(['CustomerID']).agg({
'TransactionDate': lambda x: (today - x.max()).days,
'BillID': 'count',
'SalesAmount': 'sum'})
# Rename column
RFM.columns = ['Recency', 'Frequency', 'Monetary']
fig, ax = plt.subplots(3, 1, figsize=(8, 12))
# Plot Monetary
sns.kdeplot(data=RFM, x='Monetary', ax=ax[0], fill=True, palette="crest", linewidth=3)
ax[0].set_title('Monetary Distribution')
# Plot Frequency
sns.kdeplot(data=RFM, x='Frequency', ax=ax[1], fill=True, palette="crest", linewidth=3)
ax[1].set_title('Frequency Distribution')
# Plot Recency
sns.kdeplot(data=RFM, x='Recency', ax=ax[2], fill=True, palette="crest", linewidth=3)
ax[2].set_title('Recency Distribution')
plt.tight_layout()
plt.show()
Here the Monetary and Frequency distributions are highly right-skewed, so we will try some transformation techniques to reduce the skewness.
from numpy import sqrt, log10

# Define transformation methods
transformations = {
    'none': lambda x: x,
    'sqrt': sqrt,
    'log10': log10,
    'boxcox': lambda x: stats.boxcox(x)[0]
}

# Calculate and print skewness for each transformation
for name, transform in transformations.items():
    recency_transformed = transform(RFM['Recency'])
    recency_skew = stats.skew(recency_transformed)
    frequency_transformed = transform(RFM['Frequency'])
    frequency_skew = stats.skew(frequency_transformed)
    monetary_transformed = transform(RFM['Monetary'])
    monetary_skew = stats.skew(monetary_transformed)
    print(f"\n{name.capitalize()} transformation:")
    print(f"Skewness of Recency: {recency_skew}")
    print(f"Skewness of Frequency: {frequency_skew}")
    print(f"Skewness of Monetary: {monetary_skew}")
None transformation:
Skewness of Recency: 0.04356166369846459
Skewness of Frequency: 12.369499427471565
Skewness of Monetary: 18.954224583976302

Sqrt transformation:
Skewness of Recency: -0.49097751362347036
Skewness of Frequency: 3.4505122033956948
Skewness of Monetary: 2.83530947499132

Log10 transformation:
Skewness of Recency: -1.632735789608205
Skewness of Frequency: 1.8799794010067354
Skewness of Monetary: 0.6165928123199858

Boxcox transformation:
Skewness of Recency: -0.3102933245609721
Skewness of Frequency: 0.8771981678957986
Skewness of Monetary: -0.01735377665060057
Based on the result, we'll choose:
- None for Recency
- Box-Cox for Frequency and Monetary
# Preview Skewness of data
sns.displot(RFM['Recency'])
sns.displot(stats.boxcox(RFM['Frequency'])[0])
sns.displot(stats.boxcox(RFM['Monetary'])[0])
plt.show()
After the transformation, the Monetary distribution is quite symmetric, while Frequency remains highly right-skewed because the number of one-time customers is so large.
from sklearn.preprocessing import MinMaxScaler
# Box-cox Transformation
RFM['Frequency_boxcox'], _ = stats.boxcox(RFM['Frequency'])
RFM['Monetary_boxcox'], _ = stats.boxcox(RFM['Monetary'])
# Data for transformation
fit_RFM = RFM[['Recency', 'Frequency_boxcox', 'Monetary_boxcox']]
fit_RFM.columns = ['Recency', 'Frequency', 'Monetary']
# Scale Data
scaler = MinMaxScaler()
scaler.fit(fit_RFM)
transformed_RFM = scaler.transform(fit_RFM)
import logging
from sklearn.cluster import KMeans
# Suppress warnings
logging.getLogger('matplotlib.font_manager').disabled = True
from yellowbrick.cluster import KElbowVisualizer
Elbow_M = KElbowVisualizer(KMeans(), k=10, metric='distortion')
Elbow_M.fit(transformed_RFM)
Elbow_M.show()  # 4 clusters is the optimal number
<Axes: title={'center': 'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
from sklearn.metrics import silhouette_score
# Fitting KMeans Model
model = KMeans(n_clusters=4, init='k-means++', random_state=3, n_init=30)
model.fit(transformed_RFM)
labeled_RFM = pd.DataFrame(transformed_RFM, columns=['Recency', 'Frequency', 'Monetary'], index=fit_RFM.index)
labeled_RFM['Cluster'] = model.predict(labeled_RFM)
# Evaluate model
silhouette = silhouette_score(labeled_RFM[['Recency', 'Frequency', 'Monetary']], labeled_RFM['Cluster'], sample_size=40000)
print("Silhouette Coefficient:", silhouette) # The silhouette score is quite good
Silhouette Coefficient: 0.5688068708100638
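Beyond the elbow plot, the choice of k could also be sanity-checked by comparing silhouette scores across several candidate values. A minimal sketch on synthetic blobs (function name and data are illustrative, not from the notebook):

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def silhouette_by_k(X, k_values, random_state=3):
    """Fit KMeans for each k and return {k: silhouette score}."""
    scores = {}
    for k in k_values:
        labels = KMeans(n_clusters=k, n_init=10, random_state=random_state).fit_predict(X)
        scores[k] = silhouette_score(X, labels)
    return scores

# Toy data: two well-separated blobs, so k=2 should score highest
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 0.1, (50, 3)), rng.normal(5, 0.1, (50, 3))])
print(silhouette_by_k(X, [2, 3, 4]))
```

Run on `transformed_RFM`, the same comparison would show whether k=4 also maximizes the silhouette, not just the elbow criterion.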
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(labeled_RFM['Recency'],
labeled_RFM['Frequency'],
labeled_RFM['Monetary'],
c=labeled_RFM['Cluster'].astype(int),
cmap='viridis',
alpha=0.7)
ax.set_title('Clusters', fontsize=16, pad=20)
ax.set_xlabel('Recency', fontsize=14, labelpad=10)
ax.set_ylabel('Frequency', fontsize=14, labelpad=10)
ax.set_zlabel('Monetary', fontsize=14, labelpad=10)
ax.view_init(elev=20, azim=120)
cbar = plt.colorbar(scatter, shrink=0.5)
cbar.set_label('Cluster Number', rotation=270, labelpad=15)
ax.tick_params(axis='both', which='major', labelsize=12)
plt.show()
From this angle it is quite hard to see the difference between clusters in terms of Monetary, but we can clearly see the four clusters separated by differences in Recency and Frequency.
# Create new df
transformed_RFM_df = pd.DataFrame(transformed_RFM,
index=RFM.index,
columns=RFM[['Recency', 'Frequency', 'Monetary']].columns)
transformed_RFM_df['Cluster'] = labeled_RFM.Cluster
# Create long table
RFM_melt = pd.melt(transformed_RFM_df.reset_index(),
id_vars=['CustomerID', 'Cluster'],
value_vars=['Recency', 'Frequency', 'Monetary'],
var_name='Attribute',
value_name='Value')
plt.title("Snake plot")
sns.lineplot(x='Attribute', y='Value', hue='Cluster', data=RFM_melt)
plt.show()
The snake plot is distorted in the Monetary section: applying MinMaxScaler to this column, combined with a few outliers with very large total purchase values, has compressed most customers' scaled values to near zero.
To check whether the clustering is effective, we will use the original Monetary values instead.
# Assign Cluster number
RFM['Cluster'] = labeled_RFM.Cluster
# Calculate Average RFM Values of each Cluster
print(RFM.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']].mean())
         Recency  Frequency     Monetary
Cluster
0        162.068      1.000   307799.016
1         97.538      4.851  1565071.450
2        486.570      1.000   289320.191
3        394.498      2.704   816652.064
We can see clear differences between the four groups across all three RFM values. Specifically, Cluster 0 consists of recent customers who have purchased only once, with an average Monetary value.
Cluster 1 is the cluster with all three scores at the best level.
Cluster 2 has the lowest three scores among the four groups and a very high likelihood of churn.
Cluster 3 is a group with high purchase frequency and value who have not returned for a long time; there is a high risk of losing this group to competitors.
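For reporting, the numeric cluster IDs can be mapped to descriptive segment names reflecting the profiles above. The labels below are hypothetical suggestions, not part of the original analysis:

```python
import pandas as pd

# Hypothetical segment labels based on the cluster profiles described above
SEGMENT_LABELS = {
    0: 'Recent one-timers',
    1: 'Champions',
    2: 'Lost one-timers',
    3: 'At-risk loyalists',
}

def label_clusters(rfm: pd.DataFrame, mapping=SEGMENT_LABELS) -> pd.DataFrame:
    """Return a copy of the RFM table with a human-readable Segment column."""
    out = rfm.copy()
    out['Segment'] = out['Cluster'].map(mapping)
    return out

# Toy usage
toy = pd.DataFrame({'Cluster': [0, 1, 2, 3]})
print(label_clusters(toy)['Segment'].tolist())
```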
# Building Relative Importance Heatmap
cluster_avg = RFM.groupby(['Cluster']).agg({'Recency':'mean',
'Frequency':'mean',
'Monetary':'mean'})
population_avg = RFM[['Recency','Frequency','Monetary']].mean()
relative_imp = cluster_avg/population_avg - 1
plt.figure(figsize=(8,2))
plt.title('Relative Importance of Attributes')
sns.heatmap(data=relative_imp, annot=True, fmt='.2f', cmap='RdYlGn')
plt.show()
The farther the values are from 0, the more distinct the groups. All clusters have values quite far from 0, showing that KMeans has divided the data into four well-separated groups.
2. Churn and CLV Prediction¶
In this section, we will use the Beta-Geometric/Negative Binomial Distribution (BG/NBD) model to calculate customer churn probability and predict each customer's expected purchase frequency, and the Gamma-Gamma model to predict the average Monetary value. We'll then predict customer CLV based on these two models.
Because these models only apply to customers with repeat purchases, we will fit and evaluate them on Clusters 1 and 3 only.
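Conceptually, the two models combine multiplicatively. In the standard notation (x = repeat-purchase count, t_x = recency, T = customer age, m_x = average observed order value):

```latex
P(\text{churn}) \;=\; 1 - P(\text{alive} \mid x,\, t_x,\, T)
\qquad
\mathrm{CLV}(t) \;=\;
\underbrace{E\!\left[X(t) \mid x,\, t_x,\, T\right]}_{\text{BG/NBD: expected purchases}}
\;\times\;
\underbrace{E\!\left[M \mid m_x,\, x\right]}_{\text{Gamma-Gamma: expected order value}}
```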
A. Churn Risk Modeling¶
# Import Model
from lifetimes import BetaGeoFitter, GammaGammaFitter
from lifetimes.utils import summary_data_from_transaction_data, calibration_and_holdout_data
from lifetimes.plotting import (plot_period_transactions,
                                plot_cumulative_transactions,
                                plot_incremental_transactions,
                                plot_calibration_purchases_vs_holdout_purchases)
# Create data and auto-calculate RFM values (the BG/NBD model defines recency and
# frequency differently from our RFM calculation above)
churn_RFM = summary_data_from_transaction_data(data.reset_index(drop=True), 'CustomerID', 'TransactionDate', 'SalesAmount')
bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(churn_RFM['frequency'], churn_RFM['recency'], churn_RFM['T'])
t = 180 # Forecast for the next 180 days
churn_RFM['expected_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(t, churn_RFM['frequency'], churn_RFM['recency'], churn_RFM['T'])
# Alive Probability of Customers
churn_RFM['Retention'] = bgf.conditional_probability_alive(churn_RFM['frequency'], churn_RFM['recency'], churn_RFM['T'])
churn_RFM['Churn'] = 1 - churn_RFM['Retention']
Next we want to evaluate how well the model performs on future data. We'll split the data into a training (calibration) period and a holdout (observation) period, train the BG/NBD model, and evaluate performance with four plots:
- Calibration period histogram: does the model fit the training data?
- Cumulative transaction plot: does the model predict cumulative sales well?
- Incremental transaction plot: does the model capture the overall trend in transactions?
- Conditional expectations plot: can the model predict the number of purchases a customer will make based on the training data?
Note: except for graph 1, in this section we will only consider the model's performance on data from customers in Cluster 1 and 3, two groups of customers with repeated purchases.
1. Calibration Period Histogram¶
plot_period_transactions(bgf).set_yscale('log')
As we can see, the model predicts the number of customers per repeat-transaction count quite accurately; the gap grows at 5 and 6 repeat purchases because there is little data for those customers.
2. Cumulative Transaction Plot¶
# Split Calibration and Holdout data
summary_cal_holdout = calibration_and_holdout_data(data.reset_index(drop=True), 'CustomerID', 'TransactionDate',
calibration_period_end='2023-01-01',
observation_period_end='2023-06-30',
monetary_value_col='SalesAmount')
summary_cal_holdout = summary_cal_holdout[summary_cal_holdout['frequency_cal'] > 0]
# Add Frequency column to data
data.index = data['CustomerID']
data['Frequency'] = RFM.Frequency
# Plot Cumulative transaction of Model vs Actual data
bgf.fit(summary_cal_holdout['frequency_cal'], summary_cal_holdout['recency_cal'], summary_cal_holdout['T_cal'])
plot_cumulative_transactions(bgf, data[data['Frequency'] > 1].reset_index(drop=True), 'TransactionDate', 'CustomerID', 637, 456)
<Axes: title={'center': 'Tracking Cumulative Transactions'}, xlabel='day', ylabel='Cumulative Transactions'>
The vertical line marks the boundary between the calibration period (left) and the holdout period (right). The model tracks the actual cumulative transaction count closely over time, with no significant gap between the two lines.
3. Incremental Transaction Plot¶
plot_incremental_transactions(bgf, data[data['Frequency'] > 1].reset_index(drop=True), 'TransactionDate', 'CustomerID', 637, 456)
<Axes: title={'center': 'Tracking Daily Transactions'}, xlabel='day', ylabel='Transactions'>
The model captures the general trend in daily purchases: the predicted line trends down and follows the recent slight downward movement in the actual data.
4. Conditional Expectations Plot¶
plot_calibration_purchases_vs_holdout_purchases(bgf, summary_cal_holdout)
<Axes: title={'center': 'Actual Purchases in Holdout Period vs Predicted Purchases'}, xlabel='Purchases in calibration period', ylabel='Average of Purchases in Holdout Period'>
The distance between the two lines tends to decrease, showing that the model predicts the number of purchases quite well.
from sklearn.metrics import mean_absolute_error
predicted_purchases = bgf.conditional_expected_number_of_purchases_up_to_time(t,
summary_cal_holdout['frequency_cal'],
summary_cal_holdout['recency_cal'],
summary_cal_holdout['T_cal'])
actual_purchases = summary_cal_holdout['frequency_holdout']
mae_frequency = mean_absolute_error(actual_purchases, predicted_purchases)
print(f'Mean Absolute Error: {mae_frequency}')
Mean Absolute Error: 0.8605611140417139
The MAE for frequency is below 1, which is very good given that Clusters 1 and 3 have very high average purchase frequency.
5. Alive Probability¶
- The BG/NBD model assumes that "death" can only occur after a repeat purchase: churn happens at the moment of a purchase, and the first purchase is reserved to signal a customer's "birth".
- Because of this, customers with only one transaction get a 100% probability of being alive, which is suspect. To account for this limitation, we'll only predict churn risk for customers who have made at least one repeat transaction, i.e. customers in Clusters 1 and 3.
# Keep only repeat customers (in lifetimes, 'frequency' counts repeat purchases)
repeat_customers = churn_RFM[churn_RFM['frequency'] > 0].copy()
repeat_customers['Cluster'] = labeled_RFM.Cluster
# Plot alive probability
repeat_customers['prob_alive'] = bgf.conditional_probability_alive(repeat_customers['frequency'], repeat_customers['recency'], repeat_customers['T'])
sns.histplot(repeat_customers['prob_alive'], kde=True)  # distplot is deprecated in recent seaborn
plt.show()
print(repeat_customers.groupby("Cluster")["prob_alive"].mean())
Cluster
1   0.608
3   0.101
Name: prob_alive, dtype: float64
We can see that there is a clear difference in average prob_alive between the two Clusters, with Cluster 1 having much higher prob_alive than Cluster 3.
B. CLV Model¶
The assumption that the transaction frequency (how often a customer purchases) is independent of the monetary value (how much they spend per transaction) is central to the Gamma-Gamma model. To test this assumption we'll use Pearson Correlation.
repeat_customers[['frequency', 'monetary_value']].corr()
| | frequency | monetary_value |
|---|---|---|
| frequency | 1.000 | -0.012 |
| monetary_value | -0.012 | 1.000 |
- The correlation of -0.012 is very low, so we can use the model.
- Besides the independence assumption, the model also assumes that monetary values are gamma-distributed. To check this, we'll plot the histogram of the Box-Cox-transformed monetary value with the fitted gamma PDF overlaid.
from scipy.stats import gamma
repeat_customers['monetary_value_boxcox'], _ = stats.boxcox(repeat_customers['monetary_value'])
# Fit a gamma distribution to monetary values
ag, loc, scale = gamma.fit(repeat_customers['monetary_value_boxcox'])
# Generate values from fitted gamma distribution
gamma_fitted = gamma.pdf(np.linspace(min(repeat_customers['monetary_value_boxcox']), max(repeat_customers['monetary_value_boxcox']), 100), ag, loc, scale)
# Plot histogram and fitted gamma PDF
plt.figure()
plt.hist(repeat_customers['monetary_value_boxcox'], bins=30, density=True, alpha=0.75, label='Observed data')
plt.plot(np.linspace(min(repeat_customers['monetary_value_boxcox']), max(repeat_customers['monetary_value_boxcox']), 100), gamma_fitted, 'r-', label='Gamma fit')
plt.xlabel('Monetary Value')
plt.ylabel('Density')
plt.title('Fit of Gamma Distribution')
plt.legend()
plt.show()
The fit of the Gamma distribution to the data seems quite good, meaning that we can proceed with this model to predict CLV of customers.
ggf = GammaGammaFitter(penalizer_coef=0.0)
summary_cal_holdout['monetary_value_cal_boxcox'], mon_val_lmbda = stats.boxcox(summary_cal_holdout['monetary_value_cal'])
ggf.fit(summary_cal_holdout['frequency_cal'], summary_cal_holdout['monetary_value_cal_boxcox'])
<lifetimes.GammaGammaFitter: fitted with 166916 subjects, p: 525.60, q: 540.76, v: 16.96>
# Predict the expected number of transactions in the next 180 days since 2023-01-01
days = (dt.datetime(2023, 6, 30) - dt.datetime(2023, 1, 1)).days
predicted_bgf = bgf.predict(days,
summary_cal_holdout['frequency_cal'],
summary_cal_holdout['recency_cal'],
summary_cal_holdout['T_cal'])
# Predict the average order value
monetary_pred_boxcox = ggf.conditional_expected_average_profit(
summary_cal_holdout['frequency_cal'],
summary_cal_holdout['monetary_value_cal_boxcox'])
# Inverse Box-Cox transformation to map predictions back to the original scale
def inverse_boxcox(y, lambda_):
    if lambda_ == 0:
        return np.exp(y)
    else:
        return (y * lambda_ + 1) ** (1 / lambda_)
monetary_pred = inverse_boxcox(monetary_pred_boxcox, mon_val_lmbda)
# Calculate the predicted sales
sales_pred = predicted_bgf * monetary_pred
# Actual values calculation
actual = summary_cal_holdout['monetary_value_holdout'] * summary_cal_holdout['frequency_holdout']
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
# Calculate the mean of the training set
train_actual = summary_cal_holdout['monetary_value_cal'] * summary_cal_holdout['frequency_cal']
median_sales = train_actual.median()
def evaluate(actual, sales_prediction, median_sales):
    actual_log = np.log1p(actual)                # Log-transform the actual values
    sales_pred_log = np.log1p(sales_prediction)  # Log-transform the predictions

    print(f"Total Sales Actual: {np.round(actual.sum())}")
    print(f"Total Sales Predicted: {np.round(sales_prediction.sum())}")

    # Calculate and print the percentage difference
    percentage_difference = ((sales_prediction.sum() - actual.sum()) / actual.sum()) * 100
    print(f"Percentage Difference between Actual Sales and Predicted Sales: {percentage_difference:.2f}%")
    print(f"Individual Mean Absolute Error: {mean_absolute_error(actual, sales_prediction)}")

    # Naive benchmark: predict the training median for every customer
    naive_predictions = np.full(shape=actual.shape, fill_value=median_sales)
    naive_mae = mean_absolute_error(actual, naive_predictions)
    print(f"Naive Mean Absolute Error: {naive_mae}")

    plt.scatter(sales_pred_log, actual_log, alpha=0.5)
    plt.xlabel('Log of Prediction')
    plt.ylabel('Log of Actual')
    plt.title('Log-Log Plot of Actual vs. Prediction')
    plt.show()
# Apply the function with actual and predicted values
evaluate(actual, sales_pred, median_sales)
Total Sales Actual: 47080105051.0
Total Sales Predicted: 45781350436.0
Percentage Difference between Actual Sales and Predicted Sales: -2.76%
Individual Mean Absolute Error: 291539.45987536316
Naive Mean Absolute Error: 469809.1762456828
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
# Prepare the features and target
X = summary_cal_holdout[['frequency_cal', 'recency_cal', 'T_cal', 'monetary_value_cal']]
y = summary_cal_holdout['monetary_value_holdout'] * summary_cal_holdout['frequency_holdout']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the XGBRegressor
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
# Fit the model
xgb.fit(X_train, y_train)
# Make predictions
xgb_predictions = xgb.predict(X_test)
# Calculate the MAE of the XGBRegressor
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
print(f"XGBRegressor Mean Absolute Error: {xgb_mae}")
XGBRegressor Mean Absolute Error: 302864.0472673454
- The MAE of the Gamma-Gamma model is lower than both benchmarks, XGBoost and the naive prediction (we use the median rather than the mean because the raw Monetary data is skewed; the mean-based benchmark has an MAE above 700,000), indicating that the model is suitable for predicting CLV for these two clusters.
- Besides that, total actual and predicted sales are very close, off by only 2.8%. Many customers have zero actual sales but a positive predicted value; we can interpret this as the holdout period not being long enough for those customers to make their next transaction.
3. Final Model for CLV and Churn Probability¶
Here, we will calculate each customer's churn probability and their CLV over the next 6 months, for each cluster.
Churn Probability and CLV Model for Cluster 1 and 3¶
# Data for fitting model
temp = summary_data_from_transaction_data(data.reset_index(drop=True), 'CustomerID', 'TransactionDate', monetary_value_col='SalesAmount')
cluster_1_3 = temp.loc[temp.frequency > 0, :].copy()
# BG/NBD - Churn Probability
bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(cluster_1_3['frequency'], cluster_1_3['recency'], cluster_1_3['T'])
cluster_1_3['Churn'] = 1 - bgf.conditional_probability_alive(cluster_1_3['frequency'], cluster_1_3['recency'], cluster_1_3['T'])
# Gamma-Gamma
ggf = GammaGammaFitter(penalizer_coef=0.0)
cluster_1_3['monetary_value_boxcox'], monetary_lmbda = stats.boxcox(cluster_1_3['monetary_value'])
# Fitting ggf model
ggf.fit(cluster_1_3['frequency'],
cluster_1_3['monetary_value_boxcox'])
# Avg Monetary predict
expected_avg_profit = ggf.conditional_expected_average_profit(
cluster_1_3['frequency'],
cluster_1_3['monetary_value_boxcox']
)
expected_avg_profit_true = inverse_boxcox(expected_avg_profit, monetary_lmbda)
# Avg frequency predict
expected_transactions = bgf.predict(
180, # 6 months
cluster_1_3['frequency'],
cluster_1_3['recency'],
cluster_1_3['T']
)
cluster_1_3['clv'] = expected_transactions * expected_avg_profit_true
cluster_1_3['Cluster'] = labeled_RFM.Cluster
print(cluster_1_3.groupby("Cluster")["Churn"].mean())
print(cluster_1_3.groupby("Cluster")["clv"].mean())
Cluster
1    0.364
3    0.862
Name: Churn, dtype: float64
Cluster
1    355426.359
3     38039.774
Name: clv, dtype: float64
Here, the difference between the two Clusters is clear: Cluster 1 has a much higher average CLV and a much lower average churn probability than Cluster 3, showing that customers in Cluster 1 are very valuable.
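Under the BG/NBD model, `conditional_probability_alive` has a closed form, so Churn = 1 − P(alive) can be computed directly. A minimal numpy sketch with made-up parameters `r, alpha, a, b` (the notebook fits these with `BetaGeoFitter` instead) shows the expected behavior — a recently active customer has a lower churn probability than one silent for months:

```python
import numpy as np

def p_alive(x, t_x, T, r, alpha, a, b):
    """Closed-form BG/NBD P(customer still 'alive' | frequency x, recency t_x, age T)."""
    x, t_x, T = np.asarray(x, float), np.asarray(t_x, float), np.asarray(T, float)
    ratio = (a / (b + np.maximum(x, 1) - 1)) * ((alpha + T) / (alpha + t_x)) ** (r + x)
    out = 1.0 / (1.0 + ratio)
    return np.where(x == 0, 1.0, out)  # customers with zero repeat purchases are assumed alive

# Made-up illustrative parameters, NOT the fitted values from the notebook
r, alpha, a, b = 0.25, 4.0, 0.8, 2.5

# Same purchase count, different recency: the recent buyer is far more likely alive
recent = p_alive(x=3, t_x=170, T=180, r=r, alpha=alpha, a=a, b=b)
stale = p_alive(x=3, t_x=30, T=180, r=r, alpha=alpha, a=a, b=b)
print("churn (recent buyer):", 1 - recent)
print("churn (stale buyer): ", 1 - stale)
```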
Churn Probability and CLV Model for Cluster 0 and 2¶
data_sorted.index = data_sorted['CustomerID']
data_sorted['Cluster'] = RFM.Cluster
average_diff = data_sorted[(data_sorted['Cluster'] == 1) | (data_sorted['Cluster'] == 3)].reset_index(drop=True).groupby('CustomerID')['diff'].mean()
# average_diff is a Series of each customer's mean number of days between consecutive orders
count, bins_count = np.histogram(average_diff, bins=20)
# using numpy np.cumsum to calculate the CDF
pdf = count / sum(count)
cdf = np.cumsum(pdf)
# plotting PDF and CDF
plt.plot(bins_count[1:], cdf, label="CDF")
plt.xlabel("Days")
plt.ylabel("Percentage of Customers Buying Again")
plt.legend()
Following a heuristic popularized on Hackernoon, we use the CDF (Cumulative Distribution Function) of inter-purchase gaps to choose the threshold: the number of days since the last order beyond which a customer is considered churned.
A good rule of thumb is to find the interval where the curve has an inflection point --> we choose a threshold at the 0.93 quantile (93% of customers).
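The threshold logic can be sketched on synthetic inter-purchase gaps (assumed exponential here purely for illustration): take the 0.93 quantile of average gaps as the churn cutoff, then scale each customer's recency against that cutoff and cap the ratio at 1.

```python
import numpy as np

# Synthetic average days between orders per customer (NOT the real distribution)
rng = np.random.default_rng(42)
avg_gap_days = rng.exponential(scale=45, size=5000)

# 0.93 quantile = "days of silence" after which we call a customer churned
churn_threshold = np.quantile(avg_gap_days, 0.93)

# Heuristic churn probability: recency relative to the threshold, capped at 1
recency = np.array([10.0, 60.0, 400.0])
churn = np.minimum(recency / churn_threshold, 1.0)
print("threshold (days):", churn_threshold)
print("churn probabilities:", churn)
```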
#Filter for cluster 0 and 2
cluster_0_2 = RFM[(RFM['Cluster'] == 0) | (RFM['Cluster'] == 2)].copy()  # .copy() avoids SettingWithCopyWarning
#Define threshold for defining churn
churn_threshold = average_diff.reset_index()['diff'].quantile(0.93)
#Probability of churn
cluster_0_2['Churn_1'] = cluster_0_2['Recency']/churn_threshold
cluster_0_2['Churn'] = np.where(cluster_0_2['Churn_1']>1, 1, cluster_0_2['Churn_1'])
cluster_0_2.groupby('Cluster')['Churn'].mean()
Cluster
0    0.517
2    1.000
Name: Churn, dtype: float64
Cluster 2 has a churn probability of 100%, meaning that no customer in this group has made a purchase in the past year. Note, however, that this method is only a heuristic approximation, not a fully reliable estimate.
Because Cluster 2 has a churn probability of 100%, we will assume the CLV of this Cluster in the next 6 months is 0.
For Cluster 0, we will use XGBRegressor to predict the CLV of this Cluster for the next 6 months.
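The supervised setup below hinges on a temporal split: features (RFM) are built only from transactions up to a cutoff date, and the target is each customer's total spend in the following 180 days, so there is no target leakage. A toy sketch (using column names matching the notebook's, on invented rows):

```python
import pandas as pd

# Tiny invented transaction log (illustrative only)
tx = pd.DataFrame({
    'CustomerID': [1, 1, 1, 2, 2, 3],
    'TransactionDate': pd.to_datetime(
        ['2022-01-05', '2022-06-01', '2023-03-01', '2022-02-10', '2023-04-15', '2022-05-20']),
    'SalesAmount': [100.0, 150.0, 80.0, 200.0, 60.0, 90.0],
})

# Split at 180 days before the last observed transaction
cutoff = tx['TransactionDate'].max() - pd.to_timedelta(180, unit='d')
in_df = tx[tx['TransactionDate'] <= cutoff]
out_df = tx[tx['TransactionDate'] > cutoff]

# Target: total spend per customer in the 180 days after the cutoff (0 if silent)
target = out_df.groupby('CustomerID')['SalesAmount'].sum()

# Features: RFM computed strictly from the pre-cutoff window
snapshot = in_df['TransactionDate'].max() + pd.Timedelta(days=1)
features = in_df.groupby('CustomerID').agg(
    Recency=('TransactionDate', lambda s: (snapshot - s.max()).days),
    Frequency=('TransactionDate', 'count'),
    Monetary=('SalesAmount', 'sum'),
).join(target.rename('spend_180_total')).fillna(0)
print(features)
```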
# Calculate cutoff date for target variable
n_days = 180
max_date = data['TransactionDate'].max()
cutoff = max_date - pd.to_timedelta(n_days, unit='d')
# Split data
temporal_in_df = data[data['TransactionDate'] <= cutoff]
temporal_out_df = data[data['TransactionDate'] > cutoff]
# Making target data
targets_df = temporal_out_df[['CustomerID', 'SalesAmount']].reset_index(drop=True).\
groupby('CustomerID').sum().rename({'SalesAmount':'spend_180_total'}, axis=1).assign(spend_180_flag=1)
# RFM
max_day = temporal_in_df['TransactionDate'].max() + timedelta(days=1)
# Calculate RFM values
features_df = temporal_in_df.reset_index(drop=True).groupby(['CustomerID']).agg({
'TransactionDate': lambda x: (max_day - x.max())/pd.to_timedelta(1,"day"),
'BillID': 'count',
'SalesAmount': 'sum'}).merge(targets_df, left_index=True, right_index=True, how='left').fillna(0)
features_df['Cluster'] = RFM.Cluster
# Rename column
features_df.rename(columns={'TransactionDate': 'Recency', 'BillID': 'Frequency', 'SalesAmount': 'Monetary'}, inplace=True)
# MACHINE LEARNING FOR REGRESSION
X = features_df[['Recency', 'Frequency', 'Monetary']]
y_spend = features_df['spend_180_total']
##Train and test split
from sklearn.model_selection import KFold, cross_val_score, train_test_split
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y_spend, test_size=0.2, random_state=3)
#Create XGB Regression model and Tuning Model
xgb_reg = XGBRegressor(n_estimators=100, learning_rate=0.05, random_state=2, n_jobs=12)
kf = KFold(n_splits=5, random_state=1, shuffle=True)
cv_scores = cross_val_score(xgb_reg, X_train_reg, y_train_reg, cv=kf, scoring='neg_mean_absolute_error')
# Fit the model
xgb_reg.fit(X_train_reg, y_train_reg)
# Print the cross-validation scores
print('Cross-validation scores:', cv_scores)
#Predict X
predictions_reg = xgb_reg.predict(X_test_reg)
predictions_reg = np.where(predictions_reg < 0, 0, predictions_reg)
#Mean Absolute Error
print('Mean Absolute Error of XGB Regression Model: {}'.format(mean_absolute_error(y_test_reg, predictions_reg)))
Cross-validation scores: [-141070.05543361 -141681.34271143 -143925.69511609 -139769.06362863 -144515.03960475]
Mean Absolute Error of XGB Regression Model: 141027.53955605143
import optuna
from optuna.samplers import TPESampler
def objective(trial):
params = {
"learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0),
"n_estimators": trial.suggest_int("n_estimators", 100, 1000),
"max_depth": trial.suggest_int("max_depth", 1, 15),
"min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
"gamma": trial.suggest_float("gamma", 0.0, 1.0),
"colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
"subsample": trial.suggest_float("subsample", 0.1, 1.0),
"reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
"reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
"scale_pos_weight": trial.suggest_float("scale_pos_weight", 1e-6, 500.0, log=True)
}
model = XGBRegressor(**params, n_jobs=12)
model.fit(X_train_reg, y_train_reg)
# Use the validation set for evaluation
preds = model.predict(X_test_reg)
mae = mean_absolute_error(y_test_reg, preds)
return -mae # return negative MAE for maximization
optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = TPESampler(seed=1)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=100)
best_params = study.best_params
best_score = study.best_value
print(f"Best parameters: {best_params}")
print(f"Best negative MAE score: {best_score}")
Best parameters: {'learning_rate': 0.8044705559265699, 'n_estimators': 343, 'max_depth': 1, 'min_child_weight': 8, 'gamma': 0.5212677432318678, 'colsample_bytree': 0.46482910817480444, 'subsample': 0.10165275993209091, 'reg_alpha': 1.412096705991424e-06, 'reg_lambda': 0.8675148598594582, 'scale_pos_weight': 3.3352054545636855e-05}
Best negative MAE score: -140164.43003257317
# FINAL MODEL AND CLUSTER 0 CLV
# Fit model
XGB_reg_tuned = XGBRegressor(reg_lambda=16, min_child_weight=12, max_depth=10, learning_rate=0.25, random_state=3)  # manually chosen hyperparameters; the Optuna result could be reused instead via XGBRegressor(**study.best_params)
XGB_reg_tuned.fit(X, y_spend)
# Predict
clv_predict = XGB_reg_tuned.predict(RFM[RFM['Cluster'] == 0][['Recency', 'Frequency', 'Monetary']])
clv_predict_tune = np.where(clv_predict < 0, 0, clv_predict) # Setting CLV = 0 for negative values
result_df = pd.concat([pd.DataFrame(clv_predict_tune).set_axis(['clv'], axis=1),
RFM[RFM['Cluster'] == 0][['Recency', 'Frequency', 'Monetary']].reset_index()],
axis=1)
result_df.index = result_df['CustomerID']
result_df['Cluster'] = RFM.Cluster
cluster_0_clv = result_df.groupby('Cluster')['clv'].mean()
cluster_0_clv.head()
Cluster
0    50440.219
Name: clv, dtype: float32
# Concat 2 df
cluster_0_2 = cluster_0_2.merge(result_df, left_index=True, right_index=True, how='left', suffixes=('', '_x'))
cluster_0_2 = cluster_0_2[['Frequency', 'Recency', 'Monetary', 'Churn', 'clv', 'Cluster']].fillna(0)
cluster_1_3.rename(columns={'frequency': 'Frequency', 'recency': 'Recency', 'monetary_value': 'Monetary'}, inplace=True)
cluster_1_3 = cluster_1_3.drop(['T'], axis=1)
final_clv = pd.concat([cluster_0_2, cluster_1_3], axis=0)
final_clv.sort_index(ascending=True, inplace=True)
print("Total CLV of each Cluster: ", final_clv.groupby('Cluster')['clv'].sum()) # Total CLV
print("Average CLV of each Cluster: ", final_clv.groupby('Cluster')['clv'].mean()) # Average CLV
print("Churn Probability of each Cluster: ", final_clv.groupby('Cluster')['Churn'].mean()) # Average Churn Probability
Total CLV of each Cluster:  Cluster
0    10771205926.307
1    49285196011.201
2              0.000
3     3115913941.070
Name: clv, dtype: float64
Average CLV of each Cluster:  Cluster
0     50440.218
1    355426.359
2         0.000
3     38039.774
Name: clv, dtype: float64
Churn Probability of each Cluster:  Cluster
0    0.517
1    0.364
2    1.000
3    0.862
Name: Churn, dtype: float64
4. Additional Insights¶
In this section, we will look at the company's monthly retention rate and its number of new customers over time.
Retention Rate¶
# Get the first month of each Customer
data = data.reset_index(drop=True)
def get_month(x):
return dt.datetime(x.year, x.month, 1)
data['transaction_month'] = data['TransactionDate'].apply(get_month)
first_month = data.groupby("CustomerID").agg({'transaction_month': 'min'}).reset_index().rename(columns={'transaction_month': 'first_month'})
cohort = data.merge(first_month, how='left', on='CustomerID')
# Calculate the months since Customer's first transaction month
def month_diff(a, b):
return 12*(a.dt.year - b.dt.year) + (a.dt.month - b.dt.month)
cohort['diff'] = month_diff(cohort.transaction_month, cohort.first_month)
cohort_pivot = cohort.groupby(['first_month', 'diff'])['CustomerID'].nunique().reset_index()\
.pivot_table(values="CustomerID", index="first_month", columns="diff")
# Draw the heatmap of MoM retention rate
mom_retention_rate = cohort_pivot.divide(cohort_pivot.iloc[:, 0], axis=0).round(3) * 100
plt.figure(figsize=(18,14))
plt.title("MoM Retention Rate for Customer Transaction Data")
ax = sns.heatmap(data=mom_retention_rate, annot=True, vmin=0, vmax=10, cmap='crest', fmt=".1f")
ax.set_yticklabels(mom_retention_rate.index.strftime('%Y-%m-%d'))
plt.xlabel('Months since first Order')
plt.ylabel('Month')
plt.show()
- We can see that the company's retention rate generally decays within each cohort over time. For the 2021-10 cohort, 14.5% of new customers purchased again in the month after their first order, but this figure fell to 6.8% by month 20.
- First-month retention across cohorts also declined: from 14.5% for the 2021-10 cohort to 7% for the 2023-05 cohort.
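The heatmap arithmetic above reduces to dividing each cohort row by its month-0 size. A toy sketch with hypothetical cohort counts (chosen only to echo the 14.5% figure, not taken from the data):

```python
import pandas as pd

# Hypothetical unique-customer counts: rows = acquisition cohort, columns = months since first order
counts = pd.DataFrame(
    {0: [1000, 800], 1: [145, 96], 2: [90, 60]},
    index=pd.to_datetime(['2021-10-01', '2021-11-01']),
)
counts.index.name = 'first_month'
counts.columns.name = 'months_since_first'

# Retention = cohort size in month n divided by cohort size in month 0, as a percentage
retention = (counts.divide(counts.iloc[:, 0], axis=0) * 100).round(1)
print(retention)
```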
New Customer by Month¶
#Draw the number of new customer by month overtime
plt.figure(figsize=(12, 6))
sns.lineplot(cohort.groupby('first_month')['CustomerID'].nunique())
plt.xlabel('Month')
plt.ylabel('Number of Customer')
plt.show()
We can see that Pizza Hut's number of new customers decreased sharply between 2021-10 and 2023-06, from nearly 70,000 new customers in October 2021 to just over 20,000 in June 2023.
RECOMMENDATIONS¶
Cluster 0¶
BEHAVIOR:
- Nearly 60% of customers use the “Take away” channel.
- 57.6% of them order at the Store and 22.71% via the Website.
- Most of the people who order in Hanoi, Northern & Southern Provinces use “Take away” while people who order in Ho Chi Minh City choose “Delivery”.
RECOMMENDATION:
- Increase retention through loyalty programs, discounts, and promotions to encourage customer purchases.
- Focus on optimizing the in-store customer experience, as well as the online ordering process and website interface.
- Ask for feedback or reviews after the first purchase and use those insights to improve products and services.
- Use geo-targeting or location-based marketing to send them timely and relevant messages when they are near the store or in the delivery area.
Cluster 1¶
BEHAVIOR:
- 53.58% of customers use the “Delivery” channel.
- 40.9% order at the Store and 21.27% via the App. Over time, customers have tended to order more via the App, while orders via the Call Center have decreased.
- Although this Cluster ranks only third in number of customers, it has the highest number of voucher users.
- Most of the people who order in Northern & Southern Provinces use “Take away” while people who order in Hanoi & Ho Chi Minh City choose “Delivery”.
RECOMMENDATION:
- Optimize delivery services: timely deliveries and methods to maintain food temperature.
- Provide attractive, personalized voucher offers to increase customer loyalty.
- Utilize up-selling strategies to optimize revenue and enhance the customer experience.
- Improve the ordering experience on the mobile app, advertise the special benefits of ordering via the app, and build an appropriate promotion-code program.
Cluster 2¶
BEHAVIOR:
- The preference for “Take Away” was 58.45%, while “Delivery” was 40.69%.
- 59.1% of customers ordered at the Store, and 22.03% via the Website.
- Most of the people who order in Hanoi, Northern & Southern Provinces use “Take Away” while people who order in Ho Chi Minh City choose “Delivery”.
- Customers last purchased in August 2022.
RECOMMENDATION:
- Deprioritize this group: with an estimated churn probability of 100% and a projected CLV of 0, reactivation spending here is unlikely to pay off.
Cluster 3¶
BEHAVIOR:
- Most of the people who order in Northern & Southern Provinces use “Take Away” while people who order in Hanoi & Ho Chi Minh City choose “Delivery”.
- These customers used to be frequent buyers, but they have not purchased in a long time.
- These customers prefer “Delivery” (49.97%) as the service channel, while placing their orders mostly at the Store (48.96%) or via the Website (22.68%).
RECOMMENDATION:
- Craft personalized win-back campaigns for inactive but previously frequent buyers, offering compelling incentives or discounts to encourage their return.
- Consider reaching out via phone calls with exclusive reactivation offers to reconnect with these customers personally.
- Showcase any service improvements or enhancements made since their last purchase to entice them back.
- Promote testimonials from returning customers to develop trust in the brand's quality and service.
General Recommendations¶
MARKETING & PROMOTIONS:
- Provide culturally immersive experiences by deploying pizza food trucks on weekends or holidays at famous locations in Hanoi or Ho Chi Minh City, such as Hoan Kiem Lake or Nguyen Hue Walking Street.
- Use KOLs (key opinion leaders) and KOCs (key opinion consumers) to expand brand recognition.
- Open another signature restaurant in Hanoi to boost customers' dine-in experience.
- Use trending TV shows as a marketing channel.
- Promote marketing campaigns via engaging activities and challenges on social media.
--> By taking the above actions, Pizza Hut can attract new customers at a time when the number of new customers is trending downward month after month.
ENHANCING CUSTOMER LOYALTY:
- Strengthen the loyalty program with tiered rewards and exclusive benefits such as vouchers, discounts, and coupons.
- Optimize the Pizza Hut mobile app for a seamless and engaging user experience, offering exclusive promotions and features.
--> Since Pizza Hut's retention rate tends to decrease over time, this is an important action for maintaining a consistent base of loyal customers.
INNOVATION IN SERVICES & PRODUCTS:
- Continue introducing innovative menu items to cater to diverse tastes and consider limited-time releases to create excitement and encourage exploration of the menu (Ex: Introducing new flavors for current L1MO - 1 meter pizza).
- For customizable products (My Box), create an interactive, digital experience where customers can customize their pizza with their favorite toppings, crust, and size.
FEEDBACK & CONTINUOUS DEVELOPMENT:
- Establish easy-to-access feedback mechanisms for continuous improvement, and reward customers for submitting feedback.
- Analyze customer feedback to identify areas for enhancement in service, menu, or overall experience.
- Improve information systems to reduce unknown values in the Gender category, giving the company the specific insight needed for targeted marketing plans and selling strategies on holidays such as International Women’s Day, Vietnamese Women’s Day, or Men’s Day.