This is a customer segmentation and targeting model I built for a leading payment processing platform in 2024. It uses a hybrid approach of rule-based filtering, RFM (Recency, Frequency, Monetary) analysis, and K-Means clustering to identify strong merchant candidates based on their transaction activity.
For this showcase, I act as if I am collaborating with the Head of Product to encourage more existing users to adopt the platform's Subscriptions payments product. The ultimate goal is to identify a list of new-to-Subscriptions users for a targeted sales or marketing campaign.
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from ydata_profiling import ProfileReport
# Load data
raw_payments_df = pd.read_csv('Data/payments.csv', parse_dates = ['date'])
raw_merchants_df = pd.read_csv('Data/merchants.csv')
print(raw_payments_df.info())
print(raw_merchants_df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1577887 entries, 0 to 1577886
Data columns (total 6 columns):
 #   Column               Non-Null Count    Dtype
---  ------               --------------    -----
 0   date                 1577887 non-null  datetime64[ns, UTC]
 1   merchant             1577887 non-null  object
 2   subscription_volume  1577887 non-null  int64
 3   checkout_volume      1577887 non-null  int64
 4   payment_link_volume  1577887 non-null  int64
 5   total_volume         1577887 non-null  int64
dtypes: datetime64[ns, UTC](1), int64(4), object(1)
memory usage: 72.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23627 entries, 0 to 23626
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   merchant           23627 non-null  object
 1   industry           23627 non-null  object
 2   first_charge_date  23627 non-null  object
 3   country            23627 non-null  object
 4   business_size      23627 non-null  object
dtypes: object(5)
memory usage: 923.1+ KB
None
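ydata_profiling is imported above but never invoked in this excerpt. As a minimal sketch of how an automated profiling pass could be run on the two raw tables (the output filenames are illustrative, not from the original project):
# Optional: automated EDA reports for the raw tables (filenames are illustrative)
payments_profile = ProfileReport(raw_payments_df, title='Payments Profile', minimal=True)
merchants_profile = ProfileReport(raw_merchants_df, title='Merchants Profile')
payments_profile.to_file('payments_profile.html')
merchants_profile.to_file('merchants_profile.html')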
# Describe merchants
print(raw_merchants_df.head())
raw_merchants_df.describe()
   merchant           industry          first_charge_date country business_size
0  5d03e714          Education  2032-02-13 00:00:00+00:00      US        medium
1  da22f154             Others  2031-10-16 00:00:00+00:00      US         small
2  687eebc8           Software  2032-07-23 00:00:00+00:00      US         small
3  de478470           Software  2033-03-15 00:00:00+00:00      US         small
4  1e719b8a  Business services  2035-02-12 00:00:00+00:00      IT         small
 | merchant | industry | first_charge_date | country | business_size
---|---|---|---|---|---
count | 23627 | 23627 | 23627 | 23627 | 23627 |
unique | 23619 | 30 | 2872 | 44 | 3 |
top | 0.00E+0 | Business services | 2041-10-27 00:00:00+00:00 | US | small |
freq | 8 | 3144 | 43 | 8595 | 22853 |
# Check unique values and their counts for categorical columns
industry_counts = raw_merchants_df['industry'].value_counts()
country_counts = raw_merchants_df['country'].value_counts()
business_size_counts = raw_merchants_df['business_size'].value_counts()
industry_counts, country_counts, business_size_counts
# Note: industry contains a spurious "0" value, business_size is heavily imbalanced, and a few countries have very low counts
(industry
 Business services                                     3144
 Software                                              2167
 Merchandise                                           1988
 Personal services                                     1922
 Others                                                1732
 Digital goods                                         1379
 Education                                             1332
 Clothing & accessory                                  1303
 Food & drink                                          1275
 Grocery & food stores                                  780
 Religion, politics & other memberships                 762
 Travel & lodging                                       727
 Leisure                                                554
 Art & photography                                      508
 Charity                                                502
 Medical services, drugs, testing labs & equipment      435
 Ticketing & events                                     425
 Furnishing                                             419
 Cosmetics                                              399
 Specialty retail                                       384
 Rentals                                                350
 Automotive parts and repair/service shops              258
 Construction                                           221
 Healthcare                                             204
 Electronics                                            194
 Transportation & car rental                            189
 Consulting services                                     50
 Insurance                                               14
 0                                                         7
 Money transmitters                                        3
 Name: count, dtype: int64,
 country
 US    8595
 GB    3478
 FR    1700
 AU    1378
 CA    1324
 ES     926
 IT     821
 JP     580
 DE     507
 IE     428
 MX     353
 IN     333
 CH     302
 NZ     248
 HK     208
 SG     201
 NL     196
 BE     184
 BR     156
 SE     151
 AT     143
 MY     127
 NO     123
 PT     118
 AE     115
 DK     114
 PL     103
 RO      94
 SI      70
 CZ      67
 EE      65
 SK      60
 BG      52
 LT      51
 FI      50
 GR      44
 LV      43
 HU      37
 LU      27
 CY      26
 MT      20
 HR       7
 TH       1
 GI       1
 Name: count, dtype: int64,
 business_size
 small     22853
 medium      622
 large       152
 Name: count, dtype: int64)
# Describe payments
print(raw_payments_df.head())
raw_payments_df.describe(include = 'all')
                        date  merchant  subscription_volume  checkout_volume  payment_link_volume  total_volume
0  2041-05-01 00:00:00+00:00  5d03e714                    0                0                    0        425340
1  2041-05-01 00:00:00+00:00  da22f154                    0                0                    0         17326
2  2041-05-01 00:00:00+00:00  687eebc8                79400                0                    0         79400
3  2041-05-01 00:00:00+00:00  de478470               268400                0                    0        268400
4  2041-05-01 00:00:00+00:00  1e719b8a                    0            19895                    0         19895
 | date | merchant | subscription_volume | checkout_volume | payment_link_volume | total_volume
---|---|---|---|---|---|---
count | 1577887 | 1577887 | 1.577887e+06 | 1.577887e+06 | 1.577887e+06 | 1.577887e+06 |
unique | NaN | 23619 | NaN | NaN | NaN | NaN |
top | NaN | de478470 | NaN | NaN | NaN | NaN |
freq | NaN | 418 | NaN | NaN | NaN | NaN |
mean | 2041-12-02 00:57:55.029073408+00:00 | NaN | 3.924766e+04 | 1.101376e+04 | 1.843901e+03 | 3.582914e+05 |
min | 2041-05-01 00:00:00+00:00 | NaN | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
25% | 2041-08-21 00:00:00+00:00 | NaN | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 6.000000e+03 |
50% | 2041-12-05 00:00:00+00:00 | NaN | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.987200e+04 |
75% | 2042-03-17 00:00:00+00:00 | NaN | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 7.230000e+04 |
max | 2042-06-22 00:00:00+00:00 | NaN | 1.083879e+08 | 3.425907e+07 | 1.182900e+07 | 2.243810e+09 |
std | NaN | NaN | 7.253647e+05 | 1.608574e+05 | 3.717659e+04 | 6.488684e+06 |
# Generate corr plot of payment volume
volume_columns = [col for col in raw_payments_df.columns if col.endswith('_volume')]
volume_df = raw_payments_df[volume_columns]
correlation = volume_df.corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.title('Correlation Plot of Volume Fields')
plt.show()
# Box and whisker plots
fig, axs = plt.subplots(1, len(volume_columns), figsize=(15, 5))
for i, column in enumerate(volume_columns):
    sns.boxplot(x=raw_payments_df[column], ax=axs[i])
    axs[i].set_title(column)
plt.show()
# Clean payments to parse dates
raw_payments_df['date'] = pd.to_datetime(raw_payments_df['date'])
# Convert Cents to Dollars for all columns ending in _volume
volume_columns = [col for col in raw_payments_df.columns if col.endswith('_volume')]
raw_payments_df[volume_columns] = raw_payments_df[volume_columns] / 100
payments_df = raw_payments_df.copy()
payments_df.dtypes
date                   datetime64[ns, UTC]
merchant                            object
subscription_volume                float64
checkout_volume                    float64
payment_link_volume                float64
total_volume                       float64
dtype: object
# Review & handle duplicate merchant
print(raw_merchants_df[raw_merchants_df.duplicated(subset = ['merchant'])])
dups_row = raw_merchants_df.duplicated(subset = ['merchant'], keep = False)
print(raw_merchants_df[dups_row])
# Remove any duplicate merchant id
raw_merchants_df = raw_merchants_df[~dups_row]
# Drop first charge dates of 0 from data and parse dates
no_charge_date_mask = raw_merchants_df['first_charge_date'] == "0"
raw_merchants_df = raw_merchants_df[~no_charge_date_mask]
raw_merchants_df['first_charge_date'] = pd.to_datetime(raw_merchants_df['first_charge_date'])
# Drop rows with industry 0
raw_merchants_df = raw_merchants_df[raw_merchants_df['industry'] != "0"]
merchants_df = raw_merchants_df.copy()
print(merchants_df.dtypes)
print(merchants_df.shape)
      merchant           industry          first_charge_date country business_size
11606  0.00E+0        Merchandise  2038-11-30 00:00:00+00:00      CA         small
13020  0.00E+0          Education  2041-07-15 00:00:00+00:00      MX         small
13304  4.72E+10         Education  2041-01-20 00:00:00+00:00      IT         small
13384  0.00E+0             Others  2041-07-24 00:00:00+00:00      GB         small
18458  0.00E+0  Business services  2041-12-16 00:00:00+00:00      US         small
20739  0.00E+0             Others  2042-03-10 00:00:00+00:00      US         small
21151  0.00E+0             Others  2042-03-26 00:00:00+00:00      GB         small
22686  0.00E+0            Leisure  2042-05-22 00:00:00+00:00      SG         small

      merchant              industry          first_charge_date country business_size
11282  0.00E+0            Healthcare  2040-12-08 00:00:00+00:00      US         small
11606  0.00E+0           Merchandise  2038-11-30 00:00:00+00:00      CA         small
12221  4.72E+10  Clothing & accessory 2037-08-01 00:00:00+00:00       US         small
13020  0.00E+0             Education  2041-07-15 00:00:00+00:00      MX         small
13304  4.72E+10            Education  2041-01-20 00:00:00+00:00      IT         small
13384  0.00E+0                Others  2041-07-24 00:00:00+00:00      GB         small
18458  0.00E+0     Business services  2041-12-16 00:00:00+00:00      US         small
20739  0.00E+0                Others  2042-03-10 00:00:00+00:00      US         small
21151  0.00E+0                Others  2042-03-26 00:00:00+00:00      GB         small
22686  0.00E+0               Leisure  2042-05-22 00:00:00+00:00      SG         small

merchant                          object
industry                          object
first_charge_date    datetime64[ns, UTC]
country                           object
business_size                     object
dtype: object
(23584, 5)
# Create last_date reference based on payments data
last_date = payments_df['date'].max()
print(last_date)
# Create new column merchant_age_mos (30.44 is the average number of days per month)
merchants_df['merchant_age_mos'] = (last_date - merchants_df['first_charge_date']).dt.days // 30.44
merchants_df.head()
2042-06-22 00:00:00+00:00
 | merchant | industry | first_charge_date | country | business_size | merchant_age_mos
---|---|---|---|---|---|---
0 | 5d03e714 | Education | 2032-02-13 00:00:00+00:00 | US | medium | 124.0 |
1 | da22f154 | Others | 2031-10-16 00:00:00+00:00 | US | small | 128.0 |
2 | 687eebc8 | Software | 2032-07-23 00:00:00+00:00 | US | small | 118.0 |
3 | de478470 | Software | 2033-03-15 00:00:00+00:00 | US | small | 111.0 |
4 | 1e719b8a | Business services | 2035-02-12 00:00:00+00:00 | IT | small | 88.0 |
# Define conditions for np.select
conditions = [
(merchants_df['merchant_age_mos'] <= 6),
(merchants_df['merchant_age_mos'] > 6) & (merchants_df['merchant_age_mos'] <= 24),
(merchants_df['merchant_age_mos'] > 24) & (merchants_df['merchant_age_mos'] <= 60),
(merchants_df['merchant_age_mos'] > 60)
]
# Define corresponding labels
choices = ['New', 'Established', 'Mature', 'Legacy']
# Use np.select to create the 'merchant_age_group' column
merchants_df['merchant_age_group'] = np.select(conditions, choices, default='Unknown')
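As a usage note, the same buckets can be expressed with pd.cut; this is an equivalent sketch (assigned to a throwaway variable so it does not alter the pipeline), not the approach used above:
# Equivalent binning with pd.cut (right-inclusive bins match the np.select thresholds above)
age_bins = [-np.inf, 6, 24, 60, np.inf]
age_labels = ['New', 'Established', 'Mature', 'Legacy']
merchant_age_group_alt = pd.cut(merchants_df['merchant_age_mos'], bins=age_bins, labels=age_labels)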
# Merchant age in months histogram
plt.figure(figsize=(10, 5))
sns.histplot(merchants_df['merchant_age_mos'], bins=50, kde=True)
plt.title('Merchant Age in Months')
plt.xlabel('Merchant Age (Months)')
plt.ylabel('Frequency')
plt.show()
# Merchant age group plot
merchants_df['merchant_age_group'].value_counts().plot(kind='bar')
plt.title('Counts of Merchant Age Group')
plt.xlabel('Merchant Age Group')
plt.ylabel('Count')
plt.show()
# Drop first_charge_date and merchant_age_mos now that merchant_age_group captures tenure
merchants_df = merchants_df.drop(columns=['first_charge_date', 'merchant_age_mos'])
merchants_df.head()
 | merchant | industry | country | business_size | merchant_age_group
---|---|---|---|---|---
0 | 5d03e714 | Education | US | medium | Legacy |
1 | da22f154 | Others | US | small | Legacy |
2 | 687eebc8 | Software | US | small | Legacy |
3 | de478470 | Software | US | small | Legacy |
4 | 1e719b8a | Business services | IT | small | Legacy |
# Merge and evaluate data
distinct_merchants = payments_df['merchant'].nunique()
print("Number of distinct merchants in payments:", distinct_merchants)
print(payments_df.shape)
distinct_merchants = merchants_df['merchant'].nunique()
print("Number of distinct merchants in merchants:", distinct_merchants)
print(merchants_df.shape)
# Merge payments and merchants data using inner join
payments_merchants_df = payments_df.merge(merchants_df, on='merchant', how='inner')
print("number of distinct merchants in merged data:", payments_merchants_df['merchant'].nunique())
print(payments_merchants_df.shape)
# Merchants in payments_df but not in merchants_df
missing_in_merchants = payments_df[~payments_df['merchant'].isin(merchants_df['merchant'])]
# Merchants in merchants_df but not in payments_df
missing_in_payments = merchants_df[~merchants_df['merchant'].isin(payments_df['merchant'])]
print("Merchants in payments_df but not in merchants_df:", missing_in_merchants['merchant'].nunique())
print("Merchants in merchants_df but not in payments_df:", missing_in_payments['merchant'].nunique())
Number of distinct merchants in payments: 23619
(1577887, 6)
Number of distinct merchants in merchants: 23584
(23584, 5)
number of distinct merchants in merged data: 23557
(1574955, 10)
Merchants in payments_df but not in merchants_df: 62
Merchants in merchants_df but not in payments_df: 27
payments_merchants_df.groupby('business_size')['total_volume'].mean().plot(kind='bar', color='#7356BF')
plt.title('Average Merchant-Day Total_Volume by Business Size')
plt.xlabel('Business Size')
plt.ylabel('Average Total Volume (USD)')
plt.show()
payments_merchants_df.groupby('business_size')['total_volume'].median().plot(kind='bar', color='#7356BF')
plt.title('Median Merchant-Day Total_Volume by Business Size')
plt.xlabel('Business Size')
plt.ylabel('Median Total Volume (USD)')
plt.show()
payments_merchants_df['business_size'].value_counts().plot(kind='bar', color='#7356BF')
plt.title('Merchant-Day Value_Counts of Business Size')
plt.xlabel('Business Size')
plt.ylabel('Count')
plt.show()
RULE-BASED FILTER
I applied initial rule-based filters to narrow down the broad customer segments. The focus was on excluding merchants already using Subscriptions, targeting those with recent activity above a volume threshold, and concentrating on small and medium businesses (SMBs). By filtering out larger enterprises, the analysis became scalable and focused on the merchant segment most likely to adopt Subscriptions quickly.
# 1. Identify non-subscription users by summing the subscription_volume per merchant
merchant_subscription_sum = payments_merchants_df.groupby('merchant')['subscription_volume'].sum()
## Filter merchants who have a total subscription_volume of 0
non_subscription_merchants = merchant_subscription_sum[merchant_subscription_sum == 0].index
## Filter the payments_merchants_df to only include non-subscription merchants
non_subscription_merchants_df = payments_merchants_df[payments_merchants_df['merchant'].isin(non_subscription_merchants)].copy()  # .copy() avoids SettingWithCopyWarning when adding columns later
# 2. Filter merchants with activity within the last 60 days
## Assign variable for the latest date in the payments df to use as a reference
# last_date = payments_merchants_df['date'].max() # Already defined in code chunks above
cutoff_days = 60
recent_cutoff_date = last_date - pd.Timedelta(days=cutoff_days) # Cutoff date
# Filter transactions within the last 60 days
recent_transactions_df = non_subscription_merchants_df[non_subscription_merchants_df['date'] >= recent_cutoff_date]
# 3. Calculate total volume during the recent 60-day period for each merchant
recent_activity_total_volume = recent_transactions_df.groupby('merchant')['total_volume'].sum()
# Apply the volume threshold ($1,000; volumes were already converted from cents to dollars above)
volume_threshold = 1000
active_merchants = recent_activity_total_volume[recent_activity_total_volume >= volume_threshold].index # Merchants with total volume >= $1,000 within the past 60 days
filtered_payments_merchants = non_subscription_merchants_df[non_subscription_merchants_df['merchant'].isin(active_merchants)]
# Keep only small and medium business sizes (SMB)
filtered_payments_merchants_df = filtered_payments_merchants[filtered_payments_merchants['business_size'].isin(['small', 'medium'])]
print(filtered_payments_merchants_df.shape)
print(filtered_payments_merchants_df.head())
filtered_payments_merchants_df.describe()
(882324, 10)
                        date  merchant  subscription_volume  checkout_volume  payment_link_volume  total_volume           industry country business_size merchant_age_group
0  2041-05-01 00:00:00+00:00  5d03e714                  0.0             0.00                  0.0       4253.40          Education      US        medium             Legacy
1  2041-05-01 00:00:00+00:00  da22f154                  0.0             0.00                  0.0        173.26             Others      US         small             Legacy
4  2041-05-01 00:00:00+00:00  1e719b8a                  0.0           198.95                  0.0        198.95  Business services      IT         small             Legacy
5  2041-05-01 00:00:00+00:00  15f61630                  0.0             0.00                  0.0       3285.00           Software      US         small             Legacy
7  2041-05-01 00:00:00+00:00  11f1600c                  0.0             0.00                  0.0       1100.00           Software      US         small             Mature
 | subscription_volume | checkout_volume | payment_link_volume | total_volume
---|---|---|---|---
count | 882324.0 | 882324.000000 | 882324.000000 | 8.823240e+05 |
mean | 0.0 | 151.297261 | 14.904575 | 2.098112e+03 |
std | 0.0 | 2054.481887 | 344.322663 | 2.188005e+04 |
min | 0.0 | 0.000000 | 0.000000 | 1.000000e-02 |
25% | 0.0 | 0.000000 | 0.000000 | 9.386000e+01 |
50% | 0.0 | 0.000000 | 0.000000 | 2.705600e+02 |
75% | 0.0 | 0.000000 | 0.000000 | 8.867300e+02 |
max | 0.0 | 342590.690000 | 46039.270000 | 1.374812e+07 |
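Before moving to RFM, a quick sanity check on the rule-based filters is worth running; the assertions below are an illustrative sketch rather than part of the original notebook:
# Sanity checks on the rule-based filters (illustrative assertions)
assert (filtered_payments_merchants_df.groupby('merchant')['subscription_volume'].sum() == 0).all()  # no Subscriptions usage
assert filtered_payments_merchants_df['business_size'].isin(['small', 'medium']).all()  # SMB only
recent_check_df = filtered_payments_merchants_df[filtered_payments_merchants_df['date'] >= recent_cutoff_date]
assert (recent_check_df.groupby('merchant')['total_volume'].sum() >= volume_threshold).all()  # >= $1,000 in the last 60 days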
RFM ENGINEERING & ANALYSIS
To quantify merchant engagement and value, I calculated Recency, Frequency, and Monetary (RFM) features, an approach commonly used in customer segmentation to identify distinct groups based on transaction behavior. These features form the foundation for the clustering that follows.
# 1. Recency: Calculate the number of days since the last transaction for each merchant
# Uses last_date as the reference point
recency = filtered_payments_merchants_df.groupby('merchant')['date'].max().apply(lambda x: (last_date - x).days)
# 2. Frequency: Count the number of active transaction days for each merchant
frequency = filtered_payments_merchants_df.groupby('merchant')['date'].count()
# 3. Monetary: Sum the total transaction volume for each merchant
monetary = filtered_payments_merchants_df.groupby('merchant')['total_volume'].sum()
# Combine RFM features into a single DataFrame
rfm_payments_df = pd.DataFrame({
'merchant': recency.index, # include merchant
'recency': recency.values,
'frequency': frequency.values,
'monetary': monetary.values
})
grouped_filtered_payments_merchants_df = filtered_payments_merchants_df[
['merchant', 'industry', 'country', 'business_size', 'merchant_age_group']
].drop_duplicates().reset_index(drop=True)
rfm_payments_df = rfm_payments_df.merge(grouped_filtered_payments_merchants_df, on='merchant', how='inner')
print(rfm_payments_df.shape)
rfm_payments_df.head()
(6006, 8)
 | merchant | recency | frequency | monetary | industry | country | business_size | merchant_age_group
---|---|---|---|---|---|---|---|---
0 | 001111d7 | 0 | 9 | 18659.95 | Transportation & car rental | SK | small | New |
1 | 003f950d | 8 | 9 | 9126.89 | Merchandise | CA | small | New |
2 | 00491f9d | 0 | 353 | 87327.98 | Personal services | IT | small | Mature |
3 | 005392db | 1 | 87 | 24719.02 | Travel & lodging | FR | small | Mature |
4 | 0076b5fd | 0 | 306 | 333604.62 | Digital goods | CA | small | Mature |
print(rfm_payments_df.frequency.max())
# Setting up the attributes for the boxplot
attributes = ['monetary', 'frequency', 'recency']
# Plot the boxplot for RFM features
plt.rcParams['figure.figsize'] = [10, 8]
sns.boxplot(data=rfm_payments_df[attributes], orient="v", palette="Set2", whis=1.5, saturation=1, width=0.7)
plt.title("RFM Box Plot", fontsize=14, fontweight='bold')
plt.ylabel("Range", fontweight='bold')
plt.xlabel("Attributes", fontweight='bold')
plt.show()
# Plot the distribution of Recency, Frequency, and Monetary values
plt.figure(figsize=(18, 5))
# Recency Distribution
plt.subplot(1, 3, 1)
plt.hist(rfm_payments_df['recency'], bins=30, color='blue', edgecolor='black')
plt.title('Recency Distribution')
plt.xlabel('Days Since Last Transaction')
plt.ylabel('Number of Merchants')
# Frequency Distribution
plt.subplot(1, 3, 2)
plt.hist(rfm_payments_df['frequency'], bins=30, color='green', edgecolor='black')
plt.title('Frequency Distribution')
plt.xlabel('Number of Transactions')
plt.ylabel('Number of Merchants')
# Monetary Distribution
plt.subplot(1, 3, 3)
plt.hist(rfm_payments_df['monetary'], bins=30, color='red', edgecolor='black')
plt.title('Monetary Distribution')
plt.xlabel('Total Volume (Dollars)')
plt.ylabel('Number of Merchants')
plt.tight_layout()
plt.show()
418
# Create a binary feature to indicate whether a merchant had Checkout or Payment Link volume
non_subscription_merchants_df.loc[:,'used_checkout_or_payment_link'] = (
(non_subscription_merchants_df['checkout_volume'] > 0) |
(non_subscription_merchants_df['payment_link_volume'] > 0)
).astype(int)
# Take the max to see whether they EVER used it
merchant_usage_feature = non_subscription_merchants_df.groupby('merchant')['used_checkout_or_payment_link'].max()
merchant_usage_feature_list = merchant_usage_feature.to_frame().reset_index()
# Merge to rfm_payments_df
rfm_payments_df = rfm_payments_df.merge(merchant_usage_feature_list, on='merchant', how='left')
rfm_payments_df.head(20)
 | merchant | recency | frequency | monetary | industry | country | business_size | merchant_age_group | used_checkout_or_payment_link
---|---|---|---|---|---|---|---|---|---
0 | 001111d7 | 0 | 9 | 18659.95 | Transportation & car rental | SK | small | New | 0 |
1 | 003f950d | 8 | 9 | 9126.89 | Merchandise | CA | small | New | 0 |
2 | 00491f9d | 0 | 353 | 87327.98 | Personal services | IT | small | Mature | 0 |
3 | 005392db | 1 | 87 | 24719.02 | Travel & lodging | FR | small | Mature | 0 |
4 | 0076b5fd | 0 | 306 | 333604.62 | Digital goods | CA | small | Mature | 0 |
5 | 007f783b | 0 | 418 | 12074538.61 | Merchandise | US | small | Mature | 0 |
6 | 0093ba4d | 0 | 416 | 991587.51 | Transportation & car rental | ES | small | Mature | 0 |
7 | 009dd2b6 | 2 | 76 | 166102.00 | Personal services | IT | small | Mature | 1 |
8 | 00a43c37 | 0 | 416 | 3413052.83 | Business services | US | small | Established | 0 |
9 | 00a7095f | 0 | 71 | 10223.00 | Personal services | US | small | Established | 0 |
10 | 00b55ff7 | 30 | 14 | 21604.89 | Clothing & accessory | AU | small | Legacy | 0 |
11 | 00c6049c | 10 | 32 | 8223.39 | Grocery & food stores | US | small | New | 1 |
12 | 00cfc07f | 6 | 335 | 177240.19 | Personal services | NL | small | Established | 1 |
13 | 00e3b995 | 2 | 158 | 23392.84 | Merchandise | FR | small | Mature | 0 |
14 | 00ea5a27 | 0 | 382 | 58441.58 | Food & drink | NZ | small | Mature | 0 |
15 | 00ef7320 | 0 | 251 | 1504137.16 | Business services | US | medium | Mature | 0 |
16 | 00f0c8c8 | 1 | 98 | 26497.18 | Cosmetics | ES | small | Mature | 0 |
17 | 010b0bb3 | 0 | 56 | 4808.17 | Merchandise | ES | small | Mature | 1 |
18 | 012258f6 | 0 | 243 | 1270698.16 | Business services | US | small | Mature | 1 |
19 | 012db28b | 5 | 53 | 61497.19 | Merchandise | HK | small | Mature | 0 |
print(rfm_payments_df.shape)
print(rfm_payments_df.value_counts('used_checkout_or_payment_link', normalize = True))
rfm_payments_df.value_counts('used_checkout_or_payment_link')
(6006, 9)
used_checkout_or_payment_link
0    0.771895
1    0.228105
Name: proportion, dtype: float64
used_checkout_or_payment_link
0    4636
1    1370
Name: count, dtype: int64
# Scale the three RFM features using StandardScaler while ignoring the rest
ct = ColumnTransformer([
('scaler', StandardScaler(), ['recency', 'frequency', 'monetary']), # Columns to scale
], remainder='passthrough')
rfm_scaled_array = ct.fit_transform(rfm_payments_df)
# Convert the transformed array back to a DataFrame
rfm_scaled_df = pd.DataFrame(rfm_scaled_array, columns=['recency', 'frequency', 'monetary', 'merchant', 'industry', 'country', 'business_size', 'merchant_age_group', 'used_checkout_or_payment_link'])
# Since the passthrough columns are added at the end, reordering columns for clarity
rfm_scaled_df = rfm_scaled_df[['merchant', 'recency', 'frequency', 'monetary', 'industry', 'country', 'business_size', 'merchant_age_group', 'used_checkout_or_payment_link']]
rfm_scaled_df.head(20)
 | merchant | recency | frequency | monetary | industry | country | business_size | merchant_age_group | used_checkout_or_payment_link
---|---|---|---|---|---|---|---|---|---
0 | 001111d7 | -0.551518 | -1.022015 | -0.125441 | Transportation & car rental | SK | small | New | 0 |
1 | 003f950d | 0.158825 | -1.022015 | -0.129571 | Merchandise | CA | small | New | 0 |
2 | 00491f9d | -0.551518 | 1.527334 | -0.095694 | Personal services | IT | small | Mature | 0 |
3 | 005392db | -0.462725 | -0.443965 | -0.122817 | Travel & lodging | FR | small | Mature | 0 |
4 | 0076b5fd | -0.551518 | 1.179021 | 0.010993 | Digital goods | CA | small | Mature | 0 |
5 | 007f783b | -0.551518 | 2.009042 | 5.097195 | Merchandise | US | small | Mature | 0 |
6 | 0093ba4d | -0.551518 | 1.99422 | 0.296033 | Transportation & car rental | ES | small | Mature | 0 |
7 | 009dd2b6 | -0.373932 | -0.525485 | -0.061569 | Personal services | IT | small | Mature | 1 |
8 | 00a43c37 | -0.551518 | 1.99422 | 1.345018 | Business services | US | small | Established | 0 |
9 | 00a7095f | -0.551518 | -0.56254 | -0.129096 | Personal services | US | small | Established | 0 |
10 | 00b55ff7 | 2.112266 | -0.984961 | -0.124166 | Clothing & accessory | AU | small | Legacy | 0 |
11 | 00c6049c | 0.33641 | -0.851565 | -0.129963 | Grocery & food stores | US | small | New | 1 |
12 | 00cfc07f | -0.018761 | 1.393937 | -0.056744 | Personal services | NL | small | Established | 1 |
13 | 00e3b995 | -0.373932 | 0.082208 | -0.123391 | Merchandise | FR | small | Mature | 0 |
14 | 00ea5a27 | -0.551518 | 1.74225 | -0.108208 | Food & drink | NZ | small | Mature | 0 |
15 | 00ef7320 | -0.551518 | 0.771422 | 0.518071 | Business services | US | medium | Mature | 0 |
16 | 00f0c8c8 | -0.462725 | -0.362445 | -0.122046 | Cosmetics | ES | small | Mature | 0 |
17 | 010b0bb3 | -0.551518 | -0.673703 | -0.131442 | Merchandise | ES | small | Mature | 1 |
18 | 012258f6 | -0.551518 | 0.712135 | 0.416945 | Business services | US | small | Mature | 1 |
19 | 012db28b | -0.107554 | -0.695936 | -0.106884 | Merchandise | HK | small | Mature | 0 |
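One caveat worth noting: with remainder='passthrough' over a mix of numeric and object columns, fit_transform returns an object-dtype array, so the scaled columns in rfm_scaled_df may also end up with object dtype. KMeans should still coerce them, but casting explicitly is a safer sketch:
# Cast the scaled RFM columns back to float before clustering
rfm_scaled_df[['recency', 'frequency', 'monetary']] = rfm_scaled_df[['recency', 'frequency', 'monetary']].astype(float)
print(rfm_scaled_df.dtypes)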
# Pick cluster for RFM features
num_clusters = 4
rfm_features = rfm_scaled_df[['recency', 'frequency', 'monetary']]
kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=42)
kmeans.fit(rfm_features)
# Add label to primary DF
rfm_scaled_df['cluster'] = kmeans.labels_
rfm_scaled_df.head()
 | merchant | recency | frequency | monetary | industry | country | business_size | merchant_age_group | used_checkout_or_payment_link | cluster
---|---|---|---|---|---|---|---|---|---|---
0 | 001111d7 | -0.551518 | -1.022015 | -0.125441 | Transportation & car rental | SK | small | New | 0 | 0 |
1 | 003f950d | 0.158825 | -1.022015 | -0.129571 | Merchandise | CA | small | New | 0 | 0 |
2 | 00491f9d | -0.551518 | 1.527334 | -0.095694 | Personal services | IT | small | Mature | 0 | 1 |
3 | 005392db | -0.462725 | -0.443965 | -0.122817 | Travel & lodging | FR | small | Mature | 0 | 0 |
4 | 0076b5fd | -0.551518 | 1.179021 | 0.010993 | Digital goods | CA | small | Mature | 0 | 1 |
# Scree Plot
sum_squared_distances = []
range_n_clusters = [2, 3, 4, 5, 6, 7]
for num_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=42)
    kmeans.fit(rfm_features)  # Use the RFM features
    sum_squared_distances.append(kmeans.inertia_)  # Append the inertia (SSD) for each k
# Plot the SSDs for each number of clusters
plt.figure(figsize=(10, 6))
plt.plot(range_n_clusters, sum_squared_distances, marker='o', color='purple')
plt.title('Scree Plot: Choosing the Optimal Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Distances (Inertia)')
plt.show()
# Note: 4 clusters looks good
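silhouette_score is imported at the top but not used in this excerpt; a short sketch like the one below can corroborate the elbow read before settling on four clusters:
# Silhouette analysis to complement the scree plot (assumes rfm_features columns are numeric)
for n in range_n_clusters:
    cluster_labels = KMeans(n_clusters=n, max_iter=50, random_state=42).fit_predict(rfm_features)
    print(f"k={n}: silhouette score = {silhouette_score(rfm_features, cluster_labels):.3f}")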
# Count the number of merchants in each cluster
cluster_counts = rfm_scaled_df['cluster'].value_counts()
# Create the bar plot
plt.bar(cluster_counts.index, cluster_counts.values)
# Add data labels to the bars
for i, count in enumerate(cluster_counts.values):
    plt.text(i, count, str(count), ha='center', va='bottom')
# Set the x-axis labels
plt.xticks(cluster_counts.index)
# Set the title and labels
plt.title('Value Counts of Clusters by Merchant')
plt.xlabel('Cluster')
plt.ylabel('Count')
# Show the plot
plt.show()
sns.boxplot(x='cluster', y='recency', data=rfm_scaled_df)
sns.boxplot(x='cluster', y='monetary', data=rfm_scaled_df)
sns.boxplot(x='cluster', y='frequency', data=rfm_scaled_df)
plt.rcParams["figure.figsize"] = (15,5)
colormap = 'brg'  # Pass the colormap name directly; cm.get_cmap is deprecated in newer Matplotlib
plt.subplots()
plt.scatter(x=rfm_scaled_df['recency'], y=rfm_scaled_df['monetary'],
c=rfm_scaled_df['cluster'], cmap=colormap, alpha=0.5)
plt.xlabel('Recency')
plt.ylabel('Monetary')
# plt.ylim(0, 10)
plt.title("Clustering: Recency vs Monetary")
plt.subplots()
plt.scatter(x=rfm_scaled_df['frequency'], y=rfm_scaled_df['monetary'],
c=rfm_scaled_df['cluster'], cmap=colormap, alpha=0.5)
plt.xlabel('Frequency')
plt.ylabel('Monetary')
# plt.ylim(0, 10)
plt.title("Clustering: Frequency vs Monetary")
plt.subplots()
plt.scatter(x=rfm_scaled_df['recency'], y=rfm_scaled_df['frequency'],
c=rfm_scaled_df['cluster'], cmap=colormap, alpha=0.5)
plt.xlabel('Recency')
plt.ylabel('Frequency')
plt.title("Clustering: Frequency vs Recency")
# Set figure size
plt.rcParams["figure.figsize"] = (25, 25)
# Initialize the 3D plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d', elev=16, azim=134)
# Plotting the 3D scatter plot
scatter = ax.scatter(
rfm_scaled_df['frequency'],
rfm_scaled_df['recency'],
rfm_scaled_df['monetary'],
c=rfm_scaled_df['cluster'],
s=200,
cmap='brg',
alpha=0.6,
edgecolor='darkgrey'
)
# Labeling the axes
ax.set_xlabel('Frequency', fontsize=15)
ax.set_ylabel('Recency', fontsize=15)
ax.set_zlabel('Monetary', fontsize=15)
# Show plot
plt.show()
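To make the cluster choice easier to justify, each cluster can be profiled by size and mean scaled RFM values; the summary below is a sketch (it assumes the scaled columns are numeric, per the cast shown earlier) and is not part of the original output:
# Summarize each cluster by size and mean scaled RFM values
cluster_profile = rfm_scaled_df.groupby('cluster').agg(
    merchants=('merchant', 'count'),
    avg_recency=('recency', 'mean'),
    avg_frequency=('frequency', 'mean'),
    avg_monetary=('monetary', 'mean')
)
print(cluster_profile)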
# Pick final clusters for targeting
target_clusters = [1, 3] # Pink is 1, Green is 3
# Filter df for only target clusters
target_merchants_df = rfm_scaled_df[rfm_scaled_df['cluster'].isin(target_clusters)]
output_cols = ['merchant', 'cluster', 'recency', 'frequency', 'monetary', 'used_checkout_or_payment_link']
target_merchants_df = target_merchants_df[output_cols]
target_merchants_df.to_csv('target_merchants.csv', index=False)
target_merchants_df.head(20)
 | merchant | cluster | recency | frequency | monetary | used_checkout_or_payment_link
---|---|---|---|---|---|---
2 | 00491f9d | 1 | -0.551518 | 1.527334 | -0.095694 | 0 |
4 | 0076b5fd | 1 | -0.551518 | 1.179021 | 0.010993 | 0 |
5 | 007f783b | 1 | -0.551518 | 2.009042 | 5.097195 | 0 |
6 | 0093ba4d | 1 | -0.551518 | 1.99422 | 0.296033 | 0 |
8 | 00a43c37 | 1 | -0.551518 | 1.99422 | 1.345018 | 0 |
12 | 00cfc07f | 1 | -0.018761 | 1.393937 | -0.056744 | 1 |
14 | 00ea5a27 | 1 | -0.551518 | 1.74225 | -0.108208 | 0 |
15 | 00ef7320 | 1 | -0.551518 | 0.771422 | 0.518071 | 0 |
18 | 012258f6 | 1 | -0.551518 | 0.712135 | 0.416945 | 1 |
22 | 0164653b | 1 | -0.551518 | 1.038214 | -0.052792 | 0 |
26 | 01a8792b | 1 | -0.462725 | 0.860353 | 0.745331 | 1 |
27 | 01b09f0c | 1 | -0.462725 | 1.423581 | -0.108111 | 0 |
29 | 01d67eee | 1 | -0.462725 | 0.400877 | -0.087902 | 1 |
31 | 01e85abb | 1 | -0.551518 | 1.097501 | 0.173558 | 1 |
32 | 01f75b05 | 1 | -0.551518 | 0.638026 | -0.118132 | 0 |
34 | 01f9bf03 | 1 | -0.551518 | 2.009042 | 0.334932 | 1 |
41 | 021de54a | 1 | -0.551518 | 1.66073 | 0.047791 | 1 |
44 | 0235e81d | 1 | -0.551518 | 1.305007 | -0.114068 | 0 |
46 | 0243626f | 1 | -0.551518 | 1.971987 | -0.050573 | 0 |
50 | 026c3371 | 1 | -0.551518 | 2.001631 | -0.068823 | 0 |
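Finally, because target_merchants.csv carries scaled RFM values, the original units can be restored for the campaign hand-off by joining back to rfm_payments_df; the sketch below and its output filename are illustrative:
# Join the unscaled RFM values back in so the campaign list reads in original units
readable_targets_df = target_merchants_df[['merchant', 'cluster', 'used_checkout_or_payment_link']].merge(
    rfm_payments_df[['merchant', 'recency', 'frequency', 'monetary']],
    on='merchant',
    how='left'
)
readable_targets_df.to_csv('target_merchants_unscaled.csv', index=False)
readable_targets_df.head()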