Skills: 🐍python, 💾 SQL, 👨💻 Machine Learning
This is an end-to-end Machine Learning project for Customer Clusterization, encompassing skills such as:

- SQL (mysql-connector): writing CRUD queries inside a Jupyter notebook and converting the results into a pandas DataFrame.

Our main goal here is to use KMeans clustering to define customer importance, based on two main variables: the total monetary amount spent (BRL) and the frequency of purchase per customer.
This is a Brazilian e-commerce public dataset of orders made at Olist Store. The dataset has information on 100k orders placed between 2016 and 2018 at multiple marketplaces in Brazil. Its features allow viewing an order from multiple dimensions: from order status, price, payment and freight performance to customer location, product attributes and, finally, reviews written by customers. This is real commercial data; it has been anonymised, and references to the companies and partners in the review text have been replaced with the names of Game of Thrones great houses.
This data was collected by Olist and can be found on the Kaggle website.
#import the connector function to run SQL queries from Jupyter:
from mysql.connector import connect
#import libraries:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
First, let's run the commands that will let Python execute CRUD (Create, Read, Update, Delete) operations on MySQL. We need to provide the connector with the Host, User, Password and Database parameters of a MySQL Workbench connection that has access to the related database.
Next, take a look into the data Schema, from which we will build our DataFrame:
#create connector and cursor:
conector = connect(host='localhost',
user='root',
password='Musica10',
database='ecommerce_db')
cursor = conector.cursor()
Next, create the SQL query and execute it, then use pandas to convert the result into a pandas DataFrame:
OBS: the query is written as a plain string, so we cannot tell whether it works until we execute it and transform the result into a Python object!
#write commandline:
Line1 ='''
SELECT sum(p.payment_value) as total_spent, o.customer_id, count(o.customer_id) as transactions, min(o.order_purchase_timestamp) as first_purchase, o.order_status
FROM orders o
LEFT JOIN payments p ON o.order_id = p.order_id
WHERE o.order_status IN ('delivered', 'shipped', 'invoiced', 'approved', 'created')
GROUP BY o.customer_id, o.order_status
ORDER BY total_spent DESC;
'''
cursor.execute(Line1)
#generate DF and close connectors:
df = cursor.fetchall()
cursor.close()
conector.close()
#convert to pd df
Pd_df = pd.DataFrame(df, columns=['total_spent', 'customer_id','transactions', 'first_purchase', 'order_status'])
#Check the data shape and dtypes:
print('data shape: (rows, cols):{}'.format(Pd_df.shape))
Pd_df.dtypes
data shape: (rows, cols):(96485, 5)
total_spent               object
customer_id               object
transactions               int64
first_purchase    datetime64[ns]
order_status              object
dtype: object
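As a side note, the same fetch-and-convert step can be collapsed into one call with pandas.read_sql, which also surfaces SQL errors immediately. A minimal sketch, assuming the same connection parameters and the Line1 query defined above (pandas may warn that only SQLAlchemy connectables are officially supported for non-SQLite DBAPI connections, but the call works):

#Sketch: read the query result straight into a DataFrame (optional alternative, not used below)
conector = connect(host='localhost', user='root',
                   password='Musica10', database='ecommerce_db')
try:
    Pd_df_alt = pd.read_sql(Line1, con=conector)
finally:
    conector.close()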
EDA stands for Exploratory Data Analysis, and consists of performing initial investigations on data so as to discover patterns, spot anomalies, test hypotheses and check assumptions with the help of summary statistics and graphical representations.
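For instance, a first pass of such summary statistics on our DataFrame could look like this (optional sketch, not part of the original pipeline):

#Optional first-pass EDA: summary statistics and missing-value counts
print(Pd_df.describe(include='all'))
print(Pd_df.isna().sum())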
Here we need to clean the data: fill empty values and look at how spending is distributed among different clients:
#Correct the total_spent dtype (it comes back from MySQL as object):
df = Pd_df.copy()
df['total_spent'] = df['total_spent'].astype('float64')
df['total_spent']
0        13664.08
1         7274.88
2         6929.31
3         6922.21
4         6726.66
           ...
96480       11.56
96481       10.89
96482       10.07
96483        9.59
96484         NaN
Name: total_spent, Length: 96485, dtype: float64
#quick view over DataFrame:
df.head(7)
| | total_spent | customer_id | transactions | first_purchase | order_status |
|---|---|---|---|---|---|
| 0 | 13664.08 | 1617b1357756262bfa56ab541c47bc16 | 1 | 2017-09-29 15:24:52 | delivered |
| 1 | 7274.88 | ec5b2ba62e574342386871631fafd3fc | 1 | 2018-07-15 14:49:44 | delivered |
| 2 | 6929.31 | c6e2731c5b391845f6800c97401a43a9 | 1 | 2017-02-12 20:37:36 | delivered |
| 3 | 6922.21 | f48d464a0baaea338cb25f816991ab1f | 1 | 2018-07-25 18:10:17 | delivered |
| 4 | 6726.66 | 3fd6777bbce08a352fddd04e4a7cc8f6 | 1 | 2017-05-24 18:14:34 | delivered |
| 5 | 6081.54 | 05455dfa7cd02f13d132aa7a6a9729c6 | 1 | 2017-11-24 11:03:35 | delivered |
| 6 | 4950.34 | df55c14d1476a9a3467f131269c2477f | 1 | 2017-04-01 15:58:40 | delivered |
plt.title('Amount spent by client distribution')
plt.xlim(right = 600)
sns.histplot(data = df['total_spent'])
[Histogram: 'Amount spent by client distribution' (total_spent vs. count, x-axis truncated at 600 BRL)]
Looking at the histogram, we can see that most clients do not spend much on the platform: the majority spent between 1 and 400 BRL over the whole time they have been registered.
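To put a number on that impression, a quick optional sanity check of a few quantiles of total_spent (not part of the pipeline itself):

#What fraction of clients spent 400 BRL or less?
print(df['total_spent'].quantile([0.25, 0.50, 0.75, 0.95]))
print('share of clients at or below 400 BRL:', (df['total_spent'] <= 400).mean())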
#Fill empty values in the dataframe
df['total_spent'] = df['total_spent'].fillna(0)
Last, rescale the number of transactions by how long each client has been in the system, creating a column that approximates the relative frequency of purchase over the dataset's time span:
#total time span of the dataset, in days
last_date = (df['first_purchase'].max() - df['first_purchase'].min())/np.timedelta64(1,'D')
#fraction of that span elapsed since each client's first purchase (0 = newest client, 1 = oldest)
dates_btwn = list(((df['first_purchase'].max() - df['first_purchase'])/np.timedelta64(1,'D'))/last_date)
#weight the number of purchases by how long the client has been in the system
df['normalized_transactions'] = df['transactions'] * dates_btwn
df['normalized_transactions'].describe()
count    96485.000000
mean         0.353013
std          0.290259
min          0.000000
25%          0.164398
50%          0.314123
75%          0.500727
max         16.338439
Name: normalized_transactions, dtype: float64
Then we inspect this feature the same way as the total_spent feature:
sns.histplot(data = df['normalized_transactions'])
[Histogram: distribution of normalized_transactions]
The long right tail means our data is full of outliers. We have to drop them to build a more reliable model:
df.rename(columns={'normalized_transactions': 'normalized_frequency'}, inplace = True)
df.drop(df[df['normalized_frequency'] > 1.18].index, inplace = True)
df_new = df.drop(df[df['total_spent'] > 400].index)
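The cut-offs used above (1.18 for the normalized frequency and 400 BRL for total spend) were chosen by eye from the histograms. As a sketch of an alternative, similar thresholds could be derived from the data itself, for example from a high quantile; this is only an illustration, not what the rest of the notebook uses:

#Hypothetical data-driven alternative: keep rows at or below the 99th percentile of each feature
freq_cap = df['normalized_frequency'].quantile(0.99)
spent_cap = df['total_spent'].quantile(0.99)
df_capped = df[(df['normalized_frequency'] <= freq_cap) & (df['total_spent'] <= spent_cap)].copy()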
The next step is to build the KMeans model, using the data fetched from MySQL:
K-means clustering is a method of vector quantization, originally from signal processing, that aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean (cluster centers or cluster centroid), serving as a prototype of the cluster.
K-means clustering minimizes within-cluster variances (squared Euclidean distances), but not regular Euclidean distances, which would be the more difficult Weber problem: the mean optimizes squared errors, whereas only the geometric median minimizes Euclidean distances.
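In practice this means that, for a fixed k, the algorithm looks for centroids that minimize the inertia: the sum of squared distances from each point to its assigned centroid. A minimal, self-contained sketch on toy data (X_toy and km are illustrative names) of what scikit-learn reports as inertia_:

#Toy example: compute the inertia by hand and compare with sklearn's value
import numpy as np
from sklearn.cluster import KMeans

X_toy = np.random.rand(100, 2)
km = KMeans(n_clusters=3, random_state=1).fit(X_toy)
manual_inertia = ((X_toy - km.cluster_centers_[km.labels_]) ** 2).sum()
print(manual_inertia, km.inertia_)   #the two values should match up to floating point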
from sklearn.cluster import KMeans
#Define the numeric features to be clustered (customer_id is an identifier, not a feature)
feature = df_new[['total_spent', 'normalized_frequency']]
#feature = np.array(feature).reshape(-1,1)
# Keep track of results
track = {}
labels = {}
# Find the optimal K
for k in range(2, 10):
    model = KMeans(n_clusters=k, random_state=1)
    model.fit(feature)
    track[k] = model.inertia_
    labels[k] = model.labels_
Use an Elbow Plot to verify the ideal number of clusters in the data.
#Plot results to find optimal k:
plt.figure(figsize=(12,6))
plt.title('Elbow plot')
plt.xlabel('K')
plt.ylabel('Sum Squared Distance')
plt.plot(track.keys(), track.values())
[Elbow plot: sum of squared distances vs. K]
df_new['labels'] = labels[5]
It is clear that beyond k = 5 the sum of squared distances starts to decrease much more gradually. Therefore, it is best to assume the optimal number of clusters is 5.
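To make the elbow choice a bit less visual, we can also look at the relative drop in inertia gained by each extra cluster; the drops should become noticeably smaller after the elbow. A quick check on the track dictionary built above:

#Relative improvement in inertia for each additional cluster
ks = sorted(track.keys())
for k_prev, k_next in zip(ks, ks[1:]):
    drop = 1 - track[k_next] / track[k_prev]
    print(f'k={k_prev} -> k={k_next}: inertia drops by {drop:.1%}')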
Last we will use a scatterplot to see how the data is clustered:
plt.figure(figsize=(12,6))
plt.title('Customer Clusterization by amount spent VS relative frequency of purchase')
sns.scatterplot(x = df_new['normalized_frequency'], y = df_new['total_spent'], hue = labels[5], palette = 'tab10')
[Scatterplot: total_spent vs. normalized_frequency, colored by cluster label]
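The mapping from numeric cluster labels to classes A–E below assumes we already know which cluster is the most valuable. A quick, optional way to check that is to profile each cluster's average spend and frequency before renaming:

#Profile each cluster before mapping it to a class (sanity check only)
print(df_new.groupby('labels')[['total_spent', 'normalized_frequency']].mean()
            .sort_values('total_spent', ascending=False))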
#map the numeric cluster labels to business classes:
df_new['labels'] = df_new['labels'].map({3: 'class A',
                                         1: 'class B',
                                         2: 'class C',
                                         0: 'class D',
                                         4: 'class E'})
conector = connect(host='localhost',
user='root',
password='Musica10',
database='ecommerce_db')
cursor = conector.cursor()
####
Line2 = '''
ALTER TABLE customers
ADD COLUMN labels VARCHAR(10);
'''
cursor.execute(Line2)
#loop over customers and write each label back to the database:
for cust_id, lab in zip(df_new['customer_id'], df_new['labels']):
    Line3 = '''
    UPDATE customers
    SET labels = %s
    WHERE customer_id = %s
    '''
    #parameterized query (cursor.executemany could batch these updates as well)
    cursor.execute(Line3, (lab, cust_id))
conector.commit()
####
cursor.close()
conector.close()
And now we have it! Using ML, we can continuously classify new customers and update the database.
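For example, to label a customer who signs up after this analysis, we could keep a KMeans model fitted with the chosen k = 5 and call predict on the new customer's two features. The model name and feature values below are made up purely for illustration:

#Sketch: assign a cluster to a new customer (hypothetical feature values)
final_model = KMeans(n_clusters=5, random_state=1)
final_model.fit(df_new[['total_spent', 'normalized_frequency']].values)

new_customer = [[250.0, 0.4]]   #[total_spent, normalized_frequency]
print(final_model.predict(new_customer))   #numeric cluster label for the new customer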