Overview of the Online Retail Dataset¶
The Online Retail dataset is a transactional dataset from a UK-based online retailer. It contains all transactions between 1 December 2010 and 9 December 2011. The dataset includes the following columns:¶
InvoiceNo¶
Invoice number. A unique six-digit number assigned to each transaction. If the number starts with 'C', it indicates a cancellation.
StockCode¶
Product code. A unique code for each product (a five-digit number, sometimes with a letter suffix, e.g. 85123A).
Description¶
Product description.
Quantity¶
Number of units of the product per transaction (negative for cancellations).
InvoiceDate¶
Date and time of the transaction.
UnitPrice¶
Unit price of the product in pounds sterling.
CustomerID¶
Customer number. A unique five-digit number for each customer.
Country¶
The country where the customer is located.
Steps for an Exploratory Data Analysis (EDA)¶
Data Import and Initial Inspection¶
Load the dataset into a pandas DataFrame, get an overview of the data structure (data types, number of rows/columns), and display the first and last rows.
Data Cleaning¶
Check for missing values and decide how to handle them. Remove duplicates. Convert data types (e.g., InvoiceDate to datetime). Handle negative or unusual values (e.g., negative Quantity).
Descriptive Statistics¶
Calculate metrics such as mean, median, standard deviation. Examine the distribution of numerical variables.
Data Visualization¶
Create charts like histograms, box plots, scatter plots. Visualize trends over time.
Feature Engineering¶
Create new variables, e.g., total price (TotalPrice = Quantity * UnitPrice). Extract time information from InvoiceDate (month, weekday, hour).
Gaining Insights¶
Summarize the key findings from the analysis. Identify patterns or anomalies.
1. Importing the necessary libraries¶
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
Explanation:¶
We import the necessary libraries for data analysis and visualization. %matplotlib inline ensures that the plots are displayed directly in the notebook.
2. Loading the Dataset¶
# Assuming the file is named 'OnlineRetail.xlsx' and is located in the current directory
df = pd.read_excel('OnlineRetail.xlsx')
3. Initial Data Inspection¶
# Overview of the data
df.head()
 | InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country |
---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom |
3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
# Information about the DataFrame
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   InvoiceNo    541909 non-null  object
 1   StockCode    541909 non-null  object
 2   Description  540455 non-null  object
 3   Quantity     541909 non-null  int64
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB
# Statistical metrics
df.describe()
 | Quantity | InvoiceDate | UnitPrice | CustomerID |
---|---|---|---|---|
count | 541909.000000 | 541909 | 541909.000000 | 406829.000000 |
mean | 9.552250 | 2011-07-04 13:34:57.156386048 | 4.611114 | 15287.690570 |
min | -80995.000000 | 2010-12-01 08:26:00 | -11062.060000 | 12346.000000 |
25% | 1.000000 | 2011-03-28 11:34:00 | 1.250000 | 13953.000000 |
50% | 3.000000 | 2011-07-19 17:17:00 | 2.080000 | 15152.000000 |
75% | 10.000000 | 2011-10-19 11:27:00 | 4.130000 | 16791.000000 |
max | 80995.000000 | 2011-12-09 12:50:00 | 38970.000000 | 18287.000000 |
std | 218.081158 | NaN | 96.759853 | 1713.600303 |
Explanation:¶
head() displays the first five rows, info() shows the columns, their data types and non-null counts, and describe() returns statistical metrics for the numerical and datetime columns.
4. Checking for Missing Values¶
# Number of missing values per column
df.isnull().sum()
InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64
Explanation:¶
Using isnull().sum(), we get the number of missing values in each column.
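To relate these counts to the dataset size, the share of missing values per column can also be computed (a small optional sketch using the same pandas API):
# Share of missing values per column, in percent
missing_share = df.isnull().mean().sort_values(ascending=False) * 100
print(missing_share.round(2))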
5. Handling Missing Values¶
# 'CustomerID' is missing in about 25% of the rows; we drop them here because the analyses below are customer-based
# (if you also need product-level analyses, keep an unfiltered copy of the DataFrame first)
df = df.dropna(subset=['CustomerID'])
Explanation:¶
We remove rows where 'CustomerID' is missing, as this is important for customer analyses.
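The data-cleaning step above also calls for removing duplicates; a minimal sketch:
# Optional: count and drop fully duplicated rows
print(df.duplicated().sum())
df = df.drop_duplicates()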
6. Converting Data Types¶
# Converting 'InvoiceDate' to datetime
df.loc[:, 'InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
Explanation:¶
We convert the 'InvoiceDate' column to the datetime format for time series analysis.
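The feature-engineering step listed above also mentions extracting time information from 'InvoiceDate'; a short sketch (the new column names are illustrative):
# Extract month, weekday, and hour from the invoice timestamp
df['Month'] = df['InvoiceDate'].dt.month
df['Weekday'] = df['InvoiceDate'].dt.day_name()
df['Hour'] = df['InvoiceDate'].dt.hour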
7. Handling Negative Values¶
# Checking for negative quantities
df[df['Quantity'] < 0]
 | InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country |
---|---|---|---|---|---|---|---|---|
141 | C536379 | D | Discount | -1 | 2010-12-01 09:41:00 | 27.50 | 14527.0 | United Kingdom |
154 | C536383 | 35004C | SET OF 3 COLOURED FLYING DUCKS | -1 | 2010-12-01 09:49:00 | 4.65 | 15311.0 | United Kingdom |
235 | C536391 | 22556 | PLASTERS IN TIN CIRCUS PARADE | -12 | 2010-12-01 10:24:00 | 1.65 | 17548.0 | United Kingdom |
236 | C536391 | 21984 | PACK OF 12 PINK PAISLEY TISSUES | -24 | 2010-12-01 10:24:00 | 0.29 | 17548.0 | United Kingdom |
237 | C536391 | 21983 | PACK OF 12 BLUE PAISLEY TISSUES | -24 | 2010-12-01 10:24:00 | 0.29 | 17548.0 | United Kingdom |
... | ... | ... | ... | ... | ... | ... | ... | ... |
540449 | C581490 | 23144 | ZINC T-LIGHT HOLDER STARS SMALL | -11 | 2011-12-09 09:57:00 | 0.83 | 14397.0 | United Kingdom |
541541 | C581499 | M | Manual | -1 | 2011-12-09 10:28:00 | 224.69 | 15498.0 | United Kingdom |
541715 | C581568 | 21258 | VICTORIAN SEWING BOX LARGE | -5 | 2011-12-09 11:57:00 | 10.95 | 15311.0 | United Kingdom |
541716 | C581569 | 84978 | HANGING HEART JAR T-LIGHT HOLDER | -1 | 2011-12-09 11:58:00 | 1.25 | 17315.0 | United Kingdom |
541717 | C581569 | 20979 | 36 PENCILS TUBE RED RETROSPOT | -5 | 2011-12-09 11:58:00 | 1.25 | 17315.0 | United Kingdom |
8905 rows × 8 columns
Explanation:¶
Negative values in 'Quantity' may indicate cancellations. We can consider them separately or exclude them from specific analyses.
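As a cross-check, cancellations can also be identified by the leading 'C' in 'InvoiceNo' (see the dataset description above); a quick sketch:
# Invoices whose number starts with 'C' are cancellations
c_invoices = df[df['InvoiceNo'].astype(str).str.startswith('C')]
print(len(c_invoices))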
8. Calculating Total Price¶
# Create a new column 'TotalPrice'
df.loc[:, 'TotalPrice'] = df['Quantity'] * df['UnitPrice']
Explanation:¶
We create a new column that indicates the total price for each row.
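As a quick sanity check, the overall revenue in the cleaned data can be summed directly:
# Total revenue over the whole period (cancellations reduce this sum)
print(f"Total revenue: {df['TotalPrice'].sum():,.2f}")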
9. Data Visualization¶
9.1. Revenue by Country¶
# Total revenue by country
sales_country = df.groupby('Country')['TotalPrice'].sum().sort_values(ascending=False)
# Visualization
plt.figure(figsize=(12,6))
sales_country.plot(kind='bar')
plt.title('Total revenue by country')
plt.ylabel('Revenue')
plt.show()
Explanation:¶
We group the revenue by country and visualize it in a bar chart.
9.2. Top 10 Products by Sales Quantity¶
# Top 10 products
top_products = df.groupby('Description')['Quantity'].sum().sort_values(ascending=False).head(10)
# Visualization
plt.figure(figsize=(12,6))
top_products.plot(kind='bar')
plt.title('Top 10 Products by Sales Quantity')
plt.ylabel('Sales Quantity')
plt.show()
Explanation:¶
We identify the top-selling products and visualize them.
9.3. Revenue Over Time¶
# Revenue per month
df.set_index('InvoiceDate', inplace=True)
monthly_sales = df['TotalPrice'].resample('M').sum()  # 'M' = month-end frequency; newer pandas versions prefer the alias 'ME'
# Visualization
plt.figure(figsize=(12,6))
monthly_sales.plot()
plt.title('Monthly revenue')
plt.ylabel('Revenue')
plt.show()
Explanation:¶
We resample the data on a monthly basis to observe the revenue trend over time.
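An equivalent monthly aggregation, assuming the datetime index set above, can be written with pd.Grouper instead of resample:
# Same monthly aggregation via groupby on the datetime index
monthly_sales_alt = df.groupby(pd.Grouper(freq='M'))['TotalPrice'].sum()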
10. Customer Analysis¶
10.1. Number of Purchases per Customer¶
# Purchases per customer
purchases_per_customer = df.groupby('CustomerID')['InvoiceNo'].nunique().sort_values(ascending=False)
# Visualization of the top 10 customers
purchases_per_customer.head(10).plot(kind='bar')
plt.title('Top 10 customers by number of purchases')
plt.ylabel('Number of purchases')
plt.show()
Explanation:¶
We analyze which customers made the most purchases.
11. Summary of Findings¶
After completing the EDA, summarize the key findings. For this dataset, the analysis typically shows:
Main Markets:¶
Most sales occur in the United Kingdom.
Sales Trends:¶
There are seasonal peaks in November and December.
Top Products:¶
Certain products sell significantly better than others.
Customer Behavior:¶
Some customers make frequent purchases, indicating customer loyalty.
12. Customer Lifetime Value (CLV)¶
# Calculating revenue per customer
clv = df.groupby('CustomerID')['TotalPrice'].sum().sort_values(ascending=False)
# Visualization of the top 10 customers with the highest customer value
clv.head(10).plot(kind='bar', figsize=(12,6))
plt.title('Top 10 customers by Lifetime Value')
plt.ylabel('Total revenue')
plt.show()
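Beyond the top 10, the full distribution shows how strongly customer value is skewed; a small optional sketch:
# Distribution of customer lifetime value (log-scaled y-axis because of the heavy tail)
clv.plot(kind='hist', bins=50, logy=True, figsize=(12,6))
plt.title('Distribution of customer lifetime value')
plt.xlabel('Total revenue per customer')
plt.show()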
13. Customer Retention and Repeat Purchases¶
# If 'InvoiceDate' is still the index (from the resampling step above), turn it back into a column
if df.index.name == 'InvoiceDate':
    df.reset_index(inplace=True)
# Number of purchases per customer
repeat_customers = df.groupby('CustomerID')['InvoiceNo'].nunique()
# Calculate the average time between purchases per customer
# (use one timestamp per invoice so that line items of the same invoice do not count as 0-day gaps)
invoice_dates = df.groupby(['CustomerID', 'InvoiceNo'])['InvoiceDate'].min().reset_index()
invoice_dates = invoice_dates.sort_values(['CustomerID', 'InvoiceDate'])
invoice_dates['days_between_purchases'] = invoice_dates.groupby('CustomerID')['InvoiceDate'].diff().dt.days
avg_days_between_purchases = invoice_dates.groupby('CustomerID')['days_between_purchases'].mean()
# Visualization of the top 10 customers with the most purchases
repeat_customers.sort_values(ascending=False).head(10).plot(kind='bar', figsize=(12,6))
plt.title('Top 10 customers by number of purchases')
plt.ylabel('Number of purchases')
plt.show()
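The average repurchase time computed above can also be visualized; a quick sketch of its distribution:
# Distribution of the average days between purchases (customers with a single invoice are NaN and dropped)
avg_days_between_purchases.dropna().plot(kind='hist', bins=50, figsize=(12,6))
plt.title('Average days between purchases per customer')
plt.xlabel('Days')
plt.show()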
14. Customer Segmentation (Clustering)¶
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Data for cluster analysis (e.g., total revenue and number of purchases per customer)
customer_data = df.groupby('CustomerID').agg({
'TotalPrice': 'sum',
'InvoiceNo': 'nunique'
}).rename(columns={'InvoiceNo': 'NumberOfPurchases'})
# Normalize data
scaler = StandardScaler()
customer_data_scaled = scaler.fit_transform(customer_data)
# K-Means clustering
import os
os.environ["LOKY_MAX_CPU_COUNT"] = "2"  # optional: limit the CPU cores used by joblib/loky (e.g. to 2)
kmeans = KMeans(n_clusters=3, random_state=0, n_init=10)
customer_data['Cluster'] = kmeans.fit_predict(customer_data_scaled)
# Visualization of the clusters
plt.scatter(customer_data['TotalPrice'], customer_data['NumberOfPurchases'], c=customer_data['Cluster'], cmap='viridis')
plt.title('Customer segments based on revenue and number of purchases')
plt.xlabel('Total revenue')
plt.ylabel('Number of purchases')
plt.show()
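The choice of three clusters is somewhat arbitrary; an elbow plot over the scaled features is one common way to sanity-check k (a sketch, not a definitive criterion):
# Inertia for k = 1..10 on the scaled customer features
inertias = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, random_state=0, n_init=10)
    inertias.append(km.fit(customer_data_scaled).inertia_)
plt.plot(range(1, 11), inertias, marker='o')
plt.title('Elbow plot for choosing the number of clusters')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia')
plt.show()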
15. Product and Assortment Optimization¶
# Products with the highest revenue
product_sales = df.groupby('Description')['TotalPrice'].sum().sort_values(ascending=False)
# Products with the lowest net sales quantity (cancellations are still included, so sums can be negative)
low_sales_products = df.groupby('Description')['Quantity'].sum().sort_values()
# Visualization of the top 10 highest-revenue products
product_sales.head(10).plot(kind='bar', figsize=(12,6))
plt.title('Top 10 products by total revenue')
plt.ylabel('Revenue')
plt.show()
# Visualization of the 10 lowest-selling products
low_sales_products.head(10).plot(kind='bar', figsize=(12,6))
plt.title('10 products with the lowest sales quantity')
plt.ylabel('Quantity sold')
plt.show()
16. Cancellation Analysis¶
# Analysis of cancellations (negative quantities)
cancellations = df[df['Quantity'] < 0]
# Top products with the most cancellations
top_cancellations = cancellations.groupby('Description')['Quantity'].sum().sort_values()
# Visualization of the top 10 products with the most cancellations
top_cancellations.head(10).plot(kind='bar', figsize=(12,6))
plt.title('Top 10 products with the most cancellations')
plt.ylabel('Cancelled quantities')
plt.show()
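To quantify how common cancellations are, the share of cancelled line items can be computed directly:
# Share of cancelled line items among all rows
cancellation_rate = len(cancellations) / len(df) * 100
print(f"Cancelled line items: {cancellation_rate:.2f}% of all rows")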
17. International Expansion¶
# Revenue by country
sales_country = df.groupby('Country')['TotalPrice'].sum().sort_values(ascending=False)
# Visualization of the top 10 countries by revenue
sales_country.head(10).plot(kind='bar', figsize=(12,6))
plt.title('Top 10 countries by revenue')
plt.ylabel('Revenue')
plt.show()
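Since the United Kingdom dominates total revenue (see the findings above), a second view without it makes the other markets easier to compare:
# Top 10 countries by revenue, excluding the home market
sales_country.drop('United Kingdom').head(10).plot(kind='bar', figsize=(12,6))
plt.title('Top 10 countries by revenue (excluding the United Kingdom)')
plt.ylabel('Revenue')
plt.show()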