import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_excel("Online Retail.xlsx")

df.head(5)

df.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

df.count()

InvoiceNo      541909
StockCode      541909
Description    540455
Quantity       541909
InvoiceDate    541909
UnitPrice      541909
CustomerID     406829
Country        541909
dtype: int64

df.shape

(541909, 8)

df.nunique()

InvoiceNo      25900
StockCode       4070
Description     4223
Quantity         722
InvoiceDate    23260
UnitPrice       1630
CustomerID      4372
Country           38
dtype: int64

duplicate_rows_df = df[df.duplicated()]
print("number of duplicate rows: ", duplicate_rows_df.shape)

number of duplicate rows:  (5268, 8)

df = df.drop_duplicates()

df.shape

(536641, 8)

# Create new columns
df['week'] = df['InvoiceDate'].dt.isocalendar().week  # week number (48)
df['weekday'] = df['InvoiceDate'].dt.day_name() # day of the week ('Tuesday')
df['month'] = df['InvoiceDate'].dt.strftime('%m') # month number (11)

df['week_year'] = df['InvoiceDate'].dt.strftime('%Y-W%V') # (2020-W12)
df['month_year'] = df['InvoiceDate'].dt.strftime('%Y-%m') # (2020-08)
df['quarter_year'] = df['InvoiceDate'].dt.to_period('Q').dt.strftime('%Y-Q%q') #(2020-Q3)
df['year'] = df['InvoiceDate'].dt.strftime('%Y') # (2020)

df['Sales'] = df['Quantity'] * df['UnitPrice']  # transaction sales

print(df.isnull().sum())

InvoiceNo            0
StockCode            0
Description       1454
Quantity             0
InvoiceDate          0
UnitPrice            0
CustomerID      135037
Country              0
week                 0
weekday              0
month                0
week_year            0
month_year           0
quarter_year         0
year                 0
Sales                0
dtype: int64

## Drop NaN
# df = df.dropna()

df['Country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

df.describe()

# Calculate total count of transactions for each weekday
df[['weekday','Quantity', 'Sales']].groupby(['weekday']).agg(['sum', 'mean'])

# Select only numeric columns
numeric_df = df.select_dtypes(include='number')

# Plot the correlation heatmap
plt.figure(figsize=(10, 5))
heatmap = sns.heatmap(numeric_df.corr(), vmin=-1, vmax=1, annot=True, cmap=sns.color_palette("crest", as_cmap=True))
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':14}, pad=12)
plt.show()

plt.hist(df['Sales'])
plt.show()

#Scatter plot of Price and Quantity
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(df['UnitPrice'], df['Quantity'])
ax.set_xlabel('Price')
ax.set_ylabel('Quantity')
plt.show()

df['month'] = df['month'].astype(str)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 536641 entries, 0 to 541908
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   InvoiceNo     536641 non-null  object        
 1   StockCode     536641 non-null  object        
 2   Description   535187 non-null  object        
 3   Quantity      536641 non-null  int64         
 4   InvoiceDate   536641 non-null  datetime64[ns]
 5   UnitPrice     536641 non-null  float64       
 6   CustomerID    401604 non-null  float64       
 7   Country       536641 non-null  object        
 8   week          536641 non-null  UInt32        
 9   weekday       536641 non-null  object        
 10  month         536641 non-null  object        
 11  week_year     536641 non-null  object        
 12  month_year    536641 non-null  object        
 13  quarter_year  536641 non-null  object        
 14  year          536641 non-null  object        
 15  Sales         536641 non-null  float64       
dtypes: UInt32(1), datetime64[ns](1), float64(3), int64(1), object(10)
memory usage: 68.1+ MB

df.head(5)

# Group data by month and sum
df_by_month = df.groupby(['month'])[['Quantity', 'UnitPrice', 'CustomerID','week','Sales']].sum().reset_index()

df_by_month.head()

plt.figure(figsize = (15, 5))
plt.bar(x = df_by_month['month'], height = df_by_month['Quantity'])
# addlabels(df_by_month['quarter'], df_by_month['Quantity'], df_by_month['number_of_strikes_formatted'])
plt.plot()
plt.xlabel('Month')
plt.ylabel('Quantity')
plt.title('Quantity of items sold by month')
plt.show()

# Group data by week and sum
df_by_week = df.groupby(['week'])[['Quantity', 'UnitPrice', 'CustomerID','Sales']].sum().reset_index()
df_by_week.head()

plt.figure(figsize = (15, 5))
plt.bar(x = df_by_week['week'], height = df_by_week['Quantity'])
# addlabels(df_by_month['quarter'], df_by_month['Quantity'], df_by_month['number_of_strikes_formatted'])
plt.plot()
plt.xlabel('Week')
plt.ylabel('Quantity')
plt.title('Quantity of items sold by week')
plt.show()

# Group data by day and sum
df_by_dayofweek = df.groupby(['weekday'])[['Quantity', 'UnitPrice', 'CustomerID','week','Sales']].sum().reset_index()
df_by_dayofweek.head()

plt.figure(figsize = (15, 5))
plt.bar(x = df_by_dayofweek['weekday'], height = df_by_dayofweek['Quantity'])
# addlabels(df_by_month['quarter'], df_by_month['Quantity'], df_by_month['number_of_strikes_formatted'])
plt.plot()
plt.xlabel('Weekday')
plt.ylabel('Quantity')
plt.title('Quantity of items sold by weekday')
plt.show()

# Group data
df_by_week_year = df.groupby(['week_year', 'year', 'week'])[['Quantity', 'UnitPrice', 'CustomerID','Sales']].sum().reset_index()
df_by_week_year.head()

plt.figure(figsize = (15, 5))
plt.bar(x = df_by_week_year['week_year'], height = df_by_week_year['Quantity'])
# addlabels(df_by_month['quarter'], df_by_month['Quantity'], df_by_month['number_of_strikes_formatted'])
plt.plot()
plt.xlabel('Week')
plt.ylabel('Quantity')
plt.title('Quantity of items sold by week')
plt.xticks(rotation = 45, fontsize = 8) # Rotate x-axis labels and decrease font size
plt.show()

plt.figure(figsize = (15, 5))
p = sns.barplot(
    data = df_by_week_year,
    x = 'week',
    y = 'Quantity',
    hue = 'year')
for b in p.patches:
    p.annotate(str(round(b.get_height()/1000000, 1))+'M', 
                   (b.get_x() + b.get_width() / 2., b.get_height() + 1.2e6), 
                   ha = 'center', va = 'bottom', 
                   xytext = (0, -12), 
                   textcoords = 'offset points')
p.set_xticklabels(p.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.xlabel("week")
plt.ylabel("Quantity")
plt.title("Quantity per week")
plt.show()

/var/folders/74/28twp8_x1y1fghxndbf6mnxr0000gn/T/ipykernel_84815/34411050.py:13: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  p.set_xticklabels(p.get_xticklabels(), rotation=45, horizontalalignment='right')

# Group data by month and sum
df_by_country = df.groupby(['Country'])[['Quantity', 'UnitPrice', 'CustomerID','week','Sales']].sum().reset_index()
df_by_country.head()

plt.figure(figsize = (15, 5))
plt.bar(x = df_by_country['Country'], height = df_by_country['Quantity'])
# addlabels(df_by_month['quarter'], df_by_month['Quantity'], df_by_month['number_of_strikes_formatted'])
plt.plot()
plt.xlabel('country')
plt.ylabel('Quantity')
plt.title('Quantity of items sold by country')
plt.xticks(rotation = 45, fontsize = 8) # Rotate x-axis labels and decrease font size
plt.show()

df_by_country_sorted = df_by_country.sort_values(by='Quantity', ascending=False)
print(df_by_country_sorted.head(5))

           Country  Quantity    UnitPrice    CustomerID      week        Sales
36  United Kingdom   4250328  2233247.604  5.544907e+09  15089038  8167128.184
24     Netherlands    200128     6492.550  3.419054e+07     65340   284661.540
10            EIRE    142495    48400.370  1.102464e+08    256550   262993.380
14         Germany    117341    37633.440  1.198819e+08    280589   221509.470
13          France    110438    42985.980  1.074419e+08    260710   197317.110

plt.figure(figsize = (15, 5))
plt.bar(x = df_by_country_sorted['Country'], height = df_by_country_sorted['Quantity'])
# addlabels(df_by_month['quarter'], df_by_month['Quantity'], df_by_month['number_of_strikes_formatted'])
plt.plot()
plt.xlabel('country')
plt.ylabel('Quantity')
plt.title('Quantity of items sold by country')
plt.xticks(rotation = 90, fontsize = 8) # Rotate x-axis labels and decrease font size
plt.show()

# Pareto of top products
df['StockCode'].value_counts().nlargest(40).plot(kind='bar', figsize=(10,5))
plt.title("Number of Product Transactions")
plt.ylabel('Number of Transactions')
plt.xlabel('StockCode');

df['StockCode'].value_counts()[:20].rename_axis('Product unique_values').reset_index(name='counts').style.background_gradient()

# Define order of days for the plot
weekday_order = ['Monday','Tuesday', 'Wednesday', 'Thursday','Friday','Saturday','Sunday']

# Create boxplots of strike counts for each day of week
g = sns.boxplot(data=df, 
            x='weekday',
            y='Quantity', 
            order=weekday_order, 
            showfliers=False 
            );
g.set_title('Quantity per weekday');

sns.boxplot(x=df['Quantity'])

<Axes: xlabel='Quantity'>

sns.boxplot(x=df['UnitPrice'])

<Axes: xlabel='UnitPrice'>

	InvoiceNo	StockCode	Description	Quantity	InvoiceDate	UnitPrice	CustomerID	Country
0	536365	85123A	WHITE HANGING HEART T-LIGHT HOLDER	6	2010-12-01 08:26:00	2.55	17850.0	United Kingdom
1	536365	71053	WHITE METAL LANTERN	6	2010-12-01 08:26:00	3.39	17850.0	United Kingdom
2	536365	84406B	CREAM CUPID HEARTS COAT HANGER	8	2010-12-01 08:26:00	2.75	17850.0	United Kingdom
3	536365	84029G	KNITTED UNION FLAG HOT WATER BOTTLE	6	2010-12-01 08:26:00	3.39	17850.0	United Kingdom
4	536365	84029E	RED WOOLLY HOTTIE WHITE HEART.	6	2010-12-01 08:26:00	3.39	17850.0	United Kingdom

	Quantity	InvoiceDate	UnitPrice	CustomerID	week	Sales
count	536641.000000	536641	536641.000000	401604.000000	536641.0	536641.000000
mean	9.620029	2011-07-04 08:57:06.087421952	4.632656	15281.160818	30.694116	18.123861
min	-80995.000000	2010-12-01 08:26:00	-11062.060000	12346.000000	1.0	-168469.600000
25%	1.000000	2011-03-28 10:52:00	1.250000	13939.000000	18.0	3.750000
50%	3.000000	2011-07-19 14:04:00	2.080000	15145.000000	34.0	9.870000
75%	10.000000	2011-10-18 17:05:00	4.130000	16784.000000	45.0	17.400000
max	80995.000000	2011-12-09 12:50:00	38970.000000	18287.000000	51.0	168469.600000
std	219.130156	NaN	97.233118	1714.006089	15.145369	380.656263

	Quantity		Sales
	sum	mean	sum	mean
weekday
Friday	793106	9.723607	1537958.621	18.855620
Monday	813178	8.610981	1584895.301	16.782923
Sunday	462263	7.360173	798659.461	12.716292
Thursday	1165600	11.310356	2108701.530	20.461706
Tuesday	961133	9.510142	1965703.611	19.450087
Wednesday	967222	10.320888	1730088.430	18.461169

	InvoiceNo	StockCode	Description	Quantity	InvoiceDate	UnitPrice	CustomerID	Country	week	weekday	month	week_year	month_year	quarter_year	year	Sales
0	536365	85123A	WHITE HANGING HEART T-LIGHT HOLDER	6	2010-12-01 08:26:00	2.55	17850.0	United Kingdom	48	Wednesday	12	2010-W48	2010-12	2010-Q4	2010	15.30
1	536365	71053	WHITE METAL LANTERN	6	2010-12-01 08:26:00	3.39	17850.0	United Kingdom	48	Wednesday	12	2010-W48	2010-12	2010-Q4	2010	20.34
2	536365	84406B	CREAM CUPID HEARTS COAT HANGER	8	2010-12-01 08:26:00	2.75	17850.0	United Kingdom	48	Wednesday	12	2010-W48	2010-12	2010-Q4	2010	22.00
3	536365	84029G	KNITTED UNION FLAG HOT WATER BOTTLE	6	2010-12-01 08:26:00	3.39	17850.0	United Kingdom	48	Wednesday	12	2010-W48	2010-12	2010-Q4	2010	20.34
4	536365	84029E	RED WOOLLY HOTTIE WHITE HEART.	6	2010-12-01 08:26:00	3.39	17850.0	United Kingdom	48	Wednesday	12	2010-W48	2010-12	2010-Q4	2010	20.34

	month	Quantity	UnitPrice	CustomerID	week	Sales
0	01	308281	172003.690	328108845.0	91319	558448.560
1	02	277374	126841.950	308233342.0	186847	497026.410
2	03	351165	170778.300	417520175.0	404444	682013.980
3	04	288237	128689.461	350844921.0	450484	492367.841
4	05	379652	190058.090	436143875.0	718209	722094.100

Portfolio Project: Online Retail Exploratory Data Analysis with Python¶

Overview¶

Project Objectives¶

1-Load the dataset into a Pandas DataFrame and display the first few rows to get an overview of the data.¶

2-Perform data cleaning by handling missing values, if any, and removing any redundant or unnecessary columns.¶

Check for duplicated rows¶

Missing data¶

Validate country list¶

3-Explore the basic statistics of the dataset, including measures of central tendency and dispersion.¶

4-Data Visualizations¶

5-Analyze the sales trends over time. Identify the busiest months and days of the week in terms of sales.¶

6-Explore the top-selling products and countries based on the quantity sold.¶

7-Identify any outliers or anomalies in the dataset and discuss their potential impact on the analysis.¶

8-Conclusions¶

	week	Quantity	UnitPrice	CustomerID	Sales
0	1	73400	60759.66	80429840.0	93493.71
1	2	85461	34523.48	72357440.0	190624.87
2	3	67842	38332.19	72651092.0	133476.74
3	4	69019	32203.63	86679824.0	118500.24
4	5	67532	28560.37	78620603.0	123498.86

	weekday	Quantity	UnitPrice	CustomerID	week	Sales
0	Friday	793106	394996.251	8.410658e+08	2526903	1537958.621
1	Monday	813178	533925.471	1.003218e+09	2907417	1584895.301
2	Sunday	462263	181971.301	9.526538e+08	1987460	798659.461
3	Thursday	1165600	424344.820	1.244651e+09	3140260	2108701.530
4	Tuesday	961133	519550.191	1.028613e+09	3036554	1965703.611

	week_year	year	week	Quantity	UnitPrice	CustomerID	Sales
0	2010-W48	2010	48	78647	38240.08	118958432.0	181088.89
1	2010-W49	2010	49	117965	143755.50	150595338.0	269518.44
2	2010-W50	2010	50	105191	50909.41	110543387.0	206397.01
3	2010-W51	2010	51	39235	26329.70	28635805.0	89719.27
4	2011-W01	2011	1	73400	60759.66	80429840.0	93493.71

	Country	Quantity	UnitPrice	CustomerID	week	Sales
0	Australia	83643	4048.00	15680571.0	30709	137009.77
1	Austria	4827	1701.52	5021102.0	12568	10154.32
2	Bahrain	260	86.57	210027.0	379	548.40
3	Belgium	23152	7540.13	25718288.0	62106	40910.96
4	Brazil	356	142.60	408608.0	480	1143.60

	Product unique_values	counts
0	85123A	2301
1	22423	2192
2	85099B	2156
3	47566	1720
4	20725	1626
5	84879	1489
6	22720	1469
7	22197	1468
8	21212	1367
9	22383	1328
10	20727	1323
11	22457	1272
12	23203	1260
13	POST	1256
14	22386	1245
15	22469	1232
16	22960	1221
17	21931	1211
18	22086	1194
19	22411	1192