A comprehensive analysis of market sales

TBD
R
WORK-IN-PROGRESS
time series
Author
Published

November 19, 2024

WORK IN PROGRESS

Time Series Analysis

We’ll be using a sales dataset from a supermakert chain source.

Code
dataset <- readr::read_csv("data.csv")
head(dataset) |> 
  kableExtra::kable("html") 
Invoice ID Branch City Customer type Gender Product line Unit price Quantity Tax 5% Total Date Time Payment cogs gross margin percentage gross income Rating
750-67-8428 A Yangon Member Female Health and beauty 74.69 7 26.1415 548.9715 1/5/2019 13:08:00 Ewallet 522.83 4.761905 26.1415 9.1
226-31-3081 C Naypyitaw Normal Female Electronic accessories 15.28 5 3.8200 80.2200 3/8/2019 10:29:00 Cash 76.40 4.761905 3.8200 9.6
631-41-3108 A Yangon Normal Male Home and lifestyle 46.33 7 16.2155 340.5255 3/3/2019 13:23:00 Credit card 324.31 4.761905 16.2155 7.4
123-19-1176 A Yangon Member Male Health and beauty 58.22 8 23.2880 489.0480 1/27/2019 20:33:00 Ewallet 465.76 4.761905 23.2880 8.4
373-73-7910 A Yangon Normal Male Sports and travel 86.31 7 30.2085 634.3785 2/8/2019 10:37:00 Ewallet 604.17 4.761905 30.2085 5.3
699-14-3026 C Naypyitaw Normal Male Electronic accessories 85.39 7 29.8865 627.6165 3/25/2019 18:30:00 Ewallet 597.73 4.761905 29.8865 4.1

Let’s have a look at what the data looks like

Code
dplyr::glimpse(dataset)
Rows: 1,000
Columns: 17
$ `Invoice ID`              <chr> "750-67-8428", "226-31-3081", "631-41-3108",…
$ Branch                    <chr> "A", "C", "A", "A", "A", "C", "A", "C", "A",…
$ City                      <chr> "Yangon", "Naypyitaw", "Yangon", "Yangon", "…
$ `Customer type`           <chr> "Member", "Normal", "Normal", "Member", "Nor…
$ Gender                    <chr> "Female", "Female", "Male", "Male", "Male", …
$ `Product line`            <chr> "Health and beauty", "Electronic accessories…
$ `Unit price`              <dbl> 74.69, 15.28, 46.33, 58.22, 86.31, 85.39, 68…
$ Quantity                  <dbl> 7, 5, 7, 8, 7, 7, 6, 10, 2, 3, 4, 4, 5, 10, …
$ `Tax 5%`                  <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085, …
$ Total                     <dbl> 548.9715, 80.2200, 340.5255, 489.0480, 634.3…
$ Date                      <chr> "1/5/2019", "3/8/2019", "3/3/2019", "1/27/20…
$ Time                      <time> 13:08:00, 10:29:00, 13:23:00, 20:33:00, 10:…
$ Payment                   <chr> "Ewallet", "Cash", "Credit card", "Ewallet",…
$ cogs                      <dbl> 522.83, 76.40, 324.31, 465.76, 604.17, 597.7…
$ `gross margin percentage` <dbl> 4.761905, 4.761905, 4.761905, 4.761905, 4.76…
$ `gross income`            <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085, …
$ Rating                    <dbl> 9.1, 9.6, 7.4, 8.4, 5.3, 4.1, 5.8, 8.0, 7.2,…

What are these folks buying?

Let’s check what are people buying…

Code
library(ggplot2)
library(cowplot)
library(dplyr)
library(hrbrthemes)
source("helper_functions.R")

# Let's aggregate the data
data_product <- dataset |> 
  mutate(
    formatted_date = lubridate::mdy(Date)
  ) |> 
  group_by(Branch, formatted_date, `Product line`) |>
  summarise(
    n_products_sold = sum(Quantity),
    .groups = "drop"
  )

# A function that we can use to plot the sales for the products at any branch
stream_plot <- function(data, branch) {
  data |> 
  filter(Branch == branch) |> 
  ggplot(aes(x = formatted_date, y = n_products_sold, fill = `Product line`)) +
  geom_area() +
  scale_fill_manual(values= c("#6d2f20", "#df7e66", "#e09351", "#edc775", "#94b594", "#224b5e")) +
  theme_ipsum() +
  theme(legend.position = "none",
        axis.text.x = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank()
        ) +
  labs(x = "", y = "") +
  ylim(0, 65)
}

# Lets create the three individual plots
plot_a <- stream_plot(data_product, "A") + 
      theme(legend.position = "none")
plot_b <- stream_plot(data_product, "B")+ 
      theme(legend.position = "none")
plot_c <- stream_plot(data_product, "C") +
  theme(
    legend.position = c(0.52, -0.25),
    legend.direction = "horizontal",
    legend.text = element_blank(),
    legend.title = element_blank(),
    legend.key.height = unit(0.75, "cm"),
    legend.key.width = unit(3.1, "cm")
  ) +
  guides(fill = guide_legend(nrow = 1)) 
  
# Putting everything together
ggdraw(plot_grid(plot_a, plot_b, plot_c, ncol=1)) +
  theme(
    plot.background = element_rect(fill="#fbf7f0", color="#fbf7f0"),
    plot.margin = margin(50, 30, 10)
  ) +
  draw_text(text = "# Products sold daily", x=0.5, y=1.06, fontface="bold", size= 30, family=font1, color="#224b5e") +
  draw_text(text="Jan", x=0.15, y=0.98, size=15, family=font2, color="#224b5e") +
  draw_text(text="Feb", x=0.4, y=0.98, size=15, family=font2, color="#224b5e") +
  draw_text(text="Mar", x=0.65, y=0.98, size=15, family=font2, color="#224b5e") +
  draw_text(text="Apr", x=0.9, y=0.98, size=15, family=font2, color="#224b5e") +
  draw_text(text="Electronics", x=0.11, y=0.035, size=12, family=font2, fontface="bold", color="#fbf7f0") +
  draw_text(text="Fashion", x=0.28, y=0.035, size=12, family=font2, fontface="bold", color="#fbf7f0") +
  draw_text(text="Food", x=0.45, y=0.035, size=12, family=font2, fontface="bold", color="#fbf7f0") +
  draw_text(text="Health", x=0.62, y=0.035, size=12, family=font2, fontface="bold", color="#fbf7f0") +
  draw_text(text="Home", x=0.79, y=0.035, size=12, family=font2, fontface="bold", color="#fbf7f0") +
  draw_text(text="Sports", x=0.97, y=0.035, size=12, family=font2, fontface="bold", color="#fbf7f0") +
  draw_text(text="Branch A", x=0.96, y=0.84, size=20, family=font1, hjust=0, color="#224b5e") +
  draw_text(text="Branch B", x=0.96, y=0.505, size=20, family=font1, hjust=0, color="#224b5e") +
  draw_text(text="Branch C", x=0.96, y=0.17, size=20, family=font1, hjust=0, color="#224b5e") 

It can be also interesting to see what are the products being bought between genders. Let’s check first the number of males and females in each branch.

Code
dataset |> 
  group_by(Branch, Gender) |> 
  summarise(n = n(), .groups = "drop") |> 
  group_by(Branch) |> 
  arrange(desc(Gender)) |> 
  mutate(proportion = paste0(round(n / sum(n) *100, 1), "%")) |> 
  mutate(ypos = cumsum(n) - 0.5*n ) |> 
  ggplot(aes(x = Branch, y = n, fill = Gender)) +
  geom_col() +
  scale_fill_manual(values= c("#8bdcc9",  "#fe957f")) +
  custom_theme() +
  geom_text(aes(y = ypos, label = proportion), color = "white", size = 5) +
  labs(y = "", title = "Gender summary per branch")

Code
data_product_gender <- dataset |> 
  mutate(
    date = lubridate::mdy(Date)
  ) |> 
  group_by(Branch, `Product line`, Gender) |>
  summarise(
    n_products_sold = sum(Quantity),
    .groups = "drop"
  )

Daily sales

Code
library(dplyr)
library(lubridate)

summary <- dataset |> 
  mutate(
    formatted_date = lubridate::mdy(Date)
  ) |> 
  group_by(Branch, formatted_date) |>
  summarise(
    sum_sales = sum(Total),
    .groups = "drop"
  )

summary |> 
  head() |> 
  kableExtra::kable("html") 
Branch formatted_date sum_sales
A 2019-01-01 2371.3200
A 2019-01-02 307.0515
A 2019-01-03 937.4085
A 2019-01-04 483.2625
A 2019-01-05 2024.5050
A 2019-01-06 1309.9695

We can use ggplot2 for visualising the data

Code
library(ggplot2)
library(dplyr)
library(viridis)

# I'll load a custom theme

source("helper_functions.R")

temp_summary <- summary |> 
  mutate(name_branch = Branch)

temp_summary |> 
  ggplot(aes(x = formatted_date, y = sum_sales)) +
    geom_line( data=temp_summary %>% dplyr::select(-Branch), aes(group=name_branch), color="grey", size=0.5, alpha=0.5) +
    geom_line( aes(color = Branch), color="#69b3a2", size=1.2 )+
    scale_color_viridis(discrete = TRUE) +
    custom_theme() +
    theme(
      legend.position="none",
      plot.title = element_text(size=14),
      axis.title.y = element_blank(),
      axis.title.x = element_blank()
    ) +
    labs(
      title = "Daily Sales ($)",
      subtitle = "Daily sales in the first quarter of 2019 in three different branches",
      caption = "https://svalvaro.github.io/"
      
    ) + 
    facet_wrap(~Branch, ncol = 1)

Let’s add some interactivity and use the plotly library