WORK IN PROGRESS

Time Series Analysis

We’ll be using a sales dataset from a supermakert chain source.

Code

dataset <- readr::read_csv("data.csv")
head(dataset) |> 
  kableExtra::kable("html")

Invoice ID	Branch	City	Customer type	Gender	Product line	Unit price	Quantity	Tax 5%	Total	Date	Time	Payment	cogs	gross margin percentage	gross income	Rating
750-67-8428	A	Yangon	Member	Female	Health and beauty	74.69	7	26.1415	548.9715	1/5/2019	13:08:00	Ewallet	522.83	4.761905	26.1415	9.1
226-31-3081	C	Naypyitaw	Normal	Female	Electronic accessories	15.28	5	3.8200	80.2200	3/8/2019	10:29:00	Cash	76.40	4.761905	3.8200	9.6
631-41-3108	A	Yangon	Normal	Male	Home and lifestyle	46.33	7	16.2155	340.5255	3/3/2019	13:23:00	Credit card	324.31	4.761905	16.2155	7.4
123-19-1176	A	Yangon	Member	Male	Health and beauty	58.22	8	23.2880	489.0480	1/27/2019	20:33:00	Ewallet	465.76	4.761905	23.2880	8.4
373-73-7910	A	Yangon	Normal	Male	Sports and travel	86.31	7	30.2085	634.3785	2/8/2019	10:37:00	Ewallet	604.17	4.761905	30.2085	5.3
699-14-3026	C	Naypyitaw	Normal	Male	Electronic accessories	85.39	7	29.8865	627.6165	3/25/2019	18:30:00	Ewallet	597.73	4.761905	29.8865	4.1

Let’s have a look at what the data looks like

Code

dplyr::glimpse(dataset)

Rows: 1,000
Columns: 17
$ `Invoice ID`              <chr> "750-67-8428", "226-31-3081", "631-41-3108",…
$ Branch                    <chr> "A", "C", "A", "A", "A", "C", "A", "C", "A",…
$ City                      <chr> "Yangon", "Naypyitaw", "Yangon", "Yangon", "…
$ `Customer type`           <chr> "Member", "Normal", "Normal", "Member", "Nor…
$ Gender                    <chr> "Female", "Female", "Male", "Male", "Male", …
$ `Product line`            <chr> "Health and beauty", "Electronic accessories…
$ `Unit price`              <dbl> 74.69, 15.28, 46.33, 58.22, 86.31, 85.39, 68…
$ Quantity                  <dbl> 7, 5, 7, 8, 7, 7, 6, 10, 2, 3, 4, 4, 5, 10, …
$ `Tax 5%`                  <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085, …
$ Total                     <dbl> 548.9715, 80.2200, 340.5255, 489.0480, 634.3…
$ Date                      <chr> "1/5/2019", "3/8/2019", "3/3/2019", "1/27/20…
$ Time                      <time> 13:08:00, 10:29:00, 13:23:00, 20:33:00, 10:…
$ Payment                   <chr> "Ewallet", "Cash", "Credit card", "Ewallet",…
$ cogs                      <dbl> 522.83, 76.40, 324.31, 465.76, 604.17, 597.7…
$ `gross margin percentage` <dbl> 4.761905, 4.761905, 4.761905, 4.761905, 4.76…
$ `gross income`            <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085, …
$ Rating                    <dbl> 9.1, 9.6, 7.4, 8.4, 5.3, 4.1, 5.8, 8.0, 7.2,…

What are these folks buying?

Let’s check what are people buying…

Code

library(ggplot2)
library(cowplot)
library(dplyr)
library(hrbrthemes)
source("helper_functions.R")

# Let's aggregate the data
data_product <- dataset |> 
  mutate(
    formatted_date = lubridate::mdy(Date)
  ) |> 
  group_by(Branch, formatted_date, `Product line`) |>
  summarise(
    n_products_sold = sum(Quantity),
    .groups = "drop"
  )

# A function that we can use to plot the sales for the products at any branch
stream_plot <- function(data, branch) {
  data |> 
  filter(Branch == branch) |> 
  ggplot(aes(x = formatted_date, y = n_products_sold, fill = `Product line`)) +
  geom_area() +
  scale_fill_manual(values= c("#6d2f20", "#df7e66", "#e09351", "#edc775", "#94b594", "#224b5e")) +
  theme_ipsum() +
  theme(legend.position = "none",
        axis.text.x = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank()
        ) +
  labs(x = "", y = "") +
  ylim(0, 65)
}

# Lets create the three individual plots
plot_a <- stream_plot(data_product, "A") + 
      theme(legend.position = "none")
plot_b <- stream_plot(data_product, "B")+ 
      theme(legend.position = "none")
plot_c <- stream_plot(data_product, "C") +
  theme(
    legend.position = c(0.52, -0.25),
    legend.direction = "horizontal",
    legend.text = element_blank(),
    legend.title = element_blank(),
    legend.key.height = unit(0.75, "cm"),
    legend.key.width = unit(3.1, "cm")
  ) +
  guides(fill = guide_legend(nrow = 1)) 
  
# Putting everything together
ggdraw(plot_grid(plot_a, plot_b, plot_c, ncol=1)) +
  theme(
    plot.background = element_rect(fill="#fbf7f0", color="#fbf7f0"),
    plot.margin = margin(50, 30, 10)
  ) +
  draw_text(text = "# Products sold daily", x=0.5, y=1.06, fontface="bold", size= 30, family=font1, color="#224b5e") +
  draw_text(text="Jan", x=0.15, y=0.98, size=15, family=font2, color="#224b5e") +
  draw_text(text="Feb", x=0.4, y=0.98, size=15, family=font2, color="#224b5e") +
  draw_text(text="Mar", x=0.65, y=0.98, size=15, family=font2, color="#224b5e") +
  draw_text(text="Apr", x=0.9, y=0.98, size=15, family=font2, color="#224b5e") +
  draw_text(text="Electronics", x=0.11, y=0.035, size=12, family=font2, fontface="bold", color="#fbf7f0") +
  draw_text(text="Fashion", x=0.28, y=0.035, size=12, family=font2, fontface="bold", color="#fbf7f0") +
  draw_text(text="Food", x=0.45, y=0.035, size=12, family=font2, fontface="bold", color="#fbf7f0") +
  draw_text(text="Health", x=0.62, y=0.035, size=12, family=font2, fontface="bold", color="#fbf7f0") +
  draw_text(text="Home", x=0.79, y=0.035, size=12, family=font2, fontface="bold", color="#fbf7f0") +
  draw_text(text="Sports", x=0.97, y=0.035, size=12, family=font2, fontface="bold", color="#fbf7f0") +
  draw_text(text="Branch A", x=0.96, y=0.84, size=20, family=font1, hjust=0, color="#224b5e") +
  draw_text(text="Branch B", x=0.96, y=0.505, size=20, family=font1, hjust=0, color="#224b5e") +
  draw_text(text="Branch C", x=0.96, y=0.17, size=20, family=font1, hjust=0, color="#224b5e")

It can be also interesting to see what are the products being bought between genders. Let’s check first the number of males and females in each branch.

Code

dataset |> 
  group_by(Branch, Gender) |> 
  summarise(n = n(), .groups = "drop") |> 
  group_by(Branch) |> 
  arrange(desc(Gender)) |> 
  mutate(proportion = paste0(round(n / sum(n) *100, 1), "%")) |> 
  mutate(ypos = cumsum(n) - 0.5*n ) |> 
  ggplot(aes(x = Branch, y = n, fill = Gender)) +
  geom_col() +
  scale_fill_manual(values= c("#8bdcc9",  "#fe957f")) +
  custom_theme() +
  geom_text(aes(y = ypos, label = proportion), color = "white", size = 5) +
  labs(y = "", title = "Gender summary per branch")

Code

data_product_gender <- dataset |> 
  mutate(
    date = lubridate::mdy(Date)
  ) |> 
  group_by(Branch, `Product line`, Gender) |>
  summarise(
    n_products_sold = sum(Quantity),
    .groups = "drop"
  )

Daily sales

Code

library(dplyr)
library(lubridate)

summary <- dataset |> 
  mutate(
    formatted_date = lubridate::mdy(Date)
  ) |> 
  group_by(Branch, formatted_date) |>
  summarise(
    sum_sales = sum(Total),
    .groups = "drop"
  )

summary |> 
  head() |> 
  kableExtra::kable("html")

Branch	formatted_date	sum_sales
A	2019-01-01	2371.3200
A	2019-01-02	307.0515
A	2019-01-03	937.4085
A	2019-01-04	483.2625
A	2019-01-05	2024.5050
A	2019-01-06	1309.9695

We can use ggplot2 for visualising the data

Code

library(ggplot2)
library(dplyr)
library(viridis)

# I'll load a custom theme

source("helper_functions.R")

temp_summary <- summary |> 
  mutate(name_branch = Branch)

temp_summary |> 
  ggplot(aes(x = formatted_date, y = sum_sales)) +
    geom_line( data=temp_summary %>% dplyr::select(-Branch), aes(group=name_branch), color="grey", size=0.5, alpha=0.5) +
    geom_line( aes(color = Branch), color="#69b3a2", size=1.2 )+
    scale_color_viridis(discrete = TRUE) +
    custom_theme() +
    theme(
      legend.position="none",
      plot.title = element_text(size=14),
      axis.title.y = element_blank(),
      axis.title.x = element_blank()
    ) +
    labs(
      title = "Daily Sales ($)",
      subtitle = "Daily sales in the first quarter of 2019 in three different branches",
      caption = "https://svalvaro.github.io/"
      
    ) + 
    facet_wrap(~Branch, ncol = 1)

Let’s add some interactivity and use the plotly library