Each of these libraries is a collection of functions that adds extra functionality: some are for running the tests, others for collecting data, and others just for a prettier display.

#tidyverse has functions to help me change columns, group by, etc.; car adds extra regression tools
library(tidyverse)
library(car)

#Allows me to read in Excel files
library(readxl)

#These let me get the US Census Data
library(USAboundaries)
library(USAboundariesData)
library(sf)

#These make outputs of texts & datatables look nice
library(pander)
library(DT)
set.seed(2022)
redeem <- read_excel('../../Data/Redemptions4.xlsx',sheet='Redemptions',skip=1) %>%
  filter(!(Region == 'Total' | Market == 'Total' | Store=='Total' | Division=='Total')) %>%
  filter(!(Region == 'Franchise' | Market == 'Franchise' | Store=='Franchise' | Division=='Franchise'))
conversion <- read_excel('../../Data/Redemptions4.xlsx',sheet='Performance')
subs <- read_excel('../../Data/Redemptions4.xlsx',sheet='Subscriptions',skip=1)
locations <- read_excel('../../Data/Redemptions4.xlsx',sheet='Locations') %>%
  #Standardize the store ID (named 'store' in this sheet) to match the other sheets
  mutate(Store=as.character(store),
         city=str_to_title(city))

#Pull US Census data on city populations
city_pop <- as_tibble(us_cities()) %>%
  mutate(citystate=paste(city,';',state_abbr))

#Combines the Census data with our cities, then splits each city's population across its sites
sites_per_city <- locations %>%
  group_by(city,state) %>%
  summarise(num=n()) %>%
  left_join(city_pop,by=c('city'='city','state'='state_abbr')) %>%
  mutate(population=ifelse(is.na(population),2000,population),
         pop_per_site=population/num,
         more_than_one = ifelse(num>1,1,0)) %>%
  select(city,state,pop_per_site,more_than_one)

#Joining the four sheets and the Census data together
conv_red <- left_join(redeem,conversion,by='Store') %>%
  left_join(subs,by='Store') %>%
  left_join(locations,by='Store') %>%
  left_join(sites_per_city,by=c('city'='city','state'='state')) %>%
  mutate(Store=as.numeric(Store),
         #oneMetricInt_3.x is the redemption count and .y the subscription count
         Subscriptions=oneMetricInt_3.y,
         Redemptions=oneMetricInt_3.x,
         goodConv = ifelse(Conversion>mean(Conversion),1,0),
         mashgin = ifelse(mashgin=='Yes',1,0),
         percSubscribe = Baskets/Subscriptions)
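
As a quick sanity check (not in the original workflow), each store should appear exactly once after the joins:

nrow(conv_red)
n_distinct(conv_red$Store) #should match nrow if no join produced duplicates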

A few months ago, Circle K launched a new loyalty program, where $5 buys a voucher for a free drink (coffee, soda, or slushie) every day of the month. Because much of their customer base visits near-daily, adoption was immediately fairly strong. But Circle K wanted to compare it to their other main loyalty program: the LIFT upsell.

\(~\) For the past couple of years, when customers check out, the Circle K register will often automatically suggest an add-on to their purchase. Ordering a hot dog? Get a soda to go with it for $1! Stuff like that. The hypothesis is that stores that do a good job of implementing and using this current system will also do well at advertising the free drink program.

Y-Axes

We will use a linear regression model to see how well these two programs correlate. There are two potential Y-variables:

  • Subscriptions: how many customers at each store bought the $5 voucher
  • Redemptions: how many free drinks were actually claimed at each store

\(~\) The relationship between these two is pretty direct, almost exactly 16 uses per voucher, so it won’t matter too much which one we use.

#Basic scatterplot and regression summary
plot(Redemptions~Subscriptions,data=sample_n(conv_red,100),main='Near-Perfect Correlation between New Subscribers and Uses')

y2 <- lm(Redemptions~Subscriptions,data=conv_red)
pander(y2)
Fitting linear model: Redemptions ~ Subscriptions

                  Estimate   Std. Error   t value   Pr(>|t|)
  (Intercept)     -41.52     4.42         -9.395    8.098e-21
  Subscriptions   16.01      0.05064      316.2     0
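
Reading the coefficients off the table, the fitted line is

\[ \text{Redemptions} \approx -41.5 + 16.0 \cdot \text{Subscriptions} \]

which is where the roughly 16 uses per voucher figure comes from.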

X-Axes

Our core comparison is to Conversion, each store’s monthly rate of upsells. We expect that stores with high Conversion scores will also have high Subscriptions scores, because the customers that come there do so frequently and already interact with the other programs that Circle K has to offer. Some other variables we can use to categorize our stores (a quick summary sketch follows the list):

  • Which of the 13 major regions of the country they are in: Division
  • How many transactions they have on an average day: Baskets
  • The population of the town they’re in: pop_per_site (if a town has multiple stores, the population is divided between them; if no Census info was found, we defaulted to 2,000)
  • Whether or not they have an automated check-out system in place, because this will impact how much the sales clerk interacts with customers at checkout: mashgin
  • Whether or not there are multiple Circle K’s in that city: more_than_one
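
As promised above, a minimal sketch for eyeballing those candidate predictors before modeling (it assumes the conv_red table built earlier; Division kept its .x suffix from the joins):

#Quick summaries of each candidate predictor
conv_red %>%
  select(Division.x,Baskets,Conversion,pop_per_site,mashgin,more_than_one) %>%
  summary()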

If we build a model using each of these factors and every interaction, we end up with \(R^2=0.4888\), but with so many different factors we risk overfitting. Instead, let’s build from the ground up with the relevant pieces.
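
For reference, the kitchen-sink model could be fit like this (a sketch; the original formula isn’t shown, so the exact form here is an assumption):

#One way to include every main factor and all pairwise interactions
full <- lm(Subscriptions~(Baskets+Conversion+Division.x+mashgin+more_than_one+pop_per_site)^2,
           data=conv_red)
summary(full)$r.squared #the text above reports 0.4888 for its version of this model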

\(~\) My hypothesis is that each of these factors will have an impact on the total Subscriptions score: \(H_a: \beta_{1...19} \neq 0\)

#The lm function will automatically make dummy columns for the categorical Division.x. The default baseline is the first level alphabetically, which happens to be Arizona, so every Division coefficient is the change from Arizona

best <- lm(Subscriptions~Baskets+Conversion+
     Baskets:Division.x+
     Baskets*mashgin+
     more_than_one+
     pop_per_site,data=conv_red)

#Pander makes it look good for the html
pander(summary(best))
                                       Estimate     Std. Error   t value   Pr(>|t|)
  (Intercept)                          20.91        3.851        5.43      5.878e-08
  Baskets                              0.005203     0.0001262    41.23     0
  Conversion                           -95.34       15.07        -6.326    2.702e-10
  mashgin                              43.33        3.533        12.27     3.752e-34
  more_than_one                        -8.052       1.322        -6.089    1.212e-09
  pop_per_site                         -4.753e-05   1.278e-05    -3.719    0.000202
  Baskets:Division.xCoastal Carolina   -0.003056    0.0001302    -23.47    2.702e-116
  Baskets:Division.xFlorida            -0.002255    0.0001314    -17.16    2.256e-64
  Baskets:Division.xGreat Lakes        -0.001179    0.0001443    -8.165    3.917e-16
  Baskets:Division.xGulf Coast         -0.003408    0.0001274    -26.75    1.533e-148
  Baskets:Division.xHeartland          0.0005714    0.0001578    3.622     0.0002953
  Baskets:Division.xMidwest            -0.0001945   0.000139     -1.4      0.1617
  Baskets:Division.xNorthern Tier      -0.001363    0.0001222    -11.15    1.375e-28
  Baskets:Division.xRocky Mountain     -0.002411    0.0001646    -14.65    9.678e-48
  Baskets:Division.xSouth Atlantic     -0.003434    0.0001292    -26.59    8.257e-147
  Baskets:Division.xSoutheast          -0.002938    0.000137     -21.44    3.943e-98
  Baskets:Division.xTexas              -0.00311     0.0001153    -26.98    6.494e-151
  Baskets:Division.xWest Coast         0.0002713    0.0001735    1.564     0.1179
  Baskets:mashgin                      -0.0008916   0.0001976    -4.512    6.555e-06

Fitting linear model: Subscriptions ~ Baskets + Conversion + Baskets:Division.x + Baskets * mashgin + more_than_one + pop_per_site

  Observations   Residual Std. Error   \(R^2\)   Adjusted \(R^2\)
  5655           41.69                 0.3924    0.3905
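
Before leaning on individual coefficients, a quick look at base R’s standard residual diagnostics is cheap (this step isn’t in the original analysis):

#Residuals vs. fitted, normal Q-Q, scale-location, and leverage plots for the model
par(mfrow=c(2,2))
plot(best)
par(mfrow=c(1,1))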

All of our main factors are statistically significant at the \(p<.001\) level, except for a couple of division interactions (Midwest and West Coast) that were very similar to the Arizona baseline. For an Arizona store, we can write the model’s equation like this:

\[ \text{Expected Subscriptions} = 20.9+.005 \cdot \text{Baskets}-95.3 \cdot \text{Conversion}+\\ (43.3-.0009 \cdot \text{Baskets}) \text{ if Machine Checkout} -8.1 \text{ if More than One Site Per City} -\\ .048 \cdot \text{Population per Site (in Thousands)} \]

We get some unexpected results: new subscriptions are negatively correlated with pre-existing conversion rates. Baskets and Subscriptions are positively correlated, though, and it makes sense that stores with lots of traffic will have more people using the new loyalty program.
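
To make the equation concrete, here’s a prediction for a hypothetical Arizona store; every input value below is invented for illustration:

#Hypothetical store: 10,000 monthly baskets, 15% conversion, self-checkout, only site in town
new_store <- data.frame(Baskets=10000,Conversion=0.15,Division.x='Arizona',
                        mashgin=1,more_than_one=0,pop_per_site=5000)
predict(best,newdata=new_store) #the model's expected monthly Subscriptions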

\(~\) If we standardize for the Division and average Basket size, we can see the impact of each of our smaller factors: multiple stores in the same city and the self-checkout machine.

#Manually order which colors occur
palette(c('black','blue','yellow','green'))

#Sample dataset of 100 Arizona stores, so it's easier to see
visualize_100 <- conv_red %>%
  filter(Division.x=='Arizona') %>%
  sample_n(100)

#Get the values I can use to draw regression curves
b <- best$coefficients
avg_basket <- median(visualize_100$Baskets)
avg_pop <- median(visualize_100$pop_per_site)


plot(Subscriptions~Conversion,data=visualize_100,
     col=interaction(as.factor(more_than_one),as.factor(mashgin)),pch=16,main='Subset of 100 Stores: What Binary Factors have an Impact?')
legend('topright',legend=c("Add Yellow=Auto-checkout","Add Blue=Multiple in the City"))

#Runs the curve() function 4 times, adding each line based on the coefficients from the regression and the default values from above
for(machine in seq(0,1,1)){
  for (multiple in seq(0,1,1)){
    curve(b[1]+b[3]*x+(b[2]+b[19]*machine)*avg_basket+b[4]*machine+b[5]*multiple+b[6]*avg_pop,
          add=TRUE,col=palette()[2*machine+multiple+1]) #Get the color, and add it to the original plot
  }
}

Stores in yellow have an auto-checkout system, and we see that they score better under the new loyalty program than stores without one. This seems counterintuitive, and we don’t know whether the machine causes that increase or whether the stores that got the machines earlier were already getting more engagement with the programs. It might be that without the pressure of an in-person suggestion, people are more willing to take the offer.

\(~\) Stores in towns with multiple Circle K’s see lower new subscriptions than stores that are the only site in their city. This effect is much smaller than the self-checkout effect, and I don’t know why it exists. This could be a place to do further research.
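
One quick way to probe that (my sketch, not part of the original analysis) is a plain two-sample comparison that ignores the other covariates:

#Do stores in single- vs. multi-site cities differ in raw Subscriptions?
t.test(Subscriptions~more_than_one,data=conv_red)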

3D Plot

For the 3D plot we’re going to just look at the base model: Arizona sites, with the machine and multiple-site factors left out.

#I barely know how this works myself
library(plotly)
library(reshape2)

#Perform the multiple regression
spherek_lm <- lm(Subscriptions~Conversion+Baskets, data= visualize_100)

#Graph Resolution (not actually used below; the axis steps are hardcoded)
graph_reso <- 5

#Setup Axis
axis_x <- seq(min(conv_red$Conversion,na.rm=TRUE), max(conv_red$Conversion,na.rm=TRUE), by = 0.05)
axis_y <- seq(min(conv_red$Baskets,na.rm=TRUE), max(conv_red$Baskets,na.rm=TRUE), by = 1000)

#Sample points
getSubs <- expand.grid(Conversion = axis_x, Baskets = axis_y, KEEP.OUT.ATTRS=F)
getSubs$Z <- predict.lm(spherek_lm, newdata = getSubs)
getSubs <- acast(getSubs, Baskets ~ Conversion, value.var = "Z") #y ~ x

#Create scatterplot
plot_ly(visualize_100, 
        x = ~Conversion, 
        y = ~Baskets, 
        z = ~Subscriptions,
        text = rownames(visualize_100), 
        type = "scatter3d", 
        mode = "markers") %>%
  add_trace(z = getSubs,
            x = axis_x,
            y = axis_y,
            type = "surface")

Conclusion

There are definitely some differences between how stores have implemented the LIFT loyalty program and how they should best implement this new loyalty program. Embracing new technology like the self-checkout and studying some exemplary sites will go a long way. Since the two programs have contrasting outcomes, it might be worth studying whether sites should implement both programs, or just one, depending on their current success with LIFT.

Location Didn’t Seem to Matter

I had a blast using Census data to match towns with their populations and number of Circle K’s, but it turned out to be a near-zero predictor of Redemptions. In the plots below we can see thick vertical bands where dozens of stores all share the same city, like Phoenix or Mesa. An \(R^2\) of .01 isn’t worth studying.

sample_stores <- filter(conv_red,state=='AZ')
par(mfrow=c(1,2))
plot(Redemptions~pop_per_site,data=sample_stores,log='x')
plot(Subscriptions~pop_per_site,data=sample_stores,log='x')

pander(summary(lm(Subscriptions~pop_per_site,data=sample_stores)))
                 Estimate   Std. Error   t value   Pr(>|t|)
  (Intercept)    90.08      6.903        13.05     2.44e-34
  pop_per_site   0.002329   0.0007426    3.137     0.001792

Fitting linear model: Subscriptions ~ pop_per_site

  Observations   Residual Std. Error   \(R^2\)   Adjusted \(R^2\)
  587            68.35                 0.01654   0.01486

Appendix

Here’s code I didn’t use but didn’t want to get rid of.

#New Y
simple2 <- lm(percSubscribe~Conversion+pop_per_site+mashgin+more_than_one,data=conv_red)
summary(simple2)
b <- simple2$coefficients
#plot(percSubscribe~Conversion,data=visualize_100,log='y')
plot(percSubscribe~pop_per_site,data=visualize_100,log='y',
     col=interaction(as.factor(more_than_one),as.factor(mashgin)),pch=16,main='Subset of 100 Stores')
legend('topright',legend=c("Add Yellow=Auto-checkout","Add Blue=Multiple in the City"))
avg_pop <- median(visualize_100$pop_per_site)
avg_conv <- median(visualize_100$Conversion)
for(machine in seq(0,1,1)){
  for (multiple in seq(0,1,1)){
    #x is pop_per_site here, so use its slope b[3] and hold Conversion at its median
    curve(b[1]+b[3]*x+b[2]*avg_conv+b[4]*machine+b[5]*multiple,add=TRUE,col=palette()[2*machine+multiple+1])
  }
}
plot(Conversion~oneMetricInt_3,data=conv_red,col=as.factor(Region.x))
hist(conv_red$Conversion)
pairs(select(conv_red,Baskets,Conversion,Subscriptions,Redemptions,pop_per_site),panel=panel.smooth) #pairs() needs numeric columns
hist(conv_red$oneMetricInt_3.x/conv_red$oneMetricInt_3.y)
mean(conv_red$Conversion)

try <- lm(Redemptions~Subscriptions+I(Subscriptions^2),data=conv_red)
summary(try)
plot(Redemptions~Subscriptions,data=conv_red)
#abline() can only draw a straight line, so trace the fitted quadratic with curve() instead
curve(predict(try,newdata=data.frame(Subscriptions=x)),add=TRUE)


plot(Subscriptions~Conversion,data=conv_red)
plot(Subscriptions~Baskets,data=conv_red)
bc <- lm(Subscriptions~Baskets*more_than_one,data=conv_red)
summary(bc)
#abline(lm(Subscriptions~Conversion,data=conv_red))

plot(Subscriptions~pop_per_site,data=conv_red)
bc <- lm(Subscriptions~Baskets*Conversion*pop_per_site,data=conv_red)
summary(bc)
bcp <- lm(Redemptions~Baskets+Conversion+pop_per_site+Baskets:Conversion+Conversion:pop_per_site,data=conv_red)
summary(bcp)
summary(lm(Redemptions~as.factor(Division.x),data=conv_red))
library(plotly)
library(reshape2)

#Perform the multiple regression
spherek_lm <- lm(percSubscribe~Conversion+pop_per_site, data= visualize_100)

#Graph Resolution (more important for more complex shapes)
graph_reso <- 5

#Setup Axis
axis_x <- seq(min(conv_red$Conversion,na.rm=TRUE), max(conv_red$Conversion,na.rm=TRUE), by = 0.05)
axis_y <- seq(min(conv_red$pop_per_site,na.rm=TRUE), max(conv_red$pop_per_site,na.rm=TRUE), by = 100)

#Sample points
getSubs <- expand.grid(Conversion = axis_x, pop_per_site = axis_y, KEEP.OUT.ATTRS=F)
getSubs$Z <- predict.lm(spherek_lm, newdata = getSubs)
getSubs <- acast(getSubs, pop_per_site ~ Conversion, value.var = "Z") #y ~ x

#Create scatterplot
plot_ly(visualize_100, 
        x = ~Conversion, 
        y = ~pop_per_site, 
        z = ~percSubscribe,
        text = rownames(visualize_100), 
        type = "scatter3d", 
        mode = "markers") %>%
  add_trace(z = getSubs,
            x = axis_x,
            y = axis_y,
            type = "surface")