*Each of these libraries are a collection of functions that can do extra functionality. Some were for running the test, others for collecting data, and others just for a prettier display
#These have functions to help me change columns, groupby, etc.
library(tidyverse)
library(car)
#Allows me to read in Excel files
library(readxl)
#These let me get the US Census Data
library(USAboundaries)
library(USAboundariesData)
library(sf)
#These make outputs of texts & datatables look nice
library(pander)
library(DT)
set.seed(2022)
redeem <- read_excel('../../Data/Redemptions4.xlsx',sheet='Redemptions',skip=1) %>%
filter(!(Region == 'Total' | Market == 'Total' | Store=='Total' | Division=='Total')) %>%
filter(!(Region == 'Franchise' | Market == 'Franchise' | Store=='Franchise' | Division=='Franchise'))
conversion <- read_excel('../../Data/Redemptions4.xlsx',sheet='Performance')
subs <- read_excel('../../Data/Redemptions4.xlsx',sheet='Subscriptions',skip=1)
locations <- read_excel('../../Data/Redemptions4.xlsx',sheet='Locations') %>%
mutate(Store=as.character(store),
city=str_to_title(city))
#Groups how many sites are in each city
city_pop <- as_tibble(us_cities()) %>%
mutate(citystate=paste(city,';',state_abbr))
#Combines the Census data with our cities, then averages by site.
sites_per_city <- locations %>%
group_by(city,state) %>%
summarise(num=n()) %>%
left_join(city_pop,by=c('city'='city','state'='state_abbr')) %>%
mutate(population=ifelse(is.na(population),2000,population),
pop_per_site=population/num,
more_than_one = ifelse(num>1,1,0)) %>%
select(city,state,pop_per_site,more_than_one)
#Joining the 5 sheets and Census data together
conv_red <- left_join(redeem,conversion,by=join_by('Store')) %>%
#filter(Division.x %in% c('Florida','Arizona')) %>%
left_join(subs,by=join_by('Store')) %>%
left_join(locations,by=join_by('Store')) %>%
left_join(sites_per_city,by=c('city'='city','state'='state')) %>%
mutate(Store=as.numeric(Store),
Subscriptions=oneMetricInt_3.y,
Redemptions=oneMetricInt_3.x,
goodConv = ifelse(Conversion>mean(Conversion),1,0),
mashgin = ifelse(mashgin=='Yes',1,0),
percSubscribe = Baskets/Subscriptions) #%>%
#select(Store,Baskets,Offers,Upsells,Conversion,Redemptions,Subscriptions,goodConv,Division.x,pop_per_site,city,state)
A few months ago, Circle K ran a new loyalty program, where $5 could get you a voucher for a free drink (coffee, soda, or slushie) every day of the month. Because much of their customer base runs on near-daily customers, there was immediately a fairly strong adoption. But Circle K wanted to compare it to their other main loyalty program: LIFT upsell
\(~\) For the past couple years, when customers check out, the Circle K register will often automatically suggest an add-on to their purchase. Ordering a hot dog? Get a soda to go with it for $1! Stuff like that. The hypothesis is that stores that do a good job of implementing and using this current system, they’ll also do well at advertising the free drink program.
We will use a linear regression model to see how well these two sales correlate. There are two potential Y-variables:
Subscriptions
Redemptions
\(~\) The relationship between these two is pretty direct, almost exactly 16 uses per voucher, so it won’t matter too much which y-variables we use.
#Basic scatterplot and regression summary
plot(Redemptions~Subscriptions,data=sample_n(conv_red,100),main='Near-Perfect Correlation between New Subscribers and Uses')
y2 <- lm(Redemptions~Subscriptions,data=conv_red)
pander(y2)
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | -41.52 | 4.42 | -9.395 | 8.098e-21 |
Subscriptions | 16.01 | 0.05064 | 316.2 | 0 |
Our core comparison is to Conversion
, each store’s
monthly rate of upsells. We expect that stores with high
Conversion
scores will also have a high
Subscriptions
score, because the customers that come here
do so frequently and already interact with the other programs that
Circle K has to offer. Some other variables we can use to categorize our
stores:
Division
Basket
pop_per_site
(if there are multiple, the population is divided between them. If no
Census info was found, we defaulted to 2,000)mashgin
more_than_one
If we build a model using each of these factors and every interaction, we end with an \(R^2=0.4888\), but with so many different factors we risk over fitting. Instead, let’s build from the ground up with the relevant pieces.
\(~\) My hypothesis is that each of these factors will have an impact on the total Subscriptions score: \(H_a: \beta_{1...19} \neq 0\)
#This lm function will automatically make dummy columns for each categorical variable in Baskets. The default is the first alphabetically, which happens to be Arizona, and then every value is the change from Arizona
best <- lm(Subscriptions~Baskets+Conversion+
Baskets:Division.x+
Baskets*mashgin+
more_than_one+
pop_per_site,data=conv_red)
#Pander makes it look good for the html
pander(summary(best))
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | 20.91 | 3.851 | 5.43 | 5.878e-08 |
Baskets | 0.005203 | 0.0001262 | 41.23 | 0 |
Conversion | -95.34 | 15.07 | -6.326 | 2.702e-10 |
mashgin | 43.33 | 3.533 | 12.27 | 3.752e-34 |
more_than_one | -8.052 | 1.322 | -6.089 | 1.212e-09 |
pop_per_site | -4.753e-05 | 1.278e-05 | -3.719 | 0.000202 |
Baskets:Division.xCoastal Carolina | -0.003056 | 0.0001302 | -23.47 | 2.702e-116 |
Baskets:Division.xFlorida | -0.002255 | 0.0001314 | -17.16 | 2.256e-64 |
Baskets:Division.xGreat Lakes | -0.001179 | 0.0001443 | -8.165 | 3.917e-16 |
Baskets:Division.xGulf Coast | -0.003408 | 0.0001274 | -26.75 | 1.533e-148 |
Baskets:Division.xHeartland | 0.0005714 | 0.0001578 | 3.622 | 0.0002953 |
Baskets:Division.xMidwest | -0.0001945 | 0.000139 | -1.4 | 0.1617 |
Baskets:Division.xNorthern Tier | -0.001363 | 0.0001222 | -11.15 | 1.375e-28 |
Baskets:Division.xRocky Mountain | -0.002411 | 0.0001646 | -14.65 | 9.678e-48 |
Baskets:Division.xSouth Atlantic | -0.003434 | 0.0001292 | -26.59 | 8.257e-147 |
Baskets:Division.xSoutheast | -0.002938 | 0.000137 | -21.44 | 3.943e-98 |
Baskets:Division.xTexas | -0.00311 | 0.0001153 | -26.98 | 6.494e-151 |
Baskets:Division.xWest Coast | 0.0002713 | 0.0001735 | 1.564 | 0.1179 |
Baskets:mashgin | -0.0008916 | 0.0001976 | -4.512 | 6.555e-06 |
Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
---|---|---|---|
5655 | 41.69 | 0.3924 | 0.3905 |
All of our main factors are statistically significant at the \(p<.001\) level, except for a couple divisions that were very similar to others. We can build our equation of the model like this:
\[ \text{Expected Subscriptions} = 20.9+.005*\text{Baskets}-90.53*\text{Conversion %}+\\ 43.33 \text{ if Machine Checkout} -8 \text{ if More than One Site Per City} -\\ .4*\text{City Population in Thousands} \]
We get some unexpected results: New subscriptions are negatively correlated with pre-existing conversion rates. It looks like Baskets and subscriptions are positively correlated though, and it makes sense that stores with lots of traffic will have more people using the new loyalty program.
\(~\) If we standardize for the Division & average Basket size, we can see the impact that each of our smaller factors: stores in the same city and the self-checkout machine.
#Manually order which colors occur
palette(c('black','blue','yellow','green'))
#Sample dataset of 100, so it's easier to see
visualize_100 <- conv_red %>%
filter(Division.x=='Arizona')%>%
sample_n(200)
#Get the values I can use to draw regression curves
b<- best$coefficients
avg_basket <- median(visualize_100$Baskets)
avg_pop <- median(visualize_100$pop_per_site)
plot(Subscriptions~Conversion,data=visualize_100,
col=interaction(as.factor(more_than_one),as.factor(mashgin)),pch=16,main='Subset of 100 Stores: What Binary Factors have an Impact?')
legend('topright',legend=c("Add Yellow=Auto-checkout","Add Blue=Multiple in the City"))
#Runs the curve() function 4 times, adding each line based on the coefficients from the regression and the default values from above
for(machine in seq(0,1,1)){
for (multiple in seq(0,1,1)){
curve(b[1]+b[3]*x+(b[2]+b[19]*machine)*avg_basket+b[4]*machine+b[5]*multiple+b[6]*avg_pop,
add=TRUE,col=palette()[2*machine+multiple+1]) #Get the color, and add it to the original plot
}
}
Stores in yellow have an auto-checkout system, and we see that they score better than stores without one under the new loyalty system. This seems counter-intuitive, and we don’t know if the machine causes that increase or if the stores that got the machines earlier were already getting more engagement with the programs. It might be that without the pressure of an in-person suggestion, people are willing to take the offer.
\(~\) Stores in towns with multiple Circle K’s see lower new subscriptions than those in urban areas. This has a much smaller impact than the self-checkout system, and I don’t know why that might have an impact. This could be a place to do further research.
For the 3D Plot we’re going to just look at the base model: Arizona sites without a machine and are the only one in the city.
#I barely know how this works myself
library(plotly)
library(reshape2)
#Perform the multiple regression
spherek_lm <- lm(Subscriptions~Conversion+Baskets, data= visualize_100)
#Graph Resolution (more important for more complex shapes)
graph_reso <- 5
#Setup Axis
axis_x <- seq(min(conv_red$Conversion,na.rm=TRUE), max(conv_red$Conversion,na.rm=TRUE), by = 0.05)
axis_y <- seq(min(conv_red$Baskets,na.rm=TRUE), max(conv_red$Baskets,na.rm=TRUE), by = 1000)
#Sample points
getSubs <- expand.grid(Conversion = axis_x, Baskets = axis_y, KEEP.OUT.ATTRS=F)
getSubs$Z <- predict.lm(spherek_lm, newdata = getSubs)
getSubs <- acast(getSubs, Baskets ~ Conversion, value.var = "Z") #y ~ x
#Create scatterplot
plot_ly(visualize_100,
x = ~Conversion,
y = ~Baskets,
z = ~Subscriptions,
text = rownames(visualize_100),
type = "scatter3d",
mode = "markers") %>%
add_trace(z = getSubs,
x = axis_x,
y = axis_y,
type = "surface")
There are definitely some differences between how stores have implemented the LIFT loyalty program, and how they should best implement this new loyalty program. Embracing new technology like the self-checkout and studying some exemplary sites will go a long way. Since these two have contrasting outcomes, it might be worth studying whether or not sites should implement both programs, or just one, depending on their current success wiht LIFT.
I had a blast using Census data to match towns with their populations and number of Circle K’s, but it seemed to be a near-0 predictor of Redemptions. We can see these thick lines were dozens of stores are all in the same city like Phoenix or Mesa. An \(R^2\) of .01 isn’t worth studying.
sample_stores <- filter(conv_red,state=='AZ')
par(mfrow=c(1,2))
plot(Redemptions~pop_per_site,data=sample_stores,log='x')
plot(Subscriptions~pop_per_site,data=sample_stores,log='x')
pander(summary(lm(Subscriptions~pop_per_site,data=sample_stores)))
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | 90.08 | 6.903 | 13.05 | 2.44e-34 |
pop_per_site | 0.002329 | 0.0007426 | 3.137 | 0.001792 |
Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
---|---|---|---|
587 | 68.35 | 0.01654 | 0.01486 |
Here’s code I didn’t use, but I didn’t want to get rid of it
#New Y
simple2 <- lm(percSubscribe~Conversion+pop_per_site+mashgin+more_than_one,data=conv_red)
summary(simple2)
b <-simple2$coefficients
#plot(percSubscribe~Conversion,data=visualize_100,log='y',)
plot(percSubscribe~pop_per_site,data=visualize_100,log='y',
col=interaction(as.factor(more_than_one),as.factor(mashgin)),pch=16,main='Subset of 100 Stores')
legend('topright',legend=c("Add Yellow=Auto-checkout","Add Blue=Multiple in the City"))
avg_pop=median(visualize_100$pop_per_site)
for(machine in seq(0,1,1)){
for (multiple in seq(0,1,1)){
curve(b[1]+b[2]*x+b[3]*avg_pop+b[4]*machine+b[5]*multiple,add=TRUE,col=palette()[2*machine+multiple+1])
}
}
plot(Conversion~oneMetricInt_3,data=conv_red,col=as.factor(Region.x))
hist(conv_red$Conversion)
pairs(conv_red,panel=panel.smooth)
hist(conv_red$oneMetricInt_3.x/conv_red$oneMetricInt_3.y)
mean(conv_red$Conversion)
try <- lm(Redemptions~Subscriptions+I(Subscriptions^2),data=conv_red)
summary(try)
plot(Redemptions~Subscriptions,data=conv_red)
abline(try)
plot(Subscriptions~Conversion,data=conv_red)
plot(Subscriptions~Baskets,data=conv_red)
bc <- lm(Subscriptions~Baskets*more_than_one,data=conv_red)
summary(bc)
#abline(lm(Subscriptions~Conversion,data=conv_red))
plot(Subscriptions~pop_per_site,data=conv_red)
bc <- lm(Subscriptions~Baskets*Conversion*pop_per_site,data=conv_red)
summary(bc)
bcp <- lm(Redemptions~Baskets+Conversion+pop_per_site+Baskets:Conversion+Conversion:pop_per_site,data=conv_red)
summary(bcp)
summary(lm(Redemptions~as.factor(Division.x),data=conv_red))
library(plotly)
library(reshape2)
#Perform the multiple regression
spherek_lm <- lm(percSubscribe~Conversion+pop_per_site, data= visualize_100)
#Graph Resolution (more important for more complex shapes)
graph_reso <- 5
#Setup Axis
axis_x <- seq(min(conv_red$Conversion,na.rm=TRUE), max(conv_red$Conversion,na.rm=TRUE), by = 0.05)
axis_y <- seq(min(conv_red$pop_per_site,na.rm=TRUE), max(conv_red$pop_per_site,na.rm=TRUE), by = 100)
#Sample points
getSubs <- expand.grid(Conversion = axis_x, pop_per_site = axis_y, KEEP.OUT.ATTRS=F)
getSubs$Z <- predict.lm(spherek_lm, newdata = getSubs)
getSubs <- acast(getSubs, pop_per_site ~ Conversion, value.var = "Z") #y ~ x
#Create scatterplot
plot_ly(visualize_100,
x = ~Conversion,
y = ~pop_per_site,
z = ~percSubscribe,
text = rownames(visualize_100),
type = "scatter3d",
mode = "markers") %>%
add_trace(z = getSubs,
x = axis_x,
y = axis_y,
type = "surface")