Packages

library(bpcs)
library(tidyverse)
library(knitr)
library(kableExtra)
options(mc.cores = parallel::detectCores())
rstan::rstan_options(auto_write = TRUE)
set.seed(99)

Gorillas using touchscreen

Huskisson, S.M., Jacobson, S.L., Egelkamp, C.L. et al. Using a Touchscreen Paradigm to Evaluate Food Preferences and Response to Novel Photographic Stimuli of Food in Three Primate Species (Gorilla gorilla gorilla, Pan troglodytes, and Macaca fuscata). Int J Primatol 41, 5–23 (2020). https://doi.org/10.1007/s10764-020-00131-0

Let’s import the data and select only the gorillas

d <- readr::read_csv('data/touchscreen.csv') %>% 
  dplyr::filter(species_type=='Gorilla') %>% 
  dplyr::select(species_type, SubjectCode, Trial, image1, image2, image_chosen)

Let’s create a column that says which image was selected 1 for image2 0 for image 1

d <- d %>% 
  dplyr::mutate(y =ifelse(image_chosen==image1, 0, 1)) %>% 
  select(-image_chosen)

Let’s also do some recoding to facilitate interpreting

d$image1 <- dplyr::recode(d$image1, "Ca" = 'Carrot')
d$image1 <- dplyr::recode(d$image1, "Cu" = 'Cucumber')
d$image1 <- dplyr::recode(d$image1, "Tu" = 'Turnip')
d$image1 <- dplyr::recode(d$image1, "Gr" = 'Grape')
d$image1 <- dplyr::recode(d$image1, "To" = 'Tomato')
d$image1 <- dplyr::recode(d$image1, "Ap" = 'Apple')

d$image2 <- dplyr::recode(d$image2, "Ca" = 'Carrot')
d$image2 <- dplyr::recode(d$image2, "Cu" = 'Cucumber')
d$image2 <- dplyr::recode(d$image2, "Tu" = 'Turnip')
d$image2 <- dplyr::recode(d$image2, "Gr" = 'Grape')
d$image2 <- dplyr::recode(d$image2, "To" = 'Tomato')
d$image2 <- dplyr::recode(d$image2, "Ap" = 'Apple')

Sample of the data

d %>% 
  dplyr::sample_n(size=10) %>% 
  kable() %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  kableExtra::scroll_box(width = "100%")
species_type SubjectCode Trial image1 image2 y
Gorilla Gorilla3 2 Grape Cucumber 0
Gorilla Gorilla6 16 Grape Turnip 0
Gorilla Gorilla6 14 Grape Tomato 1
Gorilla Gorilla2 2 Apple Tomato 0
Gorilla Gorilla4 14 Apple Turnip 0
Gorilla Gorilla4 5 Carrot Grape 0
Gorilla Gorilla5 13 Tomato Turnip 0
Gorilla Gorilla5 13 Apple Carrot 0
Gorilla Gorilla6 21 Carrot Grape 1
Gorilla Gorilla5 28 Apple Turnip 0

Now we can run a Bradley-Terry model with the bpcs package

m1_gor <-
  bpc(
    d,
    player0 = 'image1',
    player1 = 'image2',
    result_column = 'y',
    model_type = 'bt',
    priors = list(prior_lambda_std = 0.5),
    iter = 3000
  )
save_bpc_model(m1_gor, 'm1_gor', 'fittedmodels')

Now that we got our posterior we can look at some interesting tables and plots

plot(m1_gor, rotate_x_labels = T)+
  scale_x_discrete(
  labels = c(
    "Turnip",
    "Cucumber",
    "Carrot",
    "Apple",
    "Grape",
    "Tomato"
  ))

Getting a probability table

get_probabilities_table(m1_gor, format='html')
Estimated posterior probabilites
i j i_beats_j j_beats_i
Apple Carrot 0.54 0.46
Apple Cucumber 0.68 0.32
Apple Grape 0.36 0.64
Apple Tomato 0.39 0.61
Apple Turnip 0.74 0.26
Carrot Cucumber 0.64 0.36
Carrot Grape 0.25 0.75
Carrot Tomato 0.26 0.74
Carrot Turnip 0.73 0.27
Cucumber Grape 0.15 0.85
Cucumber Tomato 0.17 0.83
Cucumber Turnip 0.51 0.49
Grape Tomato 0.58 0.42
Grape Turnip 0.83 0.17
Tomato Turnip 0.83 0.17

Automated labeling

Let’s start importing the data

d2 <- read_csv('./data/auto-label-comparison.csv') %>%
  dplyr::rename(Value='value') %>% 
  dplyr::filter(ValueType=='accuracy') %>% 
  dplyr::filter(Model=='LabelPropagation_knn' | 
                Model=='LabelPropagation_rbf' |
                Model=='LabelSpreading_knn' |
                Model=='LabelSpreading_rbf')
## Warning: Missing column names filled in: 'X1' [1]

Lets visualize

d2 %>% 
  dplyr::sample_n(size=10) %>% 
  kable() %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  kableExtra::scroll_box(width = "100%")
X1 ValueType Value Iteration Variable Model Dataset DataType
27 accuracy 0.6967609 7 50% LabelSpreading_rbf digits image
22 accuracy 0.3333333 2 50% LabelSpreading_knn corpus text
23 accuracy 0.1666667 3 50% LabelPropagation_knn reuters text
8 accuracy 0.0181818 8 10% LabelPropagation_rbf cifar image
28 accuracy 0.0004808 8 50% LabelPropagation_rbf cifar2 image
24 accuracy 0.2756910 4 50% LabelPropagation_knn german numeric
4 accuracy 0.0181818 4 10% LabelPropagation_knn mnist image
9 accuracy 0.1666667 9 10% LabelPropagation_knn reuters text
1 accuracy 0.1280959 1 10% LabelPropagation_rbf 20news text
20 accuracy 0.2635986 0 50% LabelPropagation_knn musk1 numeric

Now we need to convert it to paired comparison: First we pivot it longer and them we expand the dataset

d_acc_rank <- d2 %>% 
  dplyr::group_by(Dataset, Variable, Iteration) %>%
  dplyr::mutate(Rank=-rank(Value, ties.method = 'random')) %>% 
  dplyr::ungroup() %>% 
  dplyr::select(-Value) %>% #we need to drop the Value variable to pivot wider
  tidyr::pivot_wider(names_from = Model,
                     values_from=Rank)

Expanding it

models<-unique(d2$Model)
n_models = length(models)
comb <- gtools::combinations(n=n_models, r=2, v=seq(1:n_models), repeats.allowed = F) #all teh paired combinations

d_acc_bt <-  dplyr::tribble(~model0, ~model1, ~y, ~Iteration, ~Dataset, ~DatasetType)

#now we loop each row of the rank wide dataset and create a new one
for(i in 1:nrow(d_acc_rank))
{
  current_row <- d_acc_rank[i,]
  for(j in 1:nrow(comb)){
    comb_row <- comb[j,]
    
    model0 <- models[comb_row[1]]
    model0_rank <- current_row[[1,model0]]
    
    model1 <- models[comb_row[2]]
    model1_rank <- current_row[[1,model1]]
    
    diff_rank <- model1_rank - model0_rank
    #SInce higher accuracy is better if model 1 rank- model 0 rank is positive than model1 wins and y=1 else y=0
    y <- ifelse(diff_rank>0, 1, 0) 
    
    
    d_acc_bt <-d_acc_bt %>%  
      add_row(model0=model0,
              model1=model1,
              y=y,
              Iteration=current_row$Iteration,
              Dataset=current_row$Dataset,
              DatasetType=current_row$DataType)
    
  }
}
d_acc_bt %>% 
  dplyr::sample_n(size=20) %>% 
  kable() %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  kableExtra::scroll_box(width = "100%")
model0 model1 y Iteration Dataset DatasetType
LabelPropagation_rbf LabelSpreading_rbf 1 0 wine numeric
LabelSpreading_knn LabelSpreading_rbf 0 8 wine numeric
LabelPropagation_rbf LabelSpreading_knn 0 5 corpus text
LabelPropagation_rbf LabelSpreading_rbf 0 6 wine numeric
LabelPropagation_knn LabelPropagation_rbf 1 3 cifar image
LabelPropagation_rbf LabelSpreading_rbf 0 0 corpus text
LabelPropagation_knn LabelSpreading_rbf 0 7 20news text
LabelPropagation_knn LabelSpreading_rbf 0 8 reuters text
LabelPropagation_rbf LabelSpreading_knn 0 5 musk1 numeric
LabelSpreading_knn LabelSpreading_rbf 1 9 cifar image
LabelPropagation_knn LabelSpreading_rbf 0 1 20news text
LabelPropagation_rbf LabelSpreading_rbf 1 4 ohsumed text
LabelSpreading_knn LabelSpreading_rbf 1 5 cifar image
LabelSpreading_knn LabelSpreading_rbf 0 1 mnist image
LabelSpreading_knn LabelSpreading_rbf 1 9 iris numeric
LabelPropagation_knn LabelSpreading_knn 0 3 corpus text
LabelPropagation_rbf LabelSpreading_rbf 0 9 cifar2 image
LabelPropagation_knn LabelSpreading_knn 1 8 digits image
LabelPropagation_rbf LabelSpreading_knn 1 9 ohsumed text
LabelPropagation_knn LabelSpreading_knn 1 1 corpus text

Now we can do some paired comparison models

m1_autol <-
  bpc(
    d_acc_bt,
    player0 = 'model0',
    player1 = 'model1',
    result_column = 'y',
    model_type = 'bt-U',
    cluster = c("Dataset"),
    priors = list(prior_lambda_std = 0.5, prior_U1_std=1.0),
    iter = 3000
  )
save_bpc_model(m1_autol, 'm1_autol', 'fittedmodels')
get_probabilities_table(m1_autol, format = 'html', model_type = 'bt')
Estimated posterior probabilites
i j i_beats_j j_beats_i
LabelPropagation_knn LabelPropagation_rbf 0.33 0.67
LabelPropagation_knn LabelSpreading_knn 0.45 0.55
LabelPropagation_knn LabelSpreading_rbf 0.45 0.55
LabelPropagation_rbf LabelSpreading_knn 0.62 0.38
LabelPropagation_rbf LabelSpreading_rbf 0.56 0.44
LabelSpreading_knn LabelSpreading_rbf 0.41 0.59
get_rank_of_players_table(m1_autol, format = 'html')
Estimated posterior ranks
Parameter MedianRank MeanRank StdRank
lambda[LabelPropagation_rbf] 1 1.426 0.756
lambda[LabelPropagation_knn] 3 2.700 0.976
lambda[LabelSpreading_knn] 3 2.812 0.959
lambda[LabelSpreading_rbf] 3 3.062 0.976
plot(m1_autol, rotate_x_labels = T)+
  scale_x_discrete(
  labels = c(
    "LS-RBF",
    "LS-KNN",
    "LP-KNN",
    "LP-RBF"
  ))

Football

The data comes in the bpcs package

data("brasil_soccer_league")
knitr::kable(head(brasil_soccer_league))
Time WeekDay Date HomeTeam VisitorTeam Round Stadium ScoreHomeTeam ScoreVisitorTeam
16:00 Saturday 2017-05-13 Flamengo Atlético-MG 1 Maracanã 1 1
19:00 Saturday 2017-05-13 Corinthians Chapecoense 1 Arena Corinthians 1 1
16:00 Sunday 2017-05-14 Avaí Vitória 1 Ressacada 0 0
16:00 Sunday 2017-05-14 Bahia Athlético-PR 1 Fonte Nova 6 2
16:00 Sunday 2017-05-14 Cruzeiro São Paulo 1 Mineirão 1 0
11:00 Sunday 2017-05-14 Fluminense Santos 1 Maracanã 3 2

Let’s analyze only the data from 2019 and remove a few columns that are not relevant for this example:

d3 <- brasil_soccer_league %>%
  dplyr::filter(Date >= as.Date("2019-01-01") &
                  Date <= as.Date("2019-12-31")) %>%
  dplyr::select(HomeTeam, VisitorTeam, ScoreHomeTeam, ScoreVisitorTeam, Round) 
m_football <- bpc(d3, 
          player0 = 'VisitorTeam', 
          player1 =  'HomeTeam', 
          player0_score = 'ScoreVisitorTeam',
          player1_score = 'ScoreHomeTeam',
          model_type = 'davidson',
          solve_ties = 'none',
          priors = list(prior_lambda_std=2.0), # making a more informative prior to improve convergence
          iter = 3000) #stan indicates a low bulk ESS so we are increasing the number of iterations
save_bpc_model(m_football, 'm_football', 'fittedmodels')

Getting the table with the parameters:

get_parameters_table(m_football, format = 'html')
Parameters estimates
Parameter Mean HPD_lower HPD_higher
lambda[Avaí] -1.912 -3.331 -0.627
lambda[Internacional] 0.162 -1.031 1.440
lambda[Cruzeiro] -0.683 -1.921 0.520
lambda[Botafogo-rj] -0.584 -1.862 0.637
lambda[Vasco] -0.067 -1.286 1.137
lambda[Corinthians] 0.310 -0.867 1.576
lambda[CSA] -1.101 -2.368 0.143
lambda[Goiás] -0.061 -1.335 1.122
lambda[Fortaleza] 0.133 -1.107 1.352
lambda[Santos] 1.070 -0.217 2.291
lambda[Grêmio] 0.631 -0.573 1.912
lambda[Palmeiras] 1.245 -0.011 2.514
lambda[Atlético-MG] -0.048 -1.340 1.133
lambda[Chapecoense] -0.986 -2.280 0.272
lambda[Ceará] -0.729 -2.022 0.433
lambda[Athlético-PR] 0.717 -0.580 1.919
lambda[Flamengo] 2.042 0.696 3.372
lambda[São Paulo] 0.616 -0.654 1.849
lambda[Bahia] -0.129 -1.353 1.098
lambda[Fluminense] -0.378 -1.594 0.903
nu 0.385 0.159 0.624

Ranking the teams

get_rank_of_players_table(m_football, format = 'html')
Estimated posterior ranks
Parameter MedianRank MeanRank StdRank
lambda[Flamengo] 1 1.286 0.692
lambda[Palmeiras] 3 3.195 1.920
lambda[Santos] 3 3.746 1.946
lambda[Athlético-PR] 5 5.633 2.810
lambda[Grêmio] 6 6.027 2.972
lambda[São Paulo] 6 6.022 2.800
lambda[Corinthians] 8 8.073 3.065
lambda[Fortaleza] 9 9.455 3.391
lambda[Internacional] 9 9.252 3.445
lambda[Atlético-MG] 11 10.880 3.472
lambda[Goiás] 11 10.969 3.493
lambda[Vasco] 11 10.975 3.362
lambda[Bahia] 12 11.527 3.395
lambda[Fluminense] 14 13.380 3.363
lambda[Botafogo-rj] 15 14.759 3.008
lambda[Ceará] 16 15.401 2.785
lambda[Cruzeiro] 16 15.271 2.865
lambda[Chapecoense] 18 17.036 2.276
lambda[CSA] 18 17.419 2.145
lambda[Avaí] 20 19.694 0.733

Plot

plot(m_football, rotate_x_labels = T)