Packages

library(bpcs)
library(tidyverse)
library(knitr)
library(kableExtra)
options(mc.cores = parallel::detectCores())
rstan::rstan_options(auto_write = TRUE)
set.seed(99)

Gorillas using touchscreen

Huskisson, S.M., Jacobson, S.L., Egelkamp, C.L. et al. Using a Touchscreen Paradigm to Evaluate Food Preferences and Response to Novel Photographic Stimuli of Food in Three Primate Species (Gorilla gorilla gorilla, Pan troglodytes, and Macaca fuscata). Int J Primatol 41, 5–23 (2020). https://doi.org/10.1007/s10764-020-00131-0

Let’s import the data and select only the gorillas

d <- readr::read_csv('data/touchscreen.csv') %>% 
  dplyr::filter(species_type=='Gorilla') %>% 
  dplyr::select(species_type, SubjectCode, Trial, image1, image2, image_chosen)

Let’s create a column that says which image was selected 1 for image2 0 for image 1

d <- d %>% 
  dplyr::mutate(y =ifelse(image_chosen==image1, 0, 1)) %>% 
  select(-image_chosen)

Let’s also do some recoding to facilitate interpreting

d$image1 <- dplyr::recode(d$image1, "Ca" = 'Carrot')
d$image1 <- dplyr::recode(d$image1, "Cu" = 'Cucumber')
d$image1 <- dplyr::recode(d$image1, "Tu" = 'Turnip')
d$image1 <- dplyr::recode(d$image1, "Gr" = 'Grape')
d$image1 <- dplyr::recode(d$image1, "To" = 'Tomato')
d$image1 <- dplyr::recode(d$image1, "Ap" = 'Apple')

d$image2 <- dplyr::recode(d$image2, "Ca" = 'Carrot')
d$image2 <- dplyr::recode(d$image2, "Cu" = 'Cucumber')
d$image2 <- dplyr::recode(d$image2, "Tu" = 'Turnip')
d$image2 <- dplyr::recode(d$image2, "Gr" = 'Grape')
d$image2 <- dplyr::recode(d$image2, "To" = 'Tomato')
d$image2 <- dplyr::recode(d$image2, "Ap" = 'Apple')

Sample of the data

d %>% 
  dplyr::sample_n(size=10) %>% 
  kable() %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  kableExtra::scroll_box(width = "100%")

species_type	SubjectCode	Trial	image1	image2	y
Gorilla	Gorilla3	2	Grape	Cucumber	0
Gorilla	Gorilla6	16	Grape	Turnip	0
Gorilla	Gorilla6	14	Grape	Tomato	1
Gorilla	Gorilla2	2	Apple	Tomato	0
Gorilla	Gorilla4	14	Apple	Turnip	0
Gorilla	Gorilla4	5	Carrot	Grape	0
Gorilla	Gorilla5	13	Tomato	Turnip	0
Gorilla	Gorilla5	13	Apple	Carrot	0
Gorilla	Gorilla6	21	Carrot	Grape	1
Gorilla	Gorilla5	28	Apple	Turnip	0

Now we can run a Bradley-Terry model with the bpcs package

m1_gor <-
  bpc(
    d,
    player0 = 'image1',
    player1 = 'image2',
    result_column = 'y',
    model_type = 'bt',
    priors = list(prior_lambda_std = 0.5),
    iter = 3000
  )
save_bpc_model(m1_gor, 'm1_gor', 'fittedmodels')

Now that we got our posterior we can look at some interesting tables and plots

plot(m1_gor, rotate_x_labels = T)+
  scale_x_discrete(
  labels = c(
    "Turnip",
    "Cucumber",
    "Carrot",
    "Apple",
    "Grape",
    "Tomato"
  ))

Getting a probability table

get_probabilities_table(m1_gor, format='html')

Estimated posterior probabilites
i	j	i_beats_j	j_beats_i
Apple	Carrot	0.54	0.46
Apple	Cucumber	0.68	0.32
Apple	Grape	0.36	0.64
Apple	Tomato	0.39	0.61
Apple	Turnip	0.74	0.26
Carrot	Cucumber	0.64	0.36
Carrot	Grape	0.25	0.75
Carrot	Tomato	0.26	0.74
Carrot	Turnip	0.73	0.27
Cucumber	Grape	0.15	0.85
Cucumber	Tomato	0.17	0.83
Cucumber	Turnip	0.51	0.49
Grape	Tomato	0.58	0.42
Grape	Turnip	0.83	0.17
Tomato	Turnip	0.83	0.17

Automated labeling

Let’s start importing the data

d2 <- read_csv('./data/auto-label-comparison.csv') %>%
  dplyr::rename(Value='value') %>% 
  dplyr::filter(ValueType=='accuracy') %>% 
  dplyr::filter(Model=='LabelPropagation_knn' | 
                Model=='LabelPropagation_rbf' |
                Model=='LabelSpreading_knn' |
                Model=='LabelSpreading_rbf')

## Warning: Missing column names filled in: 'X1' [1]

Lets visualize

d2 %>% 
  dplyr::sample_n(size=10) %>% 
  kable() %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  kableExtra::scroll_box(width = "100%")

X1	ValueType	Value	Iteration	Variable	Model	Dataset	DataType
27	accuracy	0.6967609	7	50%	LabelSpreading_rbf	digits	image
22	accuracy	0.3333333	2	50%	LabelSpreading_knn	corpus	text
23	accuracy	0.1666667	3	50%	LabelPropagation_knn	reuters	text
8	accuracy	0.0181818	8	10%	LabelPropagation_rbf	cifar	image
28	accuracy	0.0004808	8	50%	LabelPropagation_rbf	cifar2	image
24	accuracy	0.2756910	4	50%	LabelPropagation_knn	german	numeric
4	accuracy	0.0181818	4	10%	LabelPropagation_knn	mnist	image
9	accuracy	0.1666667	9	10%	LabelPropagation_knn	reuters	text
1	accuracy	0.1280959	1	10%	LabelPropagation_rbf	20news	text
20	accuracy	0.2635986	0	50%	LabelPropagation_knn	musk1	numeric

Now we need to convert it to paired comparison: First we pivot it longer and them we expand the dataset

d_acc_rank <- d2 %>% 
  dplyr::group_by(Dataset, Variable, Iteration) %>%
  dplyr::mutate(Rank=-rank(Value, ties.method = 'random')) %>% 
  dplyr::ungroup() %>% 
  dplyr::select(-Value) %>% #we need to drop the Value variable to pivot wider
  tidyr::pivot_wider(names_from = Model,
                     values_from=Rank)

Expanding it

models<-unique(d2$Model)
n_models = length(models)
comb <- gtools::combinations(n=n_models, r=2, v=seq(1:n_models), repeats.allowed = F) #all teh paired combinations

d_acc_bt <-  dplyr::tribble(~model0, ~model1, ~y, ~Iteration, ~Dataset, ~DatasetType)

#now we loop each row of the rank wide dataset and create a new one
for(i in 1:nrow(d_acc_rank))
{
  current_row <- d_acc_rank[i,]
  for(j in 1:nrow(comb)){
    comb_row <- comb[j,]
    
    model0 <- models[comb_row[1]]
    model0_rank <- current_row[[1,model0]]
    
    model1 <- models[comb_row[2]]
    model1_rank <- current_row[[1,model1]]
    
    diff_rank <- model1_rank - model0_rank
    #SInce higher accuracy is better if model 1 rank- model 0 rank is positive than model1 wins and y=1 else y=0
    y <- ifelse(diff_rank>0, 1, 0) 
    
    
    d_acc_bt <-d_acc_bt %>%  
      add_row(model0=model0,
              model1=model1,
              y=y,
              Iteration=current_row$Iteration,
              Dataset=current_row$Dataset,
              DatasetType=current_row$DataType)
    
  }
}

d_acc_bt %>% 
  dplyr::sample_n(size=20) %>% 
  kable() %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  kableExtra::scroll_box(width = "100%")

model0	model1	y	Iteration	Dataset	DatasetType
LabelPropagation_rbf	LabelSpreading_rbf	1	0	wine	numeric
LabelSpreading_knn	LabelSpreading_rbf	0	8	wine	numeric
LabelPropagation_rbf	LabelSpreading_knn	0	5	corpus	text
LabelPropagation_rbf	LabelSpreading_rbf	0	6	wine	numeric
LabelPropagation_knn	LabelPropagation_rbf	1	3	cifar	image
LabelPropagation_rbf	LabelSpreading_rbf	0	0	corpus	text
LabelPropagation_knn	LabelSpreading_rbf	0	7	20news	text
LabelPropagation_knn	LabelSpreading_rbf	0	8	reuters	text
LabelPropagation_rbf	LabelSpreading_knn	0	5	musk1	numeric
LabelSpreading_knn	LabelSpreading_rbf	1	9	cifar	image
LabelPropagation_knn	LabelSpreading_rbf	0	1	20news	text
LabelPropagation_rbf	LabelSpreading_rbf	1	4	ohsumed	text
LabelSpreading_knn	LabelSpreading_rbf	1	5	cifar	image
LabelSpreading_knn	LabelSpreading_rbf	0	1	mnist	image
LabelSpreading_knn	LabelSpreading_rbf	1	9	iris	numeric
LabelPropagation_knn	LabelSpreading_knn	0	3	corpus	text
LabelPropagation_rbf	LabelSpreading_rbf	0	9	cifar2	image
LabelPropagation_knn	LabelSpreading_knn	1	8	digits	image
LabelPropagation_rbf	LabelSpreading_knn	1	9	ohsumed	text
LabelPropagation_knn	LabelSpreading_knn	1	1	corpus	text

Now we can do some paired comparison models

m1_autol <-
  bpc(
    d_acc_bt,
    player0 = 'model0',
    player1 = 'model1',
    result_column = 'y',
    model_type = 'bt-U',
    cluster = c("Dataset"),
    priors = list(prior_lambda_std = 0.5, prior_U1_std=1.0),
    iter = 3000
  )
save_bpc_model(m1_autol, 'm1_autol', 'fittedmodels')

get_probabilities_table(m1_autol, format = 'html', model_type = 'bt')

Estimated posterior probabilites
i	j	i_beats_j	j_beats_i
LabelPropagation_knn	LabelPropagation_rbf	0.33	0.67
LabelPropagation_knn	LabelSpreading_knn	0.45	0.55
LabelPropagation_knn	LabelSpreading_rbf	0.45	0.55
LabelPropagation_rbf	LabelSpreading_knn	0.62	0.38
LabelPropagation_rbf	LabelSpreading_rbf	0.56	0.44
LabelSpreading_knn	LabelSpreading_rbf	0.41	0.59

get_rank_of_players_table(m1_autol, format = 'html')

Estimated posterior ranks
Parameter	MedianRank	MeanRank	StdRank
lambda[LabelPropagation_rbf]	1	1.426	0.756
lambda[LabelPropagation_knn]	3	2.700	0.976
lambda[LabelSpreading_knn]	3	2.812	0.959
lambda[LabelSpreading_rbf]	3	3.062	0.976

plot(m1_autol, rotate_x_labels = T)+
  scale_x_discrete(
  labels = c(
    "LS-RBF",
    "LS-KNN",
    "LP-KNN",
    "LP-RBF"
  ))

Football

The data comes in the bpcs package

data("brasil_soccer_league")
knitr::kable(head(brasil_soccer_league))

Time	WeekDay	Date	HomeTeam	VisitorTeam	Round	Stadium	ScoreHomeTeam	ScoreVisitorTeam
16:00	Saturday	2017-05-13	Flamengo	Atlético-MG	1	Maracanã	1	1
19:00	Saturday	2017-05-13	Corinthians	Chapecoense	1	Arena Corinthians	1	1
16:00	Sunday	2017-05-14	Avaí	Vitória	1	Ressacada	0	0
16:00	Sunday	2017-05-14	Bahia	Athlético-PR	1	Fonte Nova	6	2
16:00	Sunday	2017-05-14	Cruzeiro	São Paulo	1	Mineirão	1	0
11:00	Sunday	2017-05-14	Fluminense	Santos	1	Maracanã	3	2

Let’s analyze only the data from 2019 and remove a few columns that are not relevant for this example:

d3 <- brasil_soccer_league %>%
  dplyr::filter(Date >= as.Date("2019-01-01") &
                  Date <= as.Date("2019-12-31")) %>%
  dplyr::select(HomeTeam, VisitorTeam, ScoreHomeTeam, ScoreVisitorTeam, Round)

m_football <- bpc(d3, 
          player0 = 'VisitorTeam', 
          player1 =  'HomeTeam', 
          player0_score = 'ScoreVisitorTeam',
          player1_score = 'ScoreHomeTeam',
          model_type = 'davidson',
          solve_ties = 'none',
          priors = list(prior_lambda_std=2.0), # making a more informative prior to improve convergence
          iter = 3000) #stan indicates a low bulk ESS so we are increasing the number of iterations
save_bpc_model(m_football, 'm_football', 'fittedmodels')

Getting the table with the parameters:

get_parameters_table(m_football, format = 'html')

Parameters estimates
Parameter	Mean	HPD_lower	HPD_higher
lambda[Avaí]	-1.912	-3.331	-0.627
lambda[Internacional]	0.162	-1.031	1.440
lambda[Cruzeiro]	-0.683	-1.921	0.520
lambda[Botafogo-rj]	-0.584	-1.862	0.637
lambda[Vasco]	-0.067	-1.286	1.137
lambda[Corinthians]	0.310	-0.867	1.576
lambda[CSA]	-1.101	-2.368	0.143
lambda[Goiás]	-0.061	-1.335	1.122
lambda[Fortaleza]	0.133	-1.107	1.352
lambda[Santos]	1.070	-0.217	2.291
lambda[Grêmio]	0.631	-0.573	1.912
lambda[Palmeiras]	1.245	-0.011	2.514
lambda[Atlético-MG]	-0.048	-1.340	1.133
lambda[Chapecoense]	-0.986	-2.280	0.272
lambda[Ceará]	-0.729	-2.022	0.433
lambda[Athlético-PR]	0.717	-0.580	1.919
lambda[Flamengo]	2.042	0.696	3.372
lambda[São Paulo]	0.616	-0.654	1.849
lambda[Bahia]	-0.129	-1.353	1.098
lambda[Fluminense]	-0.378	-1.594	0.903
nu	0.385	0.159	0.624

Ranking the teams

get_rank_of_players_table(m_football, format = 'html')

Estimated posterior ranks
Parameter	MedianRank	MeanRank	StdRank
lambda[Flamengo]	1	1.286	0.692
lambda[Palmeiras]	3	3.195	1.920
lambda[Santos]	3	3.746	1.946
lambda[Athlético-PR]	5	5.633	2.810
lambda[Grêmio]	6	6.027	2.972
lambda[São Paulo]	6	6.022	2.800
lambda[Corinthians]	8	8.073	3.065
lambda[Fortaleza]	9	9.455	3.391
lambda[Internacional]	9	9.252	3.445
lambda[Atlético-MG]	11	10.880	3.472
lambda[Goiás]	11	10.969	3.493
lambda[Vasco]	11	10.975	3.362
lambda[Bahia]	12	11.527	3.395
lambda[Fluminense]	14	13.380	3.363
lambda[Botafogo-rj]	15	14.759	3.008
lambda[Ceará]	16	15.401	2.785
lambda[Cruzeiro]	16	15.271	2.865
lambda[Chapecoense]	18	17.036	2.276
lambda[CSA]	18	17.419	2.145
lambda[Avaí]	20	19.694	0.733

Plot

plot(m_football, rotate_x_labels = T)

Environment

sessionInfo()

## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.2.1 knitr_1.30       forcats_0.5.0    stringr_1.4.0   
##  [5] dplyr_1.0.2      purrr_0.3.4      readr_1.3.1      tidyr_1.1.2     
##  [9] tibble_3.0.4     ggplot2_3.3.3    tidyverse_1.3.0  bpcs_1.1.0      
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.2            viridisLite_0.3.0     jsonlite_1.7.2       
##  [4] modelr_0.1.8          RcppParallel_5.0.2    StanHeaders_2.21.0-7 
##  [7] assertthat_0.2.1      highr_0.8             BiocManager_1.30.10  
## [10] rvcheck_0.1.8         stats4_4.0.3          pander_0.6.3         
## [13] blob_1.2.1            cellranger_1.1.0      yaml_2.2.1           
## [16] pillar_1.4.7          backports_1.2.1       glue_1.4.2           
## [19] jtools_2.1.2          digest_0.6.27         RColorBrewer_1.1-2   
## [22] rvest_0.3.6           colorspace_2.0-0      htmltools_0.5.1      
## [25] pkgconfig_2.0.3       rstan_2.21.2          broom_0.7.0          
## [28] haven_2.3.1           webshot_0.5.2         scales_1.1.1         
## [31] processx_3.4.5        farver_2.0.3          generics_0.1.0       
## [34] ellipsis_0.3.1        withr_2.3.0           cli_2.2.0            
## [37] magrittr_2.0.1        crayon_1.3.4          readxl_1.3.1         
## [40] evaluate_0.14         ps_1.5.0              badger_0.0.9         
## [43] fs_1.5.0              fansi_0.4.1           xml2_1.3.2           
## [46] pkgbuild_1.2.0        tools_4.0.3           loo_2.4.1            
## [49] prettyunits_1.1.1     hms_0.5.3             lifecycle_0.2.0      
## [52] matrixStats_0.57.0    V8_3.4.0              munsell_0.5.0        
## [55] reprex_0.3.0          callr_3.5.1           compiler_4.0.3       
## [58] rlang_0.4.10          grid_4.0.3            rstudioapi_0.13      
## [61] labeling_0.4.2        rmarkdown_2.6         gtable_0.3.0         
## [64] codetools_0.2-16      inline_0.3.17         DBI_1.1.0            
## [67] curl_4.3              R6_2.5.0              gridExtra_2.3        
## [70] rstantools_2.1.1.9000 lubridate_1.7.9       dlstats_0.1.3        
## [73] stringi_1.5.3         parallel_4.0.3        Rcpp_1.0.5           
## [76] vctrs_0.3.6           dbplyr_1.4.4          tidyselect_1.1.0     
## [79] xfun_0.20

SE Seminar

David Issa Mattos

1/26/2021

Packages

Gorillas using touchscreen

Automated labeling

Football

Environment