# Define a function for formatting numbers
<- function(x, d = 2) format(x, digits = d, big.mark = ",") comma
ROC and MTMM
library(readxl) # for reading excel data
library(haven) # for reading SPSS data
library(ROCit) # for ROC analysis
library(psych)
library(tidyverse)
library(modelsummary) # for summarizing data
Classification Accuracy
Data analyzed in this paper: https://journals.sagepub.com/doi/10.1177/07342829211067128
<- tempfile(fileext = "xlsx") # temporary file
tmp_path download.file("https://osf.io/download/tsm7x/", destfile = tmp_path)
<- readxl::read_xlsx(tmp_path, na = "NA")
dat1 $asd <- recode(
dat1$`Diagnosis(1=Non-typically developing; 2=ASD; 3=Neurotypical)`,
dat1`1` = "Non-TD",
`2` = "ASD",
`3` = "TD"
)# Filter out Neurotypical
<- filter(dat1, asd != "TD") dat1_sub
First, sum the 16 items. Note that the sum is missing if any one item is missing
$ADEC <- rowSums(select(dat1_sub, ADEC_I01:ADEC_I16)) # NA if any item is NA
dat1_sub# Or treat missing as 0
$ADEC_rm_na <- rowSums(select(dat1_sub, ADEC_I01:ADEC_I16), na.rm = TRUE) dat1_sub
Now, we can look at the distribution of ADEC sum score by diagnosis:
ggplot(dat1_sub, aes(x = ADEC)) +
geom_histogram() +
facet_wrap(~ asd)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 12 rows containing non-finite values (`stat_bin()`).
Classification Table
Let’s first use ADEC = 11 as cutoff. We then get the following contingency table
with(dat1_sub,
table(ADEC >= 11, asd))
asd
ASD Non-TD
FALSE 13 68
TRUE 86 13
So the accuracy with this cutoff is (86 + 68) / 180 = 0.86. The sensitivity is 86 / (86 + 13) = 0.87, and the specificity is 68 / (68 + 13) = 0.84.
ROC
We can plot an ROC curve
<- function(
compute_sens x = dat1_sub$ADEC,
cut, crit = dat1_sub$asd == "ASD") {
<- sum(x >= cut & crit, na.rm = TRUE)
tp <- sum(x < cut & crit, na.rm = TRUE)
fn / (tp + fn)
tp
}<- function(
compute_spec x = dat1_sub$ADEC,
cut, crit = dat1_sub$asd == "ASD") {
<- sum(x < cut & !crit, na.rm = TRUE)
tn <- sum(x >= cut & !crit, na.rm = TRUE)
fp / (tn + fp)
tn
}<- lapply(32:0, compute_sens) |>
sensitivity unlist()
<- lapply(32:0, compute_spec) |>
specificity unlist()
data.frame(sensitivity, fpr = 1 - specificity) |>
ggplot(aes(x = fpr, y = sensitivity)) +
geom_point() +
geom_line()
# AUC
<- c(diff(1 - specificity), 0)
dfpr <- c(diff(sensitivity), 0)
dsens sum(sensitivity * dfpr) + sum(dsens * dfpr) / 2
[1] 0.920626
You can also use the ROCit package
<- rocit(dat1_sub$ADEC, class = dat1_sub$asd == "ASD") roc_adec
Warning in rocit(dat1_sub$ADEC, class = dat1_sub$asd == "ASD"): NA(s) in score
and/or class, removed from the data.
plot(roc_adec)
summary(roc_adec)
Method used: empirical
Number of positive(s): 99
Number of negative(s): 81
Area under curve: 0.9206
ciAUC(roc_adec)
estimated AUC : 0.920626013218606
AUC estimation method : empirical
CI of AUC
confidence level = 95%
lower = 0.880257283328194 upper = 0.960994743109017
MTMM
Data from this paper: https://drive.google.com/file/d/1O4_MVWchC5kyCaf5p7GH4AjRIrY8zHJ8/
<- haven::read_sav("https://osf.io/download/5ayjb/")
dat <- haven::read_sav("https://osf.io/download/3uecq/") dat_red
The data set contains scale scores for Neuroticism (N), Extraversion (E), Openness (O), Agreeableness (A), and Conscientiousness (C) with four methods of assessment: Self-reports (SR), from a close-other who is a female (RF), from a close-other who is a male (RM), implicit association tests (IAT), and behavioral assessment (BIH). You can see the labels in the SPSS data set using
# Loop over each column, and call `attr(which = "label")` for each column
# to obtain the label stored as an attribute
<- lapply(dat_red, FUN = attr, which = "label")
dat_labels # Put things into a table (tibble format)
tibble(variable = names(dat_red), label = unlist(dat_labels))
To make the size of the matrix a bit more manageable, we will select only E, A, and C for illustration, and with the SR, RF, and IAT methods.
Descriptive Statistics
Here are the descriptives:
<- c("ESR", "ASR", "CSR", "ERF", "ARF", "CRF", "EIAT", "AIAT", "CIAT")
var_names datasummary_skim(dat_red[var_names])
Unique (#) | Missing (%) | Mean | SD | Min | Median | Max | ||
---|---|---|---|---|---|---|---|---|
ESR | 73 | 0 | 111.5 | 23.1 | 41.0 | 112.0 | 163.0 | |
ASR | 79 | 0 | 114.9 | 24.3 | 35.0 | 116.5 | 163.0 | |
CSR | 73 | 0 | 126.7 | 24.2 | 60.0 | 131.5 | 175.0 | |
ERF | 63 | 0 | 111.7 | 21.0 | 47.0 | 112.0 | 159.0 | |
ARF | 69 | 0 | 122.1 | 20.6 | 57.0 | 124.0 | 162.0 | |
CRF | 67 | 0 | 140.7 | 22.5 | 75.0 | 141.0 | 182.0 | |
EIAT | 146 | 0 | 0.2 | 0.4 | −1.1 | 0.2 | 1.1 | |
AIAT | 146 | 0 | 0.5 | 0.4 | −0.5 | 0.5 | 1.3 | |
CIAT | 146 | 0 | 0.5 | 0.4 | −1.0 | 0.5 | 1.6 |
MTMM Matrix
datasummary_correlation(dat_red[var_names])
ESR | ASR | CSR | ERF | ARF | CRF | EIAT | AIAT | CIAT | |
---|---|---|---|---|---|---|---|---|---|
ESR | 1 | . | . | . | . | . | . | . | . |
ASR | .19 | 1 | . | . | . | . | . | . | . |
CSR | .20 | .31 | 1 | . | . | . | . | . | . |
ERF | .60 | .26 | .08 | 1 | . | . | . | . | . |
ARF | .08 | .57 | .09 | .24 | 1 | . | . | . | . |
CRF | .06 | .17 | .46 | .28 | .30 | 1 | . | . | . |
EIAT | .07 | −.06 | .10 | .01 | −.03 | −.01 | 1 | . | . |
AIAT | −.03 | −.12 | −.05 | −.19 | −.03 | .02 | .11 | 1 | . |
CIAT | .01 | .03 | .08 | −.12 | .01 | .06 | .05 | .28 | 1 |