Exploratory Factor Analysis

library(tidyverse)
library(psych)
library(dagitty)
library(plotly)
library(lavaan)
library(EFAtools)
library(flextable)

Path Diagram

Personally, it may be easier to use powerpoint or other tools with a graphical interface. But if you want to do it with code, here is an example with Graphviz.

Graphviz

digraph FactorModel {
    node [shape=circle,fontsize=12];
    F1; F2;

    node [shape=rectangle];
    x1; x2; x3; x4; x5; x6;

    edge [fontsize=10];
    F2 -> F1 [label="r" constraint=false dir="both"];
    F1 -> x1 [label="a"];
    F1 -> x2 [label="b"];
    F1 -> x3 [label="c"];
    F1 -> x4 [label="d"]
    F2 -> x4 [label="e"];
    F2 -> x5 [label="f"];
    F2 -> x6 [label="g"];

    node [shape=none,fontsize=10];
    u1 -> x1;
    u2 -> x2;
    u3 -> x3;
    u4 -> x4;
    u5 -> x5;
    u6 -> x6;

    {rank=same; x1 x2 x3 x4 x5 x6}
    {rank=min; F1 F2}
    {rank=max; u1 u2 u3 u4 u5 u6}
}

Path diagram of a two-factor model.

DAG

A factor model can also be represented as a directed acyclic graph (DAG). The dagitty package can show the vanishing tetrad assumptions:

dag1 <- dagitty("dag{ F1 -> {1 2 3 4}; F2 -> {4 5 6}; F1 <-> F2 }")
latents(dag1) <- c("F1", "F2")
coordinates(dag1) <- list(
    x = c(F1 = 0, F2 = 2, `1` = -1.5, `2` = -0.5, `3` = 0.5, `4` = 1.5,
          `5` = 2.5, `6` = 3.5),
    y = c(F1 = 0, F2 = 0, `1` = 1, `2` = 1, `3` = 1, `4` = 1,
          `5` = 1, `6` = 1)
)
plot(dag1)

vanishingTetrads(dag1)

      [,1] [,2] [,3] [,4]
 [1,] "1"  "3"  "4"  "2" 
 [2,] "1"  "2"  "4"  "3" 
 [3,] "1"  "2"  "3"  "4" 
 [4,] "1"  "3"  "5"  "2" 
 [5,] "1"  "2"  "5"  "3" 
 [6,] "1"  "2"  "3"  "5" 
 [7,] "1"  "3"  "6"  "2" 
 [8,] "1"  "2"  "6"  "3" 
 [9,] "1"  "2"  "3"  "6" 
[10,] "1"  "4"  "5"  "2" 
[11,] "1"  "4"  "6"  "2" 
[12,] "1"  "5"  "6"  "2" 
[13,] "1"  "4"  "5"  "3" 
[14,] "1"  "4"  "6"  "3" 
[15,] "1"  "5"  "6"  "3" 
[16,] "1"  "5"  "6"  "4" 
[17,] "2"  "4"  "5"  "3" 
[18,] "2"  "4"  "6"  "3" 
[19,] "2"  "5"  "6"  "3" 
[20,] "2"  "5"  "6"  "4" 
[21,] "3"  "5"  "6"  "4"

Correlation Matrix

# Covariance
cov_oa <- cov(
    select(bfi, O1:O5, A1:A5),
    # Listwise deletion
    use = "complete"
)
# Correlation
corr_oa <- cor(
    select(bfi, O1:O5, A1:A5),
    # Listwise deletion
    use = "complete"
)
round(corr_oa, digits = 2)

      O1    O2    O3    O4    O5    A1    A2    A3    A4    A5
O1  1.00 -0.22  0.39  0.19 -0.24  0.01  0.13  0.15  0.06  0.16
O2 -0.22  1.00 -0.28 -0.08  0.33  0.07  0.01  0.00  0.04 -0.01
O3  0.39 -0.28  1.00  0.19 -0.32 -0.06  0.16  0.23  0.07  0.24
O4  0.19 -0.08  0.19  1.00 -0.18 -0.08  0.08  0.04 -0.04  0.02
O5 -0.24  0.33 -0.32 -0.18  1.00  0.11 -0.09 -0.06  0.02 -0.06
A1  0.01  0.07 -0.06 -0.08  0.11  1.00 -0.34 -0.27 -0.15 -0.19
A2  0.13  0.01  0.16  0.08 -0.09 -0.34  1.00  0.49  0.33  0.39
A3  0.15  0.00  0.23  0.04 -0.06 -0.27  0.49  1.00  0.37  0.51
A4  0.06  0.04  0.07 -0.04  0.02 -0.15  0.33  0.37  1.00  0.31
A5  0.16 -0.01  0.24  0.02 -0.06 -0.19  0.39  0.51  0.31  1.00

Polychoric Correlation

The items are technically ordinal, but with six points it is usually less a problem to just use Pearson correlation. We can use the lavaan package to get the polychoric correlation.

pcorr_oa <- lavaan::lavCor(select(bfi, O1:O5, A1:A5), ordered = TRUE)
pcorr_oa

       O1     O2     O3     O4     O5     A1     A2     A3     A4     A5
O1  1.000                                                               
O2 -0.273  1.000                                                        
O3  0.450 -0.337  1.000                                                 
O4  0.258 -0.106  0.255  1.000                                          
O5 -0.304  0.373 -0.381 -0.250  1.000                                   
A1 -0.010  0.079 -0.086 -0.100  0.134  1.000                            
A2  0.163 -0.002  0.189  0.109 -0.114 -0.411  1.000                     
A3  0.187 -0.029  0.270  0.065 -0.089 -0.327  0.565  1.000              
A4  0.071  0.035  0.082 -0.049  0.026 -0.182  0.390  0.422  1.000       
A5  0.195 -0.018  0.275  0.028 -0.074 -0.234  0.453  0.576  0.358  1.000

Heat Map

corr_oa |>
    as_tibble(rownames = "Var1") |>
    pivot_longer(cols = -Var1, names_to = "Var2") |>
    mutate(across(1:2, .fns = as.ordered)) |>
    filter(Var2 > Var1) |>
    ggplot(aes(x = Var2, y = Var1, fill = value)) +
    geom_tile(color = "white") +
    geom_text(aes(Var2, Var1, label = rmlead0(value)), color = "black", size = 4) +
    scale_fill_gradient2(low = "blue", high = "red", limit = c(-1, 1), name = "Pearson\nCorrelation") +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank(),
          panel.background = element_blank()) +
    coord_fixed()

Factor Extraction

Eigen-Decomposition

A good video to explain this: https://www.youtube.com/watch?v=86ODrk1nB-g

# Simulate 3-D Data
lambda_mat <- matrix(c(.8, .7, .5, .1, .5, .4), nrow = 3)
sigma_mat <- tcrossprod(lambda_mat)
diag(sigma_mat) <- 1
y <- MASS::mvrnorm(500, mu = rep(0, 3), Sigma = sigma_mat, empirical = TRUE)

# library(scatterplot3d)
# scatterplot3d(y)
dfy <- as.data.frame(y)
names(dfy) <- c("y1", "y2", "y3")
plot_ly(dfy, x = ~ y1, y = ~ y2, z = ~ y3)

No trace type specified:
  Based on info supplied, a 'scatter3d' trace seems appropriate.
  Read more about this trace type -> https://plotly.com/r/reference/#scatter3d

No scatter3d mode specifed:
  Setting the mode to markers
  Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Estimating Communalities

Squared multiple correlations (\(R^2\))

Use squared multiple \(R^2\) (not efficient)

# E.g., Regress A1 on all other variables
summary(lm(A1 ~ ., data = select(bfi, O1:O5, A1:A5)))


Call:
lm(formula = A1 ~ ., data = select(bfi, O1:O5, A1:A5))

Residuals:
    Min      1Q  Median      3Q     Max 
-3.5113 -0.9180 -0.3207  0.7330  4.5763 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.95198    0.20993  18.825  < 2e-16 ***
O1           0.12114    0.02488   4.869 1.19e-06 ***
O2           0.07446    0.01761   4.228 2.44e-05 ***
O3           0.04967    0.02429   2.045  0.04098 *  
O4          -0.06568    0.02182  -3.010  0.00263 ** 
O5           0.08494    0.02114   4.018 6.03e-05 ***
A2          -0.32318    0.02565 -12.599  < 2e-16 ***
A3          -0.13979    0.02469  -5.662 1.66e-08 ***
A4          -0.02005    0.01885  -1.063  0.28765    
A5          -0.03240    0.02404  -1.348  0.17780    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.295 on 2637 degrees of freedom
  (153 observations deleted due to missingness)
Multiple R-squared:  0.1541,    Adjusted R-squared:  0.1512 
F-statistic: 53.37 on 9 and 2637 DF,  p-value: < 2.2e-16

# E.g., Regress O1 on all other variables
summary(lm(O1 ~ ., data = select(bfi, O1:O5, A1:A5)))


Call:
lm(formula = O1 ~ ., data = select(bfi, O1:O5, A1:A5))

Residuals:
    Min      1Q  Median      3Q     Max 
-4.5570 -0.5727  0.1064  0.7152  2.6925 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.730613   0.165923  16.457  < 2e-16 ***
O2          -0.080779   0.013678  -5.906 3.96e-09 ***
O3           0.264794   0.018230  14.525  < 2e-16 ***
O4           0.098933   0.016922   5.846 5.64e-09 ***
O5          -0.082739   0.016445  -5.031 5.20e-07 ***
A1           0.073559   0.015107   4.869 1.19e-06 ***
A2           0.052696   0.020556   2.564  0.01042 *  
A3           0.032197   0.019345   1.664  0.09616 .  
A4           0.008584   0.014692   0.584  0.55911    
A5           0.049964   0.018712   2.670  0.00763 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.009 on 2637 degrees of freedom
  (153 observations deleted due to missingness)
Multiple R-squared:  0.2031,    Adjusted R-squared:  0.2004 
F-statistic: 74.67 on 9 and 2637 DF,  p-value: < 2.2e-16

A computational short cut is to use the inverse of the correlation matrix

1 - 1 / diag(solve(corr_oa))

       O1        O2        O3        O4        O5        A1        A2        A3 
0.2030977 0.1625871 0.2766737 0.0766234 0.1910893 0.1540951 0.3401059 0.4002928 
       A4        A5 
0.1885574 0.3138769

Iterative method

Here is sample code for principal axis factoring (using eigen)

rcorr_oa <- corr_oa # initialize reduced correlation
diag(rcorr_oa) <- 1 - 1 / diag(solve(corr_oa))  # replace communality
q <- 2  # number of factors
eps <- .0001  # convergence criterion
while (TRUE) {
    eigen_r <- eigen(rcorr_oa, symmetric = TRUE)
    lambdat <- t(eigen_r$vectors[, seq_len(q)]) * sqrt(eigen_r$values[seq_len(q)])
    # communality = sum of squared loadings
    new_h <- colSums(lambdat^2)
    if (max(abs(new_h - diag(rcorr_oa))) > eps) {
        diag(rcorr_oa) <- new_h
    } else {
        break  # stop when convergence is met
    }
}
# Estimates with the iterative method
diag(rcorr_oa)

        O1         O2         O3         O4         O5         A1         A2 
0.28265008 0.23396855 0.43541770 0.08877585 0.29766819 0.13249392 0.45364581 
        A3         A4         A5 
0.58815163 0.25098540 0.39616917

The final estimates are different from the initial ones.

Loading Matrix

# From our for loop for PAF
t(lambdat)

            [,1]       [,2]
 [1,]  0.3548181 -0.3959189
 [2,] -0.1755868  0.4506955
 [3,]  0.4741739 -0.4589373
 [4,]  0.1651205 -0.2480128
 [5,] -0.2778184  0.4695370
 [6,] -0.3486927 -0.1044201
 [7,]  0.6303503  0.2372005
 [8,]  0.7164531  0.2737558
 [9,]  0.4173173  0.2771697
[10,]  0.6018286  0.1842694

# Compared to `psych::fa()`
fa_unrotated <- psych::fa(
    select(bfi, O1:O5, A1:A5),
    nfactors = 2,
    rotate = "none",
    fm = "pa")
fa_unrotated$loadings


Loadings:
   PA1    PA2   
O1  0.356 -0.392
O2 -0.166  0.442
O3  0.472 -0.453
O4  0.163 -0.244
O5 -0.270  0.476
A1 -0.345       
A2  0.631  0.231
A3  0.705  0.271
A4  0.415  0.270
A5  0.601  0.187

                 PA1   PA2
SS loadings    2.025 1.084
Proportion Var 0.202 0.108
Cumulative Var 0.202 0.311

The values are not exactly the same as there are differences in implementation algorithms, but they are pretty close.

Number of Factors

Parallel Analysis

fa.parallel(
    select(bfi, O1:O5, A1:A5),
    fm = "pa",
    error.bars = TRUE
)

Parallel analysis suggests that the number of factors =  3  and the number of components =  2

It suggests 3 factors (with reduced correlation) and 2 components (with full correlation). In my experience, the number of components is usually more in line with theoretical expectations, but choosing either one would seem reasonable, and this should depend on theoretical considerations and interpretability of the solutions.

With polychoric correlation matrix

We need to first find the number of observations:

n_pcorr <- 
    select(bfi, O1:O5, A1:A5) |>
    drop_na() |>
    nrow()
n_pcorr

[1] 2647

psych::fa.parallel(
    pcorr_oa,
    n.obs = n_pcorr,
    fm = "pa",
    error.bars = TRUE
)

Parallel analysis suggests that the number of factors =  3  and the number of components =  2

Hull method

# Reduced correlation
HULL(select(bfi, O1:O5, A1:A5), eigen_type = "SMC")

ℹ Only CAF can be used as gof if method "PAF" is used. Setting gof to "CAF"

ℹ 'x' was not a correlation matrix. Correlations are found from entered raw data.

Hull Analysis performed testing 0 to 5 factors.
PAF estimation and the CAF fit index was used.

── Number of factors suggested by the Hull method ──────────────────────────────

◌ With CAF:    3

# Full correlation
HULL(select(bfi, O1:O5, A1:A5), eigen_type = "PCA")

ℹ Only CAF can be used as gof if method "PAF" is used. Setting gof to "CAF"
ℹ 'x' was not a correlation matrix. Correlations are found from entered raw data.

Hull Analysis performed testing 0 to 3 factors.
PAF estimation and the CAF fit index was used.

── Number of factors suggested by the Hull method ──────────────────────────────

◌ With CAF:    2

The conclusion is similar to parallel analysis.

Other methods

These should be examined as well. Note in this example, using fm = "pa" results in an error, so the suggestions are not based on PAF extraction.

psych::nfactors(select(bfi, O1:O5, A1:A5))


Number of factors
Call: vss(x = x, n = n, rotate = rotate, diagonal = diagonal, fm = fm, 
    n.obs = n.obs, plot = FALSE, title = title, use = use, cor = cor)
VSS complexity 1 achieves a maximimum of 0.6  with  2  factors
VSS complexity 2 achieves a maximimum of 0.7  with  6  factors
The Velicer MAP achieves a minimum of 0.03  with  2  factors 
Empirical BIC achieves a minimum of  -69.58  with  4  factors
Sample Size adjusted BIC achieves a minimum of  -27.76  with  4  factors

Statistics by number of factors 
   vss1 vss2   map dof   chisq     prob sqresid  fit RMSEA  BIC  SABIC complex
1  0.45 0.00 0.036  35 1.6e+03 1.3e-311     7.9 0.45 0.126 1310 1421.4     1.0
2  0.60 0.64 0.032  26 3.2e+02  5.4e-53     5.1 0.64 0.064  117  199.1     1.1
3  0.59 0.67 0.052  18 9.5e+01  1.7e-12     4.3 0.69 0.039  -48    9.4     1.3
4  0.54 0.68 0.087  11 2.5e+01  1.0e-02     3.9 0.73 0.021  -63  -27.8     1.5
5  0.56 0.68 0.134   5 2.8e+00  7.2e-01     3.7 0.74 0.000  -37  -21.0     1.6
6  0.56 0.70 0.213   0 5.5e-01       NA     3.4 0.76    NA   NA     NA     1.6
7  0.55 0.68 0.314  -4 1.6e-07       NA     3.5 0.76    NA   NA     NA     1.7
8  0.54 0.68 0.650  -7 2.2e-07       NA     3.5 0.75    NA   NA     NA     1.7
9  0.54 0.68 1.000  -9 0.0e+00       NA     3.5 0.75    NA   NA     NA     1.7
10 0.54 0.68    NA -10 0.0e+00       NA     3.5 0.75    NA   NA     NA     1.7
    eChisq    SRMR  eCRMS eBIC
1  3.1e+03 1.1e-01 0.1259 2827
2  3.7e+02 3.8e-02 0.0502  160
3  9.8e+01 2.0e-02 0.0312  -45
4  1.8e+01 8.4e-03 0.0170  -70
5  2.1e+00 2.9e-03 0.0086  -38
6  3.7e-01 1.2e-03     NA   NA
7  1.2e-07 6.8e-07     NA   NA
8  1.4e-07 7.5e-07     NA   NA
9  1.1e-14 2.1e-10     NA   NA
10 1.1e-14 2.1e-10     NA   NA

Based on theoretical consideration, two factors seem reasonable.

Rotation

Complexity

Danger

The computation of variable complexity and factor complexity here are just for demonstrative purpose. You should not choose a rotation method based on these, because different rotations have different goals and complexity functions.

Unrotated solution

plot(fa_unrotated, labels = c(paste0("O", 1:5), paste0("A", 1:5)))

# Variable Complexity
lambda_unrotated <- fa_unrotated$loadings
sum(lambda_unrotated[, 1]^2 * lambda_unrotated[, 2]^2)

[1] 0.172905

# Factor Complexity
fac_complexity <- 0
for (j in seq_len(ncol(lambda_unrotated))) {
    for (i in seq_len(nrow(lambda_unrotated))) {
        fac_complexity <- fac_complexity + sum(lambda_unrotated[i, j]^2 * lambda_unrotated[-i, j]^2)
    }
}
fac_complexity

[1] 4.447726

Varimax

fa_varimax <- psych::fa(
    select(bfi, O1:O5, A1:A5),
    nfactors = 2, fm = "pa", rotate = "varimax")
plot(fa_varimax, labels = c(paste0("O", 1:5), paste0("A", 1:5)))

# Variable Complexity
(lambda_varimax <- fa_varimax$loadings)


Loadings:
   PA1    PA2   
O1  0.135  0.512
O2        -0.468
O3  0.210  0.620
O4         0.292
O5        -0.546
A1 -0.350       
A2  0.666       
A3  0.751       
A4  0.493       
A5  0.619  0.111

                 PA1   PA2
SS loadings    1.825 1.284
Proportion Var 0.182 0.128
Cumulative Var 0.182 0.311

sum(lambda_varimax[, 1]^2 * lambda_varimax[, 2]^2)

[1] 0.03597018

# Factor Complexity
fac_complexity <- 0
for (j in seq_len(ncol(lambda_varimax))) {
    for (i in seq_len(nrow(lambda_varimax))) {
        fac_complexity <- fac_complexity + sum(lambda_varimax[i, j]^2 * lambda_varimax[-i, j]^2)
    }
}
fac_complexity

[1] 3.877532

Oblimin

fa_oblimin <- psych::fa(
    select(bfi, O1:O5, A1:A5),
    nfactors = 2, fm = "pa", rotate = "oblimin")

Loading required namespace: GPArotation

plot(fa_oblimin, labels = c(paste0("O", 1:5), paste0("A", 1:5)))

# Variable Complexity
(lambda_oblimin <- fa_oblimin$loadings)


Loadings:
   PA1    PA2   
O1         0.513
O2  0.136 -0.489
O3  0.112  0.616
O4         0.297
O5        -0.561
A1 -0.346       
A2  0.668       
A3  0.755       
A4  0.512 -0.104
A5  0.615       

                 PA1   PA2
SS loadings    1.816 1.300
Proportion Var 0.182 0.130
Cumulative Var 0.182 0.312

sum(lambda_oblimin[, 1]^2 * lambda_oblimin[, 2]^2)

[1] 0.01545015

# Factor Complexity
fac_complexity <- 0
for (j in seq_len(ncol(lambda_oblimin))) {
    for (i in seq_len(nrow(lambda_oblimin))) {
        fac_complexity <- fac_complexity + sum(lambda_oblimin[i, j]^2 * lambda_oblimin[-i, j]^2)
    }
}
fac_complexity

[1] 3.858235

Promax

fa_promax <- psych::fa(
    select(bfi, O1:O5, A1:A5),
    nfactors = 2, fm = "pa", rotate = "promax")
plot(fa_promax, labels = c(paste0("O", 1:5), paste0("A", 1:5)))

# Variable Complexity
(lambda_promax <- fa_promax$loadings)


Loadings:
   PA1    PA2   
O1         0.507
O2  0.113 -0.484
O3  0.142  0.609
O4         0.294
O5        -0.555
A1 -0.348       
A2  0.669       
A3  0.755       
A4  0.508 -0.104
A5  0.618       

                 PA1   PA2
SS loadings    1.819 1.269
Proportion Var 0.182 0.127
Cumulative Var 0.182 0.309

sum(lambda_promax[, 1]^2 * lambda_promax[, 2]^2)

[1] 0.01632216

# Factor Complexity
fac_complexity <- 0
for (j in seq_len(ncol(lambda_promax))) {
    for (i in seq_len(nrow(lambda_promax))) {
        fac_complexity <- fac_complexity + sum(lambda_promax[i, j]^2 * lambda_promax[-i, j]^2)
    }
}
fac_complexity

[1] 3.806969

Target Rotation

In practice, we usually have some idea about the factor structure, and we can rotate the loadings to something close to that idea. This is an example of blurring the distinction between EFA and CFA.

Let’s say we think that the five O items are for one factor, and the five A items are for another.

# We want the first five items to load on factor 1 but not on factor 2,
# and the reverse for the next five items
target_mat <- matrix(
    c(NA, NA, NA, NA, NA, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, NA, NA, NA, NA, NA),
    nrow = 10
)
fa_target <- psych::fa(
    select(bfi, O1:O5, A1:A5),
    nfactors = 2, fm = "pa",
    rotate = "targetQ", Target = target_mat)
fa_target

Factor Analysis using method =  pa
Call: psych::fa(r = select(bfi, O1:O5, A1:A5), nfactors = 2, rotate = "targetQ", 
    fm = "pa", Target = target_mat)
Standardized loadings (pattern matrix) based upon correlation matrix
     PA2   PA1    h2   u2 com
O1  0.06  0.51 0.280 0.72 1.0
O2  0.13 -0.49 0.223 0.78 1.1
O3  0.12  0.61 0.428 0.57 1.1
O4 -0.01  0.30 0.086 0.91 1.0
O5  0.06 -0.56 0.299 0.70 1.0
A1 -0.35 -0.04 0.128 0.87 1.0
A2  0.67  0.02 0.451 0.55 1.0
A3  0.75  0.01 0.571 0.43 1.0
A4  0.51 -0.10 0.245 0.75 1.1
A5  0.62  0.05 0.396 0.60 1.0

                       PA2  PA1
SS loadings           1.81 1.29
Proportion Var        0.18 0.13
Cumulative Var        0.18 0.31
Proportion Explained  0.58 0.42
Cumulative Proportion 0.58 1.00

 With factor correlations of 
     PA2  PA1
PA2 1.00 0.25
PA1 0.25 1.00

Mean item complexity =  1
Test of the hypothesis that 2 factors are sufficient.

df null model =  45  with the objective function =  1.58 with Chi Square =  4409.68
df of  the model are 26  and the objective function was  0.12 

The root mean square of the residuals (RMSR) is  0.04 
The df corrected root mean square of the residuals is  0.05 

The harmonic n.obs is  2766 with the empirical chi square  362.29  with prob <  6e-61 
The total n.obs was  2800  with Likelihood Chi Square =  323  with prob <  5.2e-53 

Tucker Lewis Index of factoring reliability =  0.882
RMSEA index =  0.064  and the 90 % confidence intervals are  0.058 0.07
BIC =  116.63
Fit based upon off diagonal values = 0.97
Measures of factor score adequacy             
                                                   PA2  PA1
Correlation of (regression) scores with factors   0.88 0.81
Multiple R square of scores with factors          0.77 0.66
Minimum correlation of possible factor scores     0.54 0.32

Percentage of Variance Accounted For

Let’s use oblimin solution

# Proportion of variance by the first two eigenvalues
ev_oa <- fa_oblimin$e.values
sum(ev_oa[1:2]) / sum(ev_oa)

[1] 0.4399516

The first two eigenvalues account for 44% of the total variance in the data.

Correlation Residual

This does not depend on rotation method.

round(fa_unrotated$residual, digits = 2)

     O1    O2    O3    O4    O5    A1    A2    A3    A4    A5
O1 0.72  0.02  0.05  0.02  0.04  0.10  0.00  0.00  0.02  0.02
O2 0.02  0.78  0.02  0.07  0.07  0.06  0.02  0.00 -0.01  0.02
O3 0.05  0.02  0.57  0.01  0.03  0.06 -0.03  0.01  0.00  0.04
O4 0.02  0.07  0.01  0.91 -0.02 -0.04  0.04 -0.01 -0.04 -0.04
O5 0.04  0.07  0.03 -0.02  0.70  0.07 -0.03  0.01  0.00  0.02
A1 0.10  0.06  0.06 -0.04  0.07  0.87 -0.10  0.00  0.02  0.04
A2 0.00  0.02 -0.03  0.04 -0.03 -0.10  0.55 -0.02  0.01 -0.03
A3 0.00  0.00  0.01 -0.01  0.01  0.00 -0.02  0.43 -0.01  0.03
A4 0.02 -0.01  0.00 -0.04  0.00  0.02  0.01 -0.01  0.75  0.01
A5 0.02  0.02  0.04 -0.04  0.02  0.04 -0.03  0.03  0.01  0.60

As a general rule of thumb, researchers should pay attention to residuals with absolute value > .10. In our case, there were a few values close to .10, but it doesn’t seem to be strong evidence that there may be an additional factor needed.

Confidence Intervals

The loadings in EFA are quite unstable, especially with rotation. We can obtain confidence intervals using bootstrap, which is available in psych::fa() by specifying

fa_oblimin_boot <- psych::fa(
    select(bfi, O1:O5, A1:A5),
    nfactors = 2, 
    n.iter = 200, # number of bootstrap samples
    fm = "pa", rotate = "oblimin")
fa_oblimin_boot

Factor Analysis with confidence intervals using method = psych::fa(r = select(bfi, O1:O5, A1:A5), nfactors = 2, n.iter = 200, 
    rotate = "oblimin", fm = "pa")
Factor Analysis using method =  pa
Call: psych::fa(r = select(bfi, O1:O5, A1:A5), nfactors = 2, n.iter = 200, 
    rotate = "oblimin", fm = "pa")
Standardized loadings (pattern matrix) based upon correlation matrix
     PA1   PA2    h2   u2 com
O1  0.05  0.51 0.280 0.72 1.0
O2  0.14 -0.49 0.223 0.78 1.2
O3  0.11  0.62 0.428 0.57 1.1
O4 -0.02  0.30 0.086 0.91 1.0
O5  0.07 -0.56 0.299 0.70 1.0
A1 -0.35 -0.04 0.128 0.87 1.0
A2  0.67  0.01 0.451 0.55 1.0
A3  0.75  0.00 0.571 0.43 1.0
A4  0.51 -0.10 0.245 0.75 1.1
A5  0.62  0.05 0.396 0.60 1.0

                       PA1  PA2
SS loadings           1.81 1.30
Proportion Var        0.18 0.13
Cumulative Var        0.18 0.31
Proportion Explained  0.58 0.42
Cumulative Proportion 0.58 1.00

 With factor correlations of 
     PA1  PA2
PA1 1.00 0.26
PA2 0.26 1.00

Mean item complexity =  1
Test of the hypothesis that 2 factors are sufficient.

df null model =  45  with the objective function =  1.58 with Chi Square =  4409.68
df of  the model are 26  and the objective function was  0.12 

The root mean square of the residuals (RMSR) is  0.04 
The df corrected root mean square of the residuals is  0.05 

The harmonic n.obs is  2766 with the empirical chi square  362.29  with prob <  6e-61 
The total n.obs was  2800  with Likelihood Chi Square =  323  with prob <  5.2e-53 

Tucker Lewis Index of factoring reliability =  0.882
RMSEA index =  0.064  and the 90 % confidence intervals are  0.058 0.07
BIC =  116.63
Fit based upon off diagonal values = 0.97
Measures of factor score adequacy             
                                                   PA1  PA2
Correlation of (regression) scores with factors   0.88 0.81
Multiple R square of scores with factors          0.77 0.66
Minimum correlation of possible factor scores     0.54 0.33

 Coefficients and bootstrapped confidence intervals 
     low   PA1 upper   low   PA2 upper
O1  0.02  0.05  0.09  0.46  0.51  0.56
O2  0.10  0.14  0.17 -0.53 -0.49 -0.44
O3  0.08  0.11  0.15  0.57  0.62  0.67
O4 -0.05 -0.02  0.03  0.24  0.30  0.34
O5  0.03  0.07  0.11 -0.61 -0.56 -0.51
A1 -0.40 -0.35 -0.30 -0.08 -0.04  0.01
A2  0.63  0.67  0.70 -0.02  0.01  0.05
A3  0.72  0.75  0.79 -0.03  0.00  0.04
A4  0.47  0.51  0.56 -0.15 -0.10 -0.06
A5  0.58  0.62  0.65  0.01  0.05  0.09

 Interfactor correlations and bootstrapped confidence intervals 
        lower estimate upper
PA1-PA2  0.17     0.26  0.35

In practice, you should use more bootstrap samples (e.g., 1999). We can do the same for target rotation:

fa_target_boot <- psych::fa(
    select(bfi, O1:O5, A1:A5),
    nfactors = 2, 
    n.iter = 200, # number of bootstrap samples
    fm = "pa",
    rotate = "targetQ", Target = target_mat)
fa_target_boot

Factor Analysis with confidence intervals using method = psych::fa(r = select(bfi, O1:O5, A1:A5), nfactors = 2, n.iter = 200, 
    rotate = "targetQ", fm = "pa", Target = target_mat)
Factor Analysis using method =  pa
Call: psych::fa(r = select(bfi, O1:O5, A1:A5), nfactors = 2, n.iter = 200, 
    rotate = "targetQ", fm = "pa", Target = target_mat)
Standardized loadings (pattern matrix) based upon correlation matrix
     PA2   PA1    h2   u2 com
O1  0.06  0.51 0.280 0.72 1.0
O2  0.13 -0.49 0.223 0.78 1.1
O3  0.12  0.61 0.428 0.57 1.1
O4 -0.01  0.30 0.086 0.91 1.0
O5  0.06 -0.56 0.299 0.70 1.0
A1 -0.35 -0.04 0.128 0.87 1.0
A2  0.67  0.02 0.451 0.55 1.0
A3  0.75  0.01 0.571 0.43 1.0
A4  0.51 -0.10 0.245 0.75 1.1
A5  0.62  0.05 0.396 0.60 1.0

                       PA2  PA1
SS loadings           1.81 1.29
Proportion Var        0.18 0.13
Cumulative Var        0.18 0.31
Proportion Explained  0.58 0.42
Cumulative Proportion 0.58 1.00

 With factor correlations of 
     PA2  PA1
PA2 1.00 0.25
PA1 0.25 1.00

Mean item complexity =  1
Test of the hypothesis that 2 factors are sufficient.

df null model =  45  with the objective function =  1.58 with Chi Square =  4409.68
df of  the model are 26  and the objective function was  0.12 

The root mean square of the residuals (RMSR) is  0.04 
The df corrected root mean square of the residuals is  0.05 

The harmonic n.obs is  2766 with the empirical chi square  362.29  with prob <  6e-61 
The total n.obs was  2800  with Likelihood Chi Square =  323  with prob <  5.2e-53 

Tucker Lewis Index of factoring reliability =  0.882
RMSEA index =  0.064  and the 90 % confidence intervals are  0.058 0.07
BIC =  116.63
Fit based upon off diagonal values = 0.97
Measures of factor score adequacy             
                                                   PA2  PA1
Correlation of (regression) scores with factors   0.88 0.81
Multiple R square of scores with factors          0.77 0.66
Minimum correlation of possible factor scores     0.54 0.32

 Coefficients and bootstrapped confidence intervals 
     low   PA2 upper   low   PA1 upper
O1  0.02  0.06  0.10  0.47  0.51  0.56
O2  0.09  0.13  0.16 -0.53 -0.49 -0.44
O3  0.08  0.12  0.16  0.57  0.61  0.67
O4 -0.05 -0.01  0.03  0.25  0.30  0.34
O5  0.02  0.06  0.10 -0.61 -0.56 -0.51
A1 -0.40 -0.35 -0.29 -0.08 -0.04  0.01
A2  0.63  0.67  0.71 -0.02  0.02  0.05
A3  0.72  0.75  0.79 -0.03  0.01  0.04
A4  0.47  0.51  0.55 -0.14 -0.10 -0.06
A5  0.58  0.62  0.65  0.01  0.05  0.09

 Interfactor correlations and bootstrapped confidence intervals 
        lower estimate upper
PA2-PA1   0.2     0.25   0.3

Full BFI

Target rotation with BFI, with polychoric correlation

pcorr_bfi <- lavaan::lavCor(select(bfi, A1:O5), ordered = TRUE)
n_pcorr_bfi <- 
    select(bfi, A1:O5) |>
    drop_na() |>
    nrow()
# Parallel analysis
psych::fa.parallel(
    pcorr_bfi,
    n.obs = n_pcorr_bfi,
    fm = "pa",
    error.bars = TRUE
)

Parallel analysis suggests that the number of factors =  6  and the number of components =  5

# Other methods
nfactors(pcorr_bfi, n.obs = n_pcorr_bfi)


Number of factors
Call: vss(x = x, n = n, rotate = rotate, diagonal = diagonal, fm = fm, 
    n.obs = n.obs, plot = FALSE, title = title, use = use, cor = cor)
VSS complexity 1 achieves a maximimum of Although the vss.max shows  4  factors, it is probably more reasonable to think about  2  factors
VSS complexity 2 achieves a maximimum of 0.79  with  4  factors
The Velicer MAP achieves a minimum of 0.02  with  5  factors 
Empirical BIC achieves a minimum of  -711.98  with  8  factors
Sample Size adjusted BIC achieves a minimum of  -125.98  with  12  factors

Statistics by number of factors 
   vss1 vss2   map dof   chisq     prob sqresid  fit  RMSEA   BIC   SABIC
1  0.54 0.00 0.032 275 1.4e+04  0.0e+00    27.3 0.54 0.1438 11980 12853.7
2  0.59 0.68 0.026 251 9.3e+03  0.0e+00    18.8 0.68 0.1213  7295  8092.2
3  0.58 0.75 0.023 228 6.6e+03  0.0e+00    13.9 0.77 0.1074  4860  5584.3
4  0.62 0.79 0.019 206 4.4e+03  0.0e+00    10.1 0.83 0.0912  2774  3428.8
5  0.56 0.79 0.016 185 2.2e+03  0.0e+00     7.6 0.87 0.0673   783  1370.9
6  0.56 0.77 0.017 165 1.4e+03 1.7e-188     6.6 0.89 0.0547    82   606.2
7  0.56 0.74 0.021 146 9.6e+02 1.8e-120     6.2 0.90 0.0479  -175   288.7
8  0.52 0.71 0.023 128 7.1e+02  1.2e-80     5.7 0.90 0.0430  -293   114.0
9  0.52 0.69 0.028 111 5.2e+02  5.4e-54     5.5 0.91 0.0388  -347     6.1
10 0.52 0.68 0.033  95 3.9e+02  1.0e-37     5.0 0.92 0.0358  -349   -47.0
11 0.50 0.70 0.040  80 2.5e+02  4.7e-20     5.0 0.92 0.0299  -369  -115.2
12 0.48 0.67 0.048  66 1.8e+02  2.3e-12     4.7 0.92 0.0265  -336  -126.0
13 0.49 0.67 0.058  53 1.3e+02  3.2e-08     4.5 0.92 0.0242  -285  -116.3
14 0.47 0.68 0.066  41 9.0e+01  1.8e-05     4.2 0.93 0.0220  -230   -99.9
15 0.46 0.68 0.081  30 6.0e+01  9.3e-04     3.7 0.94 0.0202  -174   -78.7
16 0.44 0.68 0.097  20 2.4e+01  2.5e-01     3.3 0.94 0.0089  -132   -68.5
17 0.46 0.67 0.114  11 7.8e+00  7.3e-01     3.7 0.94 0.0000   -78   -43.0
18 0.43 0.63 0.134   3 4.5e-01  9.3e-01     3.5 0.94 0.0000   -23   -13.4
19 0.44 0.68 0.157  -4 3.8e-04       NA     3.7 0.94     NA    NA      NA
20 0.45 0.67 0.199 -10 8.4e-05       NA     3.7 0.94     NA    NA      NA
   complex  eChisq    SRMR  eCRMS  eBIC
1      1.0 2.7e+04 1.4e-01 0.1414 24627
2      1.2 1.4e+04 9.9e-02 0.1078 12266
3      1.3 8.2e+03 7.5e-02 0.0859  6414
4      1.4 4.0e+03 5.2e-02 0.0631  2392
5      1.5 1.3e+03 3.0e-02 0.0383  -121
6      1.7 6.6e+02 2.1e-02 0.0286  -631
7      1.8 4.4e+02 1.7e-02 0.0249  -697
8      1.9 2.9e+02 1.4e-02 0.0214  -712
9      2.0 2.1e+02 1.2e-02 0.0198  -653
10     2.0 1.6e+02 1.0e-02 0.0184  -585
11     2.1 9.7e+01 8.2e-03 0.0158  -526
12     2.2 6.4e+01 6.6e-03 0.0141  -451
13     2.2 4.3e+01 5.4e-03 0.0129  -370
14     2.2 2.7e+01 4.3e-03 0.0117  -292
15     2.1 1.7e+01 3.5e-03 0.0109  -216
16     2.0 7.6e+00 2.3e-03 0.0088  -148
17     2.3 2.6e+00 1.3e-03 0.0070   -83
18     2.1 1.0e-01 2.6e-04 0.0026   -23
19     2.3 7.5e-05 7.2e-06     NA    NA
20     2.3 1.8e-05 3.5e-06     NA    NA

# Target rotation
target_mat_bfi <- matrix(0, nrow = 25, ncol = 5)
target_mat_bfi[1:5, 1] <- NA
target_mat_bfi[6:10, 2] <- NA
target_mat_bfi[11:15, 3] <- NA
target_mat_bfi[16:20, 4] <- NA
target_mat_bfi[21:25, 5] <- NA
fa_target_bfi <- psych::fa(
    pcorr_bfi, n.obs = n_pcorr_bfi,
    nfactors = 5, fm = "pa",
    rotate = "targetQ", Target = target_mat_bfi)
fa_target_bfi

Factor Analysis using method =  pa
Call: psych::fa(r = pcorr_bfi, nfactors = 5, n.obs = n_pcorr_bfi, rotate = "targetQ", 
    fm = "pa", Target = target_mat_bfi)
Standardized loadings (pattern matrix) based upon correlation matrix
     PA4   PA3   PA2   PA1   PA5   h2   u2 com
A1  0.15  0.18  0.07 -0.51 -0.09 0.26 0.74 1.5
A2  0.03  0.05  0.07  0.69  0.03 0.54 0.46 1.0
A3  0.02  0.18  0.02  0.69  0.04 0.61 0.39 1.1
A4 -0.02  0.10  0.21  0.47 -0.18 0.35 0.65 1.9
A5 -0.11  0.28  0.00  0.55  0.04 0.53 0.47 1.6
C1  0.07 -0.02  0.59  0.00  0.17 0.40 0.60 1.2
C2  0.18 -0.05  0.71  0.06  0.06 0.51 0.49 1.2
C3  0.04 -0.08  0.61  0.10 -0.07 0.36 0.64 1.1
C4  0.17  0.03 -0.69  0.01 -0.03 0.55 0.45 1.1
C5  0.21 -0.10 -0.60  0.01  0.12 0.49 0.51 1.4
E1 -0.01 -0.62  0.13 -0.07 -0.02 0.39 0.61 1.1
E2  0.17 -0.71 -0.01 -0.04  0.01 0.61 0.39 1.1
E3  0.07  0.51 -0.02  0.22  0.27 0.50 0.50 2.0
E4 -0.05  0.64  0.00  0.27 -0.13 0.60 0.40 1.4
E5  0.13  0.50  0.28  0.01  0.17 0.46 0.54 2.0
N1  0.86  0.21  0.01 -0.19 -0.08 0.74 0.26 1.2
N2  0.81  0.14  0.02 -0.16  0.00 0.66 0.34 1.1
N3  0.77  0.00 -0.03  0.02  0.01 0.59 0.41 1.0
N4  0.56 -0.33 -0.13  0.08  0.13 0.55 0.45 1.9
N5  0.56 -0.17  0.00  0.20 -0.15 0.40 0.60 1.6
O1  0.00  0.18  0.06  0.00  0.55 0.39 0.61 1.3
O2  0.20  0.02 -0.08  0.15 -0.50 0.30 0.70 1.6
O3  0.02  0.27 -0.01  0.05  0.63 0.54 0.46 1.4
O4  0.19 -0.28 -0.04  0.19  0.48 0.35 0.65 2.4
O5  0.11  0.05 -0.04  0.04 -0.59 0.36 0.64 1.1

                       PA4  PA3  PA2  PA1  PA5
SS loadings           2.96 2.61 2.33 2.26 1.85
Proportion Var        0.12 0.10 0.09 0.09 0.07
Cumulative Var        0.12 0.22 0.32 0.41 0.48
Proportion Explained  0.25 0.22 0.19 0.19 0.15
Cumulative Proportion 0.25 0.46 0.66 0.85 1.00

 With factor correlations of 
      PA4   PA3   PA2   PA1  PA5
PA4  1.00 -0.21 -0.21 -0.09 0.02
PA3 -0.21  1.00  0.29  0.34 0.16
PA2 -0.21  0.29  1.00  0.23 0.20
PA1 -0.09  0.34  0.23  1.00 0.16
PA5  0.02  0.16  0.20  0.16 1.00

Mean item complexity =  1.4
Test of the hypothesis that 5 factors are sufficient.

df null model =  300  with the objective function =  9.59 with Chi Square =  23261.83
df of  the model are 185  and the objective function was  0.92 

The root mean square of the residuals (RMSR) is  0.03 
The df corrected root mean square of the residuals is  0.04 

The harmonic n.obs is  2436 with the empirical chi square  1322.13  with prob <  8.1e-171 
The total n.obs was  2436  with Likelihood Chi Square =  2226.01  with prob <  0 

Tucker Lewis Index of factoring reliability =  0.856
RMSEA index =  0.067  and the 90 % confidence intervals are  0.065 0.07
BIC =  783.36
Fit based upon off diagonal values = 0.98
Measures of factor score adequacy             
                                                   PA4  PA3  PA2  PA1  PA5
Correlation of (regression) scores with factors   0.94 0.92 0.90 0.90 0.87
Multiple R square of scores with factors          0.89 0.84 0.81 0.81 0.76
Minimum correlation of possible factor scores     0.77 0.68 0.63 0.62 0.52

APA Table

Openness and Agreeableness

fa_oblimin$loadings |>
    unclass() |>
    as.data.frame() |>
    tibble::rownames_to_column() |>
    flextable::flextable() |>
    bold(i = ~ abs(PA1) >= .30, j = ~ PA1) |>
    bold(i = ~ abs(PA2) >= .30, j = ~ PA2) |>
    set_formatter(
        PA1 = rmlead0,
        PA2 = rmlead0
    ) |>
    set_header_labels(values = c("item", "1", "2")) |>
    add_header_row(values = c("", "Factor loading"), colwidths = c(1, 2)) |>
    align(i = 1, align = "center", part = "header")

	Factor loading
item	1	2
O1	.05	.51
O2	.14	-.49
O3	.11	.62
O4	-.02	.30
O5	.07	-.56
A1	-.35	-.04
A2	.67	.01
A3	.75	.00
A4	.51	-.10
A5	.62	.05

25-item from IPIP

fa_target_bfi$loadings |>
    unclass() |>
    as.data.frame() |>
    tibble::rownames_to_column() |>
    flextable::flextable() |>
    bold(i = ~ abs(PA1) >= .30, j = ~ PA1) |>
    bold(i = ~ abs(PA2) >= .30, j = ~ PA2) |>
    bold(i = ~ abs(PA3) >= .30, j = ~ PA3) |>
    bold(i = ~ abs(PA4) >= .30, j = ~ PA4) |>
    bold(i = ~ abs(PA5) >= .30, j = ~ PA5) |>
    set_formatter(
        PA1 = rmlead0,
        PA2 = rmlead0,
        PA3 = rmlead0,
        PA4 = rmlead0,
        PA5 = rmlead0
    ) |>
    set_header_labels(values = c("item", "1", "2", "3", "4", "5")) |>
    add_header_row(values = c("", "Factor loading"), colwidths = c(1, 5)) |>
    align(i = 1, align = "center", part = "header")

	Factor loading
item	1	2	3	4	5
A1	.15	.18	.07	-.51	-.09
A2	.03	.05	.07	.69	.03
A3	.02	.18	.02	.69	.04
A4	-.02	.10	.21	.47	-.18
A5	-.11	.28	-.00	.55	.04
C1	.07	-.02	.59	-.00	.17
C2	.18	-.05	.71	.06	.06
C3	.04	-.08	.61	.10	-.07
C4	.17	.03	-.69	.01	-.03
C5	.21	-.10	-.60	.01	.12
E1	-.01	-.62	.13	-.07	-.02
E2	.17	-.71	-.01	-.04	.01
E3	.07	.51	-.02	.22	.27
E4	-.05	.64	-.00	.27	-.13
E5	.13	.50	.28	.01	.17
N1	.86	.21	.01	-.19	-.08
N2	.81	.14	.02	-.16	.00
N3	.77	.00	-.03	.02	.01
N4	.56	-.33	-.13	.08	.13
N5	.56	-.17	-.00	.20	-.15
O1	-.00	.18	.06	-.00	.55
O2	.20	.02	-.08	.15	-.50
O3	.02	.27	-.01	.05	.63
O4	.19	-.28	-.04	.19	.48
O5	.11	.05	-.04	.04	-.59