library(ggplot2)
Exercise 12
Consider the following simulated data with \(N = 500\) and a simulated correlation of .7 between X and Y.
set.seed(1648)
sample_size <- 500
# Simulate X and Y
full_data <- MASS::mvrnorm(sample_size,
  mu = c(0, 0.5),
  Sigma = matrix(c(1, .7, .7, 1), nrow = 2), empirical = TRUE
)
full_data <- data.frame(full_data)
colnames(full_data) <- c("x", "y")
# Check plot
qplot(x, y, data = full_data)
# Check regression slope
summary(lm(y ~ x, data = full_data))
Q1
The following code randomly deletes 50% of the data on \(y\). Run a regression analysis on just the observed data (\(N = 250\)), and see whether the slope is very different from the original slope of 0.7.
set.seed(1648)
# Generate missing data indicator
dy <- sample(sample_size, size = 250)
# Remaining data
mcar_data <- full_data
mcar_data$y[dy] <- NA
# Check plot
ggplot(full_data, aes(x = x, y = y)) +
geom_point(col = "grey", alpha = 0.1) +
geom_point(data = mcar_data)
# Insert code for getting the regression slope using `mcar_data`
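One possible answer (a sketch, not the only way): `lm()` applies listwise deletion by default via `na.action = na.omit`, so fitting on `mcar_data` directly uses only the 250 observed cases. To run on its own, the snippet re-creates the objects from the code above; the name `fit_mcar` is just an illustrative choice.

```r
library(MASS)

set.seed(1648)
sample_size <- 500
full_data <- data.frame(mvrnorm(sample_size,
  mu = c(0, 0.5),
  Sigma = matrix(c(1, .7, .7, 1), nrow = 2), empirical = TRUE
))
colnames(full_data) <- c("x", "y")
set.seed(1648)
dy <- sample(sample_size, size = 250)
mcar_data <- full_data
mcar_data$y[dy] <- NA

# lm() silently drops the 250 rows with missing y (listwise deletion)
fit_mcar <- lm(y ~ x, data = mcar_data)
coef(fit_mcar)["x"]  # should stay close to .7 under MCAR
```

Because the missingness is unrelated to `x` and `y`, the complete cases are a random subsample and the slope estimate remains (approximately) unbiased, just less precise.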
Q2
One thing you should NEVER do is mean imputation. Using the code below, find the slope from the mean-imputed data.
mcar_data_filled <- mcar_data
mcar_data_filled$y[is.na(mcar_data_filled$y)] <-
  mean(mcar_data_filled$y, na.rm = TRUE)
# Check plot
ggplot(full_data, aes(x = x, y = y)) +
geom_point(col = "grey", alpha = 0.1) +
geom_point(data = mcar_data) +
# Add mean-imputed points
geom_point(
data = mcar_data[which(is.na(mcar_data$y)), ],
aes(y = mean(mcar_data$y, na.rm = TRUE)), col = "red"
)
# Insert code for getting the regression slope using `mcar_data_filled`
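A sketch of the mean-imputed fit for comparison (again re-creating the objects so the snippet stands alone; `fit_filled` is an illustrative name). The flat band of imputed points at the observed mean of `y` typically drags the slope toward zero.

```r
library(MASS)

set.seed(1648)
sample_size <- 500
full_data <- data.frame(mvrnorm(sample_size,
  mu = c(0, 0.5),
  Sigma = matrix(c(1, .7, .7, 1), nrow = 2), empirical = TRUE
))
colnames(full_data) <- c("x", "y")
set.seed(1648)
dy <- sample(sample_size, size = 250)
mcar_data_filled <- full_data
mcar_data_filled$y[dy] <- NA

# Replace every missing y with the mean of the observed y's
mcar_data_filled$y[is.na(mcar_data_filled$y)] <-
  mean(mcar_data_filled$y, na.rm = TRUE)
fit_filled <- lm(y ~ x, data = mcar_data_filled)
coef(fit_filled)["x"]  # attenuated well below .7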
Q3
Based on the plot above, why do you think mean imputation is a bad practice?
Q4
Although listwise deletion, also called complete-case analysis (the default in R), is generally better than mean imputation (the worst option), it is only valid when data are missing completely at random (and in some missing-at-random situations). Find the slope under the following scenario, and comment on whether the sample slope estimate is biased.
set.seed(1648)
# Generate missing data indicator
dy <- which(full_data$y > median(full_data$y))
# Remaining data
mnar_data <- full_data
mnar_data$y[dy] <- NA
# Check plot
ggplot(full_data, aes(x = x, y = y)) +
geom_point(col = "grey", alpha = 0.1) +
geom_point(data = mnar_data)
# Insert code for getting the regression slope using `mnar_data`
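A sketch for this scenario (self-contained re-creation; `fit_mnar` is an illustrative name). Because missingness here depends on `y` itself, the complete cases form a sample truncated on the outcome, and the slope estimate should come out noticeably below .7.

```r
library(MASS)

set.seed(1648)
sample_size <- 500
full_data <- data.frame(mvrnorm(sample_size,
  mu = c(0, 0.5),
  Sigma = matrix(c(1, .7, .7, 1), nrow = 2), empirical = TRUE
))
colnames(full_data) <- c("x", "y")
dy <- which(full_data$y > median(full_data$y))
mnar_data <- full_data
mnar_data$y[dy] <- NA

# Complete cases are only the lower half of y: a truncated sample
fit_mnar <- lm(y ~ x, data = mnar_data)
coef(fit_mnar)["x"]  # biased toward zero relative to .7
```

Listwise deletion cannot fix this: selecting cases on the dependent variable restricts its range, which attenuates the regression slope regardless of sample size.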
Q5
Mean imputation can also be problematic when you use mean scores on a multi-item scale, if the missingness occurs on items with low or high difficulty levels. The following generates such an example with three scale items, with missing data on item 3. Comment on how the slope is affected when the mean of the first two items is used for those missing item 3.
# Original data
set.seed(1648)
sample_size <- 500
# Simulate X and Y
scale_data <- MASS::mvrnorm(sample_size,
  mu = c(0, 0, 2, 0.5),
  Sigma = matrix(c(
    1, .9, .9, .567,
    .9, 1, .9, .567,
    .9, .9, 1, .567,
    .567, .567, .567, 1
  ), nrow = 4), empirical = TRUE
)
scale_data <- data.frame(scale_data)
colnames(scale_data) <- c("item1", "item2", "item3", "y")
# Compute the person means
scale_data$mean_x <- rowMeans(scale_data[c("item1", "item2", "item3")])
# Check plot
qplot(mean_x, y, data = scale_data)
# Check original regression slope
summary(lm(y ~ mean_x, data = scale_data))
set.seed(1648)
# With missing data on Item 3
# Generate missing data indicator
ditem3 <- sample(sample_size, size = 250)
mcar_scale_data <- scale_data
mcar_scale_data$item3[ditem3] <- NA
# Compute scale mean using available data
mcar_scale_filled <- mcar_scale_data
mcar_scale_filled$mean_x <-
  rowMeans(mcar_scale_filled[c("item1", "item2", "item3")], na.rm = TRUE)
# Check plot
ggplot(scale_data, aes(x = mean_x, y = y)) +
geom_point(col = "grey", alpha = 0.1) +
geom_point(data = mcar_scale_data) +
# Add mean-imputed points
geom_point(data = mcar_scale_filled[ditem3, ], col = "red")
# Check new regression slope
summary(lm(y ~ mean_x, data = mcar_scale_filled))