Chapter 15 Missings - fehlende Werte

Fehlende Werte (Missing Data) werden in R durch NA (not available) repräsentiert. NA ist ein logischer Vektor der Länge 1. Normalerweise ist das Ergebnis einer Operation, in der NA vorkommt, ebenfalls NA. Viele Funktionen haben Parameter, die den Umgang mit NA festlegen (z. B. na.rm oder na.action). Zur Prüfung auf fehlende Werte gibt es die Funktion is.na() bzw. !is.na() (logische Prüfung auf nicht-fehlend); ein Vergleich mit == NA liefert dagegen selbst wieder NA und eignet sich nicht zur Prüfung.

15.1 Examples

# basic idea: a small data frame with a single missing value
dd <- data.frame(x = c(1, 2, 3), y = c(0, 10, NA))
dd
##   x  y
## 1 1  0
## 2 2 10
## 3 3 NA
# is.na() flags every missing cell with TRUE
is.na(dd)
##          x     y
## [1,] FALSE FALSE
## [2,] FALSE FALSE
## [3,] FALSE  TRUE
# !is.na() negates the check: TRUE marks every non-missing cell
!is.na(dd)
##         x     y
## [1,] TRUE  TRUE
## [2,] TRUE  TRUE
## [3,] TRUE FALSE
# we can define a new column and preset it with NA
dd$new <- NA
# then overwrite it; seq_len() is used instead of 1:nrow(dd) because the
# latter counts backwards (1, 0) when the data frame has no rows
dd$new <- seq_len(nrow(dd))
# we can set a specific value to NA
dd$new[2] <- NA

# create sample data frame: three grades per subject, some of them missing
dd <- data.frame(
  subj   = c(  1,   2,   3,   4,   5,   6,   7,   8,   9),
  uni    = c(  1,   1,   1,   2,   2,   2,   3,   3,   3),
  grade1 = c(1.0,  NA, 3.7, 1.3,  NA, 1.0, 3.3, 4.0,  NA),
  grade2 = c(4.0, 3.0, 1.3, 1.3, 1.0, 1.3, 2.7, 4.0, 3.3),
  grade3 = c(1.3,  NA, 2.7, 1.0, 1.3, 1.3, 2.3, 3.7, 3.0)
)
# is.na() on a data frame returns a logical matrix of the same shape
is.na(dd)
##        subj   uni grade1 grade2 grade3
##  [1,] FALSE FALSE  FALSE  FALSE  FALSE
##  [2,] FALSE FALSE   TRUE  FALSE   TRUE
##  [3,] FALSE FALSE  FALSE  FALSE  FALSE
##  [4,] FALSE FALSE  FALSE  FALSE  FALSE
##  [5,] FALSE FALSE   TRUE  FALSE  FALSE
##  [6,] FALSE FALSE  FALSE  FALSE  FALSE
##  [7,] FALSE FALSE  FALSE  FALSE  FALSE
##  [8,] FALSE FALSE  FALSE  FALSE  FALSE
##  [9,] FALSE FALSE   TRUE  FALSE  FALSE
# total number of cells (rows * columns), missing or not
length(is.na(dd))
## [1] 45
# number of missing cells: TRUE counts as 1 when summed
sum(is.na(dd))
## [1] 4
# which() returns the positions of the missings (cells counted column by column)
which(is.na(dd))
## [1] 20 23 27 38
# counting these positions gives the same total
length(which(is.na(dd)))
## [1] 4
# how about missings in a column
sum(is.na(dd$grade1))
## [1] 3
# or in a single row
sum(is.na(dd[2, ]))
## [1] 2
# how many missings in grades per subject: row-wise sum over the logical matrix
grade_cols <- c("grade1", "grade2", "grade3")
n_miss <- rowSums(is.na(dd[, grade_cols]))
n_miss
## [1] 0 2 0 0 1 0 0 0 1
# a column with n of missings
dd$n_miss <- n_miss
# delete all subjects with missing data in any column
na.omit(dd)
##   subj uni grade1 grade2 grade3 n_miss
## 1    1   1    1.0    4.0    1.3      0
## 3    3   1    3.7    1.3    2.7      0
## 4    4   2    1.3    1.3    1.0      0
## 6    6   2    1.0    1.3    1.3      0
## 7    7   3    3.3    2.7    2.3      0
## 8    8   3    4.0    4.0    3.7      0

15.2 NA in functions

# sample data: three grades per subject, some of them missing
dd <- data.frame(
  subj   = c(  1,   2,   3,   4,   5,   6,   7,   8,   9),
  uni    = c(  1,   1,   1,   2,   2,   2,   3,   3,   3),
  grade1 = c(1.0,  NA, 3.7, 1.3,  NA, 1.0, 3.3, 4.0,  NA),
  grade2 = c(4.0, 3.0, 1.3, 1.3, 1.0, 1.3, 2.7, 4.0, 3.3),
  grade3 = c(1.3,  NA, 2.7, 1.0, 1.3, 1.3, 2.3, 3.7, 3.0)
)
# NAs and mean()
mean(dd$grade1) # has NA so the result is NA
## [1] NA
mean(dd$grade1, na.rm = TRUE) # drops the NAs and averages the valid values
## [1] 2.383333
# NAs and lm()
# works: lm() takes its default na.action from getOption("na.action"), which is
# na.omit unless changed, so rows with missings are dropped before fitting
lm(grade1 ~ grade2, data = dd)
## 
## Call:
## lm(formula = grade1 ~ grade2, data = dd)
## 
## Coefficients:
## (Intercept)       grade2  
##      1.8563       0.2166
# lm(grade1 ~ grade2, data = dd, na.action = na.fail) # commented out because it fails, we have NAs in the columns used

# NAs and gls()
# nlme::gls(grade1 ~ grade2 + grade3, data = dd)  # commented out because it fails, default na.action is na.fail
nlme::gls(grade1 ~ grade2 + grade3, data = dd, na.action = na.omit)  # works: fitted on the rows without missings only
## Generalized least squares fit by REML
##   Model: grade1 ~ grade2 + grade3 
##   Data: dd 
##   Log-restricted-likelihood: -5.187343
## 
## Coefficients:
## (Intercept)      grade2      grade3 
##  0.06439277 -0.22253135  1.39533342 
## 
## Degrees of freedom: 6 total; 3 residual
## Residual standard error: 0.5463859

15.3 Referenzen

15.4 Screencast(s)