Chapter 15 Missings - fehlende Werte
Missing Data (fehlende Werte) werden in R durch `NA` (not available) repräsentiert.
`NA` ist ein logischer Vektor der Länge 1.
Normalerweise wird das Ergebnis einer Operation, in der `NA` vorkommt, ebenfalls auf `NA` gesetzt.
Viele Funktionen haben Parameter, die den Umgang mit `NA` festlegen (z. B. `na.rm` oder `na.action`).
Für logische Prüfungen gibt es die Funktion `is.na()` bzw. deren Negation `!is.na()` (not is NA, logische Prüfung auf nicht-fehlend).
15.1 Examples
# Basic example: build a small data frame containing one NA, test its cells
# for missingness, and show how to preset and overwrite a column with NA.
dd <- data.frame(x = c(1, 2, 3), y = c(0, 10, NA))
dd
## x y
## 1 1 0
## 2 2 10
## 3 3 NA
# is.na() returns a logical matrix: TRUE marks the missing cells
is.na(dd)
## x y
## [1,] FALSE FALSE
## [2,] FALSE FALSE
## [3,] FALSE TRUE
# !is.na() is the negation: TRUE marks the non-missing cells
!is.na(dd)
## x y
## [1,] TRUE TRUE
## [2,] TRUE TRUE
## [3,] TRUE FALSE
# we can define a new column and preset it with NA
dd$new <- NA
# then overwrite it
dd$new <- seq_len(nrow(dd))
# we can set a specific value to NA
dd$new[2] <- NA
# Counting missings: create a sample data frame of grades with some NAs and
# count the missing values overall, per column, per row and per subject.
# create sample dataframe
dd <- data.frame(
  subj   = c(  1,   2,   3,   4,   5,   6,   7,   8,   9),
  uni    = c(  1,   1,   1,   2,   2,   2,   3,   3,   3),
  grade1 = c(1.0,  NA, 3.7, 1.3,  NA, 1.0, 3.3, 4.0,  NA),
  grade2 = c(4.0, 3.0, 1.3, 1.3, 1.0, 1.3, 2.7, 4.0, 3.3),
  grade3 = c(1.3,  NA, 2.7, 1.0, 1.3, 1.3, 2.3, 3.7, 3.0)
)
# is.na() on a data frame gives a logical (TRUE/FALSE) matrix
is.na(dd)
## subj uni grade1 grade2 grade3
## [1,] FALSE FALSE FALSE FALSE FALSE
## [2,] FALSE FALSE TRUE FALSE TRUE
## [3,] FALSE FALSE FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE FALSE FALSE
## [5,] FALSE FALSE TRUE FALSE FALSE
## [6,] FALSE FALSE FALSE FALSE FALSE
## [7,] FALSE FALSE FALSE FALSE FALSE
## [8,] FALSE FALSE FALSE FALSE FALSE
## [9,] FALSE FALSE TRUE FALSE FALSE
# total number of cells (9 rows x 5 columns), missing or not
length(is.na(dd))
## [1] 45
# number of missing cells: every TRUE counts as 1
sum(is.na(dd))
## [1] 4
# or using which(), which returns the positions of the TRUE cells
# (counted column by column)
which(is.na(dd))
## [1] 20 23 27 38
# and count it
length(which(is.na(dd)))
## [1] 4
# how about missings in a column
length(which(is.na(dd$grade1)))
## [1] 3
# or in a single row
length(which(is.na(dd[2, ])))
## [1] 2
# how many missings in grades per subject
apply(
  dd[, c("grade1", "grade2", "grade3")],
  1,
  function(x) length(which(is.na(x)))
)
## [1] 0 2 0 0 1 0 0 0 1
# a column with n of missings
dd$n_miss <- apply(
  dd[, c("grade1", "grade2", "grade3")],
  1,
  function(x) length(which(is.na(x)))
)
# delete all subjects with missing data in any column
na.omit(dd)
## subj uni grade1 grade2 grade3 n_miss
## 1 1 1 1.0 4.0 1.3 0
## 3 3 1 3.7 1.3 2.7 0
## 4 4 2 1.3 1.3 1.0 0
## 6 6 2 1.0 1.3 1.3 0
## 7 7 3 3.3 2.7 2.3 0
## 8 8 3 4.0 4.0 3.7 0
15.2 NA in functions
# NA handling in functions: recreate the sample data frame of grades and show
# how mean() and lm() treat missing values.
dd <- data.frame(
  subj   = c(  1,   2,   3,   4,   5,   6,   7,   8,   9),
  uni    = c(  1,   1,   1,   2,   2,   2,   3,   3,   3),
  grade1 = c(1.0,  NA, 3.7, 1.3,  NA, 1.0, 3.3, 4.0,  NA),
  grade2 = c(4.0, 3.0, 1.3, 1.3, 1.0, 1.3, 2.7, 4.0, 3.3),
  grade3 = c(1.3,  NA, 2.7, 1.0, 1.3, 1.3, 2.3, 3.7, 3.0)
)
# NAs and mean()
mean(dd$grade1) # has NA so the result is NA
## [1] NA
mean(dd$grade1, na.rm = TRUE) # works and includes all valid values
## [1] 2.383333
# NAs and lm()
lm(dd$grade1 ~ dd$grade2) # works, because na.action is set to na.omit by default
##
## Call:
## lm(formula = dd$grade1 ~ dd$grade2)
##
## Coefficients:
## (Intercept) dd$grade2
## 1.8563 0.2166
# lm(dd$grade1 ~ dd$grade2, na.action = na.fail) # commented out because it fails, we have NAs in the columns used
# NAs and gls()
# Fit a generalized least squares model on the grades data frame `dd` created
# above; rows with missing values have to be dropped explicitly.
# nlme::gls(grade1 ~ grade2 + grade3, data = dd) # commented out because it fails, default na.action is na.fail
# this works on the basis of all non-missing observations
nlme::gls(grade1 ~ grade2 + grade3, data = dd, na.action = na.omit)
## Generalized least squares fit by REML
## Model: grade1 ~ grade2 + grade3
## Data: dd
## Log-restricted-likelihood: -5.187343
##
## Coefficients:
## (Intercept) grade2 grade3
## 0.06439277 -0.22253135 1.39533342
##
## Degrees of freedom: 6 total; 3 residual
## Residual standard error: 0.5463859
15.3 Referenzen
Beispiele und Erklärungen: Unit miss