Function returns univariate data summaries for each variable supplied;
discrete and continuous variables are, however, treated separately. The
function provides a more pipe-friendly API for selecting and subsetting
variables using the dplyr syntax, though conditional statistics are evaluated
internally using the by function. Quantitative/continuous variable
information is kept distinct in the output, while summaries of discrete
variables (e.g., factors and character vectors)
can be returned by using the discrete argument.
descript(df, funs = get_descriptFuns(), discrete = FALSE)
get_descriptFuns()
df: a data.frame or tibble-like structure
containing the variables of interest.
Note that factor and character vectors will be treated as
discrete observations, and by default are omitted from the computation
of the quantitative descriptive statistics specified in funs. However,
setting discrete = TRUE will provide count-type information for these
discrete variables, in which case arguments to funs are ignored.
funs: functions to apply when discrete = FALSE. Can be modified
by the user to include or exclude further functions, however each supplied
function must return a scalar. Use get_descriptFuns() to return
the full list of functions, which may then be augmented or subsetted
based on the user's requirements. The default descriptive statistics returned are:
n: number of non-missing observations
miss: number of missing observations
mean: mean
trimmed: trimmed mean (10%)
sd: standard deviation
mad: median absolute deviation
skewness: skewness (from e1071)
kurtosis: kurtosis (from e1071)
min: minimum
Q_25: 25% quantile
Q_50: 50% quantile (a.k.a., the median)
Q_75: 75% quantile
max: maximum
Note that by default the na.rm behavior is set to TRUE
in each function call.
discrete: logical; include summary statistics for discrete
variables only? If TRUE then only count and proportion
information for the discrete variables will be returned.
Conditioning: As the function is intended to support
pipe-friendly code specifications, conditioning/group subset
specifications are declared using group_by
and subsequently passed to descript.
library(dplyr)
#>
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#>
#> filter, lag
#> The following objects are masked from ‘package:base’:
#>
#> intersect, setdiff, setequal, union
data(mtcars)
if(FALSE){
# run the following to see behavior with NA values in dataset
mtcars[sample(1:nrow(mtcars), 3), 'cyl'] <- NA
mtcars[sample(1:nrow(mtcars), 5), 'mpg'] <- NA
}
fmtcars <- within(mtcars, {
cyl <- factor(cyl)
am <- factor(am, labels=c('automatic', 'manual'))
vs <- factor(vs)
})
# with and without factor variables
mtcars |> descript()
#> # A tibble: 11 × 14
#> VARS n miss mean trimmed sd mad skewness kurtosis min
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 mpg 32 NA 20.1 19.7 6.03 5.41 0.611 -0.373 10.4
#> 2 cyl 32 NA 6.19 6.23 1.79 2.97 -0.175 -1.76 4
#> 3 disp 32 NA 231. 223. 124. 140. 0.382 -1.21 71.1
#> 4 hp 32 NA 147. 141. 68.6 77.1 0.726 -0.136 52
#> 5 drat 32 NA 3.60 3.58 0.535 0.704 0.266 -0.715 2.76
#> 6 wt 32 NA 3.22 3.15 0.978 0.767 0.423 -0.0227 1.51
#> 7 qsec 32 NA 17.8 17.8 1.79 1.42 0.369 0.335 14.5
#> 8 vs 32 NA 0.438 0.423 0.504 0 0.240 -2.00 0
#> 9 am 32 NA 0.406 0.385 0.499 0 0.364 -1.92 0
#> 10 gear 32 NA 3.69 3.62 0.738 1.48 0.529 -1.07 3
#> 11 carb 32 NA 2.81 2.65 1.62 1.48 1.05 1.26 1
#> # ℹ 4 more variables: Q_25 <dbl>, Q_50 <dbl>, Q_75 <dbl>, max <dbl>
fmtcars |> descript() # factors/discrete vars omitted
#> # A tibble: 8 × 14
#> VARS n miss mean trimmed sd mad skewness kurtosis min
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 mpg 32 NA 20.1 19.7 6.03 5.41 0.611 -0.373 10.4
#> 2 disp 32 NA 231. 223. 124. 140. 0.382 -1.21 71.1
#> 3 hp 32 NA 147. 141. 68.6 77.1 0.726 -0.136 52
#> 4 drat 32 NA 3.60 3.58 0.535 0.704 0.266 -0.715 2.76
#> 5 wt 32 NA 3.22 3.15 0.978 0.767 0.423 -0.0227 1.51
#> 6 qsec 32 NA 17.8 17.8 1.79 1.42 0.369 0.335 14.5
#> 7 gear 32 NA 3.69 3.62 0.738 1.48 0.529 -1.07 3
#> 8 carb 32 NA 2.81 2.65 1.62 1.48 1.05 1.26 1
#> # ℹ 4 more variables: Q_25 <dbl>, Q_50 <dbl>, Q_75 <dbl>, max <dbl>
fmtcars |> descript(discrete=TRUE) # discrete variables only
#> $cyl
#> # A tibble: 3 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 4 11 0.344
#> 2 6 7 0.219
#> 3 8 14 0.438
#>
#> $vs
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 0 18 0.562
#> 2 1 14 0.438
#>
#> $am
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 automatic 19 0.594
#> 2 manual 13 0.406
#>
# usual pipe chaining
fmtcars |> select(mpg, wt) |> descript()
#> # A tibble: 2 × 14
#> VARS n miss mean trimmed sd mad skewness kurtosis min Q_25
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 mpg 32 NA 20.1 19.7 6.03 5.41 0.611 -0.373 10.4 15.4
#> 2 wt 32 NA 3.22 3.15 0.978 0.767 0.423 -0.0227 1.51 2.58
#> # ℹ 3 more variables: Q_50 <dbl>, Q_75 <dbl>, max <dbl>
fmtcars |> filter(mpg > 20) |> select(mpg, wt) |> descript()
#> # A tibble: 2 × 14
#> VARS n miss mean trimmed sd mad skewness kurtosis min Q_25
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 mpg 14 NA 25.5 25.2 4.60 3.71 0.553 -1.38 21 21.4
#> 2 wt 14 NA 2.42 2.43 0.577 0.697 -0.0349 -1.47 1.51 1.99
#> # ℹ 3 more variables: Q_50 <dbl>, Q_75 <dbl>, max <dbl>
# conditioning with group_by()
fmtcars |> group_by(cyl) |> descript()
#> cyl: 4
#> # A tibble: 8 × 14
#> VARS n miss mean trimmed sd mad skewness kurtosis min Q_25
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 mpg 11 NA 26.7 26.4 4.51 6.52 0.259 -1.65 21.4 22.8
#> 2 disp 11 NA 105. 104. 26.9 43.0 0.121 -1.64 71.1 78.8
#> 3 hp 11 NA 82.6 82.7 20.9 32.6 0.00626 -1.71 52 65.5
#> 4 drat 11 NA 4.07 4.02 0.365 0.341 0.998 0.123 3.69 3.81
#> 5 wt 11 NA 2.29 2.27 0.570 0.541 0.300 -1.36 1.51 1.88
#> 6 qsec 11 NA 19.1 19.0 1.68 1.48 0.550 -0.0207 16.7 18.6
#> 7 gear 11 NA 4.09 4.11 0.539 0 0.115 -0.0106 3 4
#> 8 carb 11 NA 1.55 1.56 0.522 0 -0.158 -2.15 1 1
#> # ℹ 3 more variables: Q_50 <dbl>, Q_75 <dbl>, max <dbl>
#> ------------------------------------------------------------
#> cyl: 6
#> # A tibble: 8 × 14
#> VARS n miss mean trimmed sd mad skewness kurtosis min
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 mpg 7 NA 19.7 19.7 1.45 1.93 -0.158 -1.91 17.8
#> 2 disp 7 NA 183. 183. 41.6 11.3 0.795 -1.23 145
#> 3 hp 7 NA 122. 122. 24.3 7.41 1.36 0.249 105
#> 4 drat 7 NA 3.59 3.59 0.476 0.0297 -0.736 -1.40 2.76
#> 5 wt 7 NA 3.12 3.12 0.356 0.363 -0.222 -1.98 2.62
#> 6 qsec 7 NA 18.0 18.0 1.71 1.90 -0.125 -1.75 15.5
#> 7 gear 7 NA 3.86 3.86 0.690 0 0.106 -1.24 3
#> 8 carb 7 NA 3.43 3.43 1.81 0 -0.261 -1.50 1
#> # ℹ 4 more variables: Q_25 <dbl>, Q_50 <dbl>, Q_75 <dbl>, max <dbl>
#> ------------------------------------------------------------
#> cyl: 8
#> # A tibble: 8 × 14
#> VARS n miss mean trimmed sd mad skewness kurtosis min Q_25
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 mpg 14 NA 15.1 15.2 2.56 1.56 -0.363 -0.566 10.4 14.4
#> 2 disp 14 NA 353. 350. 67.8 73.4 0.453 -1.26 276. 302.
#> 3 hp 14 NA 209. 204. 51.0 44.5 0.909 0.0921 150 176.
#> 4 drat 14 NA 3.23 3.19 0.372 0.156 1.34 1.08 2.76 3.07
#> 5 wt 14 NA 4.00 3.95 0.759 0.408 0.988 -0.713 3.17 3.53
#> 6 qsec 14 NA 16.8 16.9 1.20 0.793 -0.805 -0.919 14.5 16.1
#> 7 gear 14 NA 3.29 3.17 0.726 0 1.83 1.45 3 3
#> 8 carb 14 NA 3.5 3.25 1.56 0.741 1.48 2.24 2 2.25
#> # ℹ 3 more variables: Q_50 <dbl>, Q_75 <dbl>, max <dbl>
fmtcars |> group_by(cyl, am) |> descript()
#> cyl: 4
#> am: automatic
#> # A tibble: 8 × 14
#> VARS n miss mean trimmed sd mad skewness kurtosis min Q_25
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 mpg 3 NA 22.9 22.9 1.45 1.93 0.0685 -2.33 21.5 22.2
#> 2 disp 3 NA 136. 136. 14.0 8.75 -0.309 -2.33 120. 130.
#> 3 hp 3 NA 84.7 84.7 19.7 2.97 -0.380 -2.33 62 78.5
#> 4 drat 3 NA 3.77 3.77 0.13 0.0148 0.382 -2.33 3.69 3.70
#> 5 wt 3 NA 2.94 2.94 0.408 0.0593 -0.381 -2.33 2.46 2.81
#> 6 qsec 3 NA 21.0 21.0 1.67 0.0148 0.385 -2.33 20 20.0
#> 7 gear 3 NA 3.67 3.67 0.577 0 -0.385 -2.33 3 3.5
#> 8 carb 3 NA 1.67 1.67 0.577 0 -0.385 -2.33 1 1.5
#> # ℹ 3 more variables: Q_50 <dbl>, Q_75 <dbl>, max <dbl>
#> ------------------------------------------------------------
#> cyl: 6
#> am: automatic
#> # A tibble: 8 × 14
#> VARS n miss mean trimmed sd mad skewness kurtosis min
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 mpg 4 NA 19.1 19.1 1.63 1.04 0.482 -1.91 17.8
#> 2 disp 4 NA 205. 205. 44.7 42.6 0.168 -2.25 168.
#> 3 hp 4 NA 115. 115. 9.18 9.64 -0.0940 -2.33 105
#> 4 drat 4 NA 3.42 3.42 0.592 0.623 -0.0926 -2.33 2.76
#> 5 wt 4 NA 3.39 3.39 0.116 0.0148 -0.735 -1.70 3.22
#> 6 qsec 4 NA 19.2 19.2 0.816 0.845 0.105 -2.02 18.3
#> 7 gear 4 NA 3.5 3.5 0.577 0.741 0 -2.44 3
#> 8 carb 4 NA 2.5 2.5 1.73 2.22 0 -2.44 1
#> # ℹ 4 more variables: Q_25 <dbl>, Q_50 <dbl>, Q_75 <dbl>, max <dbl>
#> ------------------------------------------------------------
#> cyl: 8
#> am: automatic
#> # A tibble: 8 × 14
#> VARS n miss mean trimmed sd mad skewness kurtosis min Q_25
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 mpg 12 NA 15.0 15.1 2.77 2.30 -0.284 -0.964 10.4 14.0
#> 2 disp 12 NA 358. 354. 71.8 96.5 0.303 -1.51 276. 297.
#> 3 hp 12 NA 194. 194. 33.4 40.8 0.279 -1.44 150 175
#> 4 drat 12 NA 3.12 3.10 0.230 0.111 1.17 1.64 2.76 3.05
#> 5 wt 12 NA 4.10 4.04 0.768 0.408 0.854 -1.14 3.44 3.56
#> 6 qsec 12 NA 17.1 17.2 0.802 0.593 -0.933 -0.338 15.4 17.0
#> 7 gear 12 NA 3 3 0 0 NaN NaN 3 3
#> 8 carb 12 NA 3.08 3.1 0.900 1.48 -0.141 -1.85 2 2
#> # ℹ 3 more variables: Q_50 <dbl>, Q_75 <dbl>, max <dbl>
#> ------------------------------------------------------------
#> cyl: 4
#> am: manual
#> # A tibble: 8 × 14
#> VARS n miss mean trimmed sd mad skewness kurtosis min Q_25
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 mpg 8 NA 28.1 28.1 4.48 4.74 -0.208 -1.66 21.4 25.2
#> 2 disp 8 NA 93.6 93.6 20.5 20.2 0.276 -1.89 71.1 78.0
#> 3 hp 8 NA 81.9 81.9 22.7 20.8 0.137 -1.81 52 65.8
#> 4 drat 8 NA 4.18 4.18 0.364 0.274 0.828 -0.472 3.77 4.02
#> 5 wt 8 NA 2.04 2.04 0.409 0.360 0.349 -1.15 1.51 1.78
#> 6 qsec 8 NA 18.4 18.4 1.13 0.860 -0.428 -1.39 16.7 18.1
#> 7 gear 8 NA 4.25 4.25 0.463 0 0.945 -1.21 4 4
#> 8 carb 8 NA 1.5 1.5 0.535 0.741 0 -2.23 1 1
#> # ℹ 3 more variables: Q_50 <dbl>, Q_75 <dbl>, max <dbl>
#> ------------------------------------------------------------
#> cyl: 6
#> am: manual
#> # A tibble: 8 × 14
#> VARS n miss mean trimmed sd mad skewness kurtosis min Q_25
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 mpg 3 NA 20.6 20.6 0.751 0 -0.385 -2.33 19.7 20.4
#> 2 disp 3 NA 155 155 8.66 0 -0.385 -2.33 145 152.
#> 3 hp 3 NA 132. 132. 37.5 0 0.385 -2.33 110 110
#> 4 drat 3 NA 3.81 3.81 0.162 0 -0.385 -2.33 3.62 3.76
#> 5 wt 3 NA 2.76 2.76 0.128 0.156 -0.115 -2.33 2.62 2.70
#> 6 qsec 3 NA 16.3 16.3 0.769 0.830 -0.168 -2.33 15.5 16.0
#> 7 gear 3 NA 4.33 4.33 0.577 0 0.385 -2.33 4 4
#> 8 carb 3 NA 4.67 4.67 1.15 0 0.385 -2.33 4 4
#> # ℹ 3 more variables: Q_50 <dbl>, Q_75 <dbl>, max <dbl>
#> ------------------------------------------------------------
#> cyl: 8
#> am: manual
#> # A tibble: 8 × 14
#> VARS n miss mean trimmed sd mad skewness kurtosis min
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 mpg 2 NA 15.4 15.4 0.566 0.593 0 -2.75 15
#> 2 disp 2 NA 326 326 35.4 37.1 0 -2.75 301
#> 3 hp 2 NA 300. 300. 50.2 52.6 0 -2.75 264
#> 4 drat 2 NA 3.88 3.88 0.481 0.504 0 -2.75 3.54
#> 5 wt 2 NA 3.37 3.37 0.283 0.297 -1.19e-15 -2.75 3.17
#> 6 qsec 2 NA 14.6 14.6 0.0707 0.0741 -1.89e-14 -2.75 14.5
#> 7 gear 2 NA 5 5 0 0 NaN NaN 5
#> 8 carb 2 NA 6 6 2.83 2.97 0 -2.75 4
#> # ℹ 4 more variables: Q_25 <dbl>, Q_50 <dbl>, Q_75 <dbl>, max <dbl>
# discrete variables also work with group_by()
fmtcars |> group_by(cyl) |> descript(discrete=TRUE)
#> cyl: 4
#> $vs
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 0 1 0.0909
#> 2 1 10 0.909
#>
#> $am
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 automatic 3 0.273
#> 2 manual 8 0.727
#>
#> ------------------------------------------------------------
#> cyl: 6
#> $vs
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 0 3 0.429
#> 2 1 4 0.571
#>
#> $am
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 automatic 4 0.571
#> 2 manual 3 0.429
#>
#> ------------------------------------------------------------
#> cyl: 8
#> $vs
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 0 14 1
#> 2 1 0 0
#>
#> $am
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 automatic 12 0.857
#> 2 manual 2 0.143
#>
fmtcars |> group_by(am) |> descript(discrete=TRUE)
#> am: automatic
#> $cyl
#> # A tibble: 3 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 4 3 0.158
#> 2 6 4 0.211
#> 3 8 12 0.632
#>
#> $vs
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 0 12 0.632
#> 2 1 7 0.368
#>
#> ------------------------------------------------------------
#> am: manual
#> $cyl
#> # A tibble: 3 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 4 8 0.615
#> 2 6 3 0.231
#> 3 8 2 0.154
#>
#> $vs
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 0 6 0.462
#> 2 1 7 0.538
#>
fmtcars |> group_by(cyl, am) |> descript(discrete=TRUE)
#> cyl: 4
#> am: automatic
#> $vs
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 0 0 0
#> 2 1 3 1
#>
#> ------------------------------------------------------------
#> cyl: 6
#> am: automatic
#> $vs
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 0 0 0
#> 2 1 4 1
#>
#> ------------------------------------------------------------
#> cyl: 8
#> am: automatic
#> $vs
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 0 12 1
#> 2 1 0 0
#>
#> ------------------------------------------------------------
#> cyl: 4
#> am: manual
#> $vs
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 0 1 0.125
#> 2 1 7 0.875
#>
#> ------------------------------------------------------------
#> cyl: 6
#> am: manual
#> $vs
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 0 3 1
#> 2 1 0 0
#>
#> ------------------------------------------------------------
#> cyl: 8
#> am: manual
#> $vs
#> # A tibble: 2 × 3
#> values count proportion
#> <fct> <int> <dbl>
#> 1 0 2 1
#> 2 1 0 0
#>
# only return a subset of summary statistics
funs <- get_descriptFuns()
sfuns <- funs[c('n', 'miss', 'mean', 'sd')] # subset
fmtcars |> descript(funs=sfuns) # only n, miss, mean, and sd
#> # A tibble: 8 × 5
#> VARS n miss mean sd
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 mpg 32 NA 20.1 6.03
#> 2 disp 32 NA 231. 124.
#> 3 hp 32 NA 147. 68.6
#> 4 drat 32 NA 3.60 0.535
#> 5 wt 32 NA 3.22 0.978
#> 6 qsec 32 NA 17.8 1.79
#> 7 gear 32 NA 3.69 0.738
#> 8 carb 32 NA 2.81 1.62
# add new functions
funs2 <- c(sfuns,
Q_5 = \(x) quantile(x, .05, na.rm=TRUE),
median= \(x) median(x, na.rm=TRUE),
Q_95 = \(x) quantile(x, .95, na.rm=TRUE))
fmtcars |> descript(funs=funs2)
#> # A tibble: 8 × 8
#> VARS n miss mean sd Q_5 median Q_95
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 mpg 32 NA 20.1 6.03 12.0 19.2 31.3
#> 2 disp 32 NA 231. 124. 77.4 196. 449
#> 3 hp 32 NA 147. 68.6 63.6 123 254.
#> 4 drat 32 NA 3.60 0.535 2.85 3.70 4.31
#> 5 wt 32 NA 3.22 0.978 1.74 3.32 5.29
#> 6 qsec 32 NA 17.8 1.79 15.0 17.7 20.1
#> 7 gear 32 NA 3.69 0.738 3 4 5
#> 8 carb 32 NA 2.81 1.62 1 2 4.9