describe() computes descriptive statistics of numeric variables for exploratory data analysis.
describe(.data, ...)
# S3 method for data.frame
describe(.data, ..., statistics = NULL, quantiles = NULL)
# S3 method for grouped_df
describe(
.data,
...,
statistics = NULL,
quantiles = NULL,
all.combinations = FALSE
)
a data.frame, a tbl_df, or a grouped_df.
one or more unquoted expressions separated by commas. You can treat variable names as if they were positions. Positive values select variables; negative values drop variables. If the first expression is negative, describe() will automatically start with all variables. These arguments are automatically quoted and evaluated in a context where column names represent column positions. They support unquoting and splicing.
See vignette("EDA") for an introduction to these concepts.
character. The names of the descriptive statistics to calculate. The default is c("mean", "sd", "se_mean", "IQR", "skewness", "kurtosis", "quantiles")
numeric. The quantiles to calculate. Each value must be between 0 and 1. To calculate quantiles, you must also include "quantiles" in the statistics argument. The default is c(0, .01, .05, 0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9, 0.95, 0.99, 1).
logical. When used with group_by(), this argument controls whether every combination of the levels of the grouping variables appears in the output. If the argument value is TRUE, combinations that do not exist in the actual data are also included in the output.
An object of the same class as .data.
This function is useful when used with the group_by() function of the dplyr package.
If you want to calculate the statistics for each level of a categorical
variable you are interested in, rather than for the whole data set, you can
pass a grouped_df created with the group_by() function.
From version 0.5.5, the 'variable' column in the "descriptive statistic information" tibble object has been changed to 'described_variables'. This is because there are cases where 'variable' is included in the variable name of the data. There is probably no case where 'described_variables' is included in the variable name of the data.
The information derived by describing the numerical data is as follows.
n : number of observations excluding missing values
na : number of missing values
mean : arithmetic average
sd : standard deviation
se_mean : standard error of the mean. sd/sqrt(n)
IQR : interquartile range (Q3-Q1)
skewness : skewness
kurtosis : kurtosis
p25 : Q1. 25% percentile
p50 : median. 50% percentile
p75 : Q3. 75% percentile
p00, p01, p05, p10, p20, p30 : 0%, 1%, 5%, 10%, 20%, 30% percentiles
p40, p60, p70, p80 : 40%, 60%, 70%, 80% percentiles
p90, p95, p99, p100 : 90%, 95%, 99%, 100% percentiles
# \donttest{
# Generate data for the example
heartfailure2 <- heartfailure
heartfailure2[sample(seq(NROW(heartfailure2)), 20), "sodium"] <- NA
heartfailure2[sample(seq(NROW(heartfailure2)), 5), "smoking"] <- NA
# Describe descriptive statistics of numerical variables
describe(heartfailure2)
#> # A tibble: 7 × 26
#> described_variables n na mean sd se_mean IQR skewness
#> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 age 299 0 60.8 11.9 0.688 19 0.424
#> 2 cpk_enzyme 299 0 582. 970. 56.1 466. 4.46
#> 3 ejection_fraction 299 0 38.1 11.8 0.684 15 0.555
#> 4 platelets 299 0 263358. 97804. 5656. 91000 1.46
#> 5 creatinine 299 0 1.39 1.03 0.0598 0.5 4.46
#> 6 sodium 279 20 137. 4.46 0.267 5.5 -1.06
#> 7 time 299 0 130. 77.6 4.49 130 0.128
#> # ℹ 18 more variables: kurtosis <dbl>, p00 <dbl>, p01 <dbl>, p05 <dbl>,
#> # p10 <dbl>, p20 <dbl>, p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>,
#> # p60 <dbl>, p70 <dbl>, p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>,
#> # p99 <dbl>, p100 <dbl>
# Select the variable to describe
describe(heartfailure2, sodium, platelets, statistics = c("mean", "sd", "quantiles"))
#> # A tibble: 2 × 22
#> described_variables n na mean sd p00 p01 p05 p10
#> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 sodium 279 20 137. 4.46 113 123. 130 132
#> 2 platelets 299 0 263358. 97804. 25100 61780 131800 153000
#> # ℹ 13 more variables: p20 <dbl>, p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>,
#> # p60 <dbl>, p70 <dbl>, p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>,
#> # p99 <dbl>, p100 <dbl>
describe(heartfailure2, -sodium, -platelets)
#> # A tibble: 5 × 26
#> described_variables n na mean sd se_mean IQR skewness kurtosis
#> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 age 299 0 60.8 11.9 0.688 19 0.424 -0.184
#> 2 cpk_enzyme 299 0 582. 970. 56.1 466. 4.46 25.1
#> 3 ejection_fraction 299 0 38.1 11.8 0.684 15 0.555 0.0414
#> 4 creatinine 299 0 1.39 1.03 0.0598 0.5 4.46 25.8
#> 5 time 299 0 130. 77.6 4.49 130 0.128 -1.21
#> # ℹ 17 more variables: p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>, p20 <dbl>,
#> # p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>, p70 <dbl>,
#> # p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>, p100 <dbl>
describe(heartfailure2, 5, statistics = c("mean", "sd", "quantiles"), quantiles = c(0.01, 0.1))
#> # A tibble: 1 × 7
#> described_variables n na mean sd p01 p10
#> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 ejection_fraction 299 0 38.1 11.8 17.0 25
# Using dplyr::grouped_df
library(dplyr)
gdata <- group_by(heartfailure2, hblood_pressure, death_event)
describe(gdata, "creatinine")
#> # A tibble: 4 × 28
#> described_variables hblood_pressure death_event n na mean sd
#> <chr> <fct> <fct> <int> <int> <dbl> <dbl>
#> 1 creatinine No No 137 0 1.25 0.735
#> 2 creatinine No Yes 57 0 1.76 1.12
#> 3 creatinine Yes No 66 0 1.05 0.415
#> 4 creatinine Yes Yes 39 0 1.95 1.88
#> # ℹ 21 more variables: se_mean <dbl>, IQR <dbl>, skewness <dbl>,
#> # kurtosis <dbl>, p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>, p20 <dbl>,
#> # p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>, p70 <dbl>,
#> # p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>, p100 <dbl>
# Using pipes ---------------------------------
# Positive values select variables
heartfailure2 %>%
describe(platelets, sodium, creatinine)
#> # A tibble: 3 × 26
#> described_variables n na mean sd se_mean IQR skewness kurtosis
#> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 platelets 299 0 2.63e5 9.78e4 5.66e+3 9.1e+4 1.46 6.21
#> 2 sodium 279 20 1.37e2 4.46e0 2.67e-1 5.5e+0 -1.06 4.09
#> 3 creatinine 299 0 1.39e0 1.03e0 5.98e-2 5 e-1 4.46 25.8
#> # ℹ 17 more variables: p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>, p20 <dbl>,
#> # p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>, p70 <dbl>,
#> # p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>, p100 <dbl>
# Negative values drop variables
heartfailure2 %>%
describe(-platelets, -sodium, -creatinine)
#> # A tibble: 4 × 26
#> described_variables n na mean sd se_mean IQR skewness kurtosis
#> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 age 299 0 60.8 11.9 0.688 19 0.424 -0.184
#> 2 cpk_enzyme 299 0 582. 970. 56.1 466. 4.46 25.1
#> 3 ejection_fraction 299 0 38.1 11.8 0.684 15 0.555 0.0414
#> 4 time 299 0 130. 77.6 4.49 130 0.128 -1.21
#> # ℹ 17 more variables: p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>, p20 <dbl>,
#> # p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>, p70 <dbl>,
#> # p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>, p100 <dbl>
# Using pipes & dplyr -------------------------
# Find the statistics of all numerical variables by 'hblood_pressure' and 'death_event',
# and extract only the rows whose 'hblood_pressure' level is "Yes".
heartfailure2 %>%
group_by(hblood_pressure, death_event) %>%
describe() %>%
filter(hblood_pressure == "Yes")
#> # A tibble: 14 × 28
#> described_variables hblood_pressure death_event n na mean sd
#> <chr> <fct> <fct> <int> <int> <dbl> <dbl>
#> 1 age Yes No 66 0 60.1 9.42e+0
#> 2 age Yes Yes 39 0 66.1 1.37e+1
#> 3 cpk_enzyme Yes No 66 0 445. 5.89e+2
#> 4 cpk_enzyme Yes Yes 39 0 563. 1.21e+3
#> 5 creatinine Yes No 66 0 1.05 4.15e-1
#> 6 creatinine Yes Yes 39 0 1.95 1.88e+0
#> 7 ejection_fraction Yes No 66 0 41.7 1.20e+1
#> 8 ejection_fraction Yes Yes 39 0 33.1 1.14e+1
#> 9 platelets Yes No 66 0 267512. 7.77e+4
#> 10 platelets Yes Yes 39 0 274181. 9.61e+4
#> 11 sodium Yes No 60 6 137. 4.11e+0
#> 12 sodium Yes Yes 38 1 136. 4.08e+0
#> 13 time Yes No 66 0 141. 6.17e+1
#> 14 time Yes Yes 39 0 57.1 5.31e+1
#> # ℹ 21 more variables: se_mean <dbl>, IQR <dbl>, skewness <dbl>,
#> # kurtosis <dbl>, p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>, p20 <dbl>,
#> # p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>, p70 <dbl>,
#> # p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>, p100 <dbl>
# Using all.combinations = TRUE
heartfailure2 %>%
filter(!hblood_pressure %in% "Yes" | !death_event %in% "Yes") %>%
group_by(hblood_pressure, death_event) %>%
describe(all.combinations = TRUE)
#> # A tibble: 28 × 28
#> described_variables hblood_pressure death_event n na mean sd
#> <chr> <fct> <fct> <dbl> <int> <dbl> <dbl>
#> 1 age No No 137 0 58.1 11.2
#> 2 age No Yes 57 0 64.6 12.9
#> 3 age Yes No 66 0 60.1 9.42
#> 4 age Yes Yes 0 NA NA NA
#> 5 cpk_enzyme No No 137 0 586. 820.
#> 6 cpk_enzyme No Yes 57 0 743. 1390.
#> 7 cpk_enzyme Yes No 66 0 445. 589.
#> 8 cpk_enzyme Yes Yes 0 NA NA NA
#> 9 creatinine No No 137 0 1.25 0.735
#> 10 creatinine No Yes 57 0 1.76 1.12
#> # ℹ 18 more rows
#> # ℹ 21 more variables: se_mean <dbl>, IQR <dbl>, skewness <dbl>,
#> # kurtosis <dbl>, p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>, p20 <dbl>,
#> # p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>, p70 <dbl>,
#> # p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>, p100 <dbl>
# Extract only the rows whose 'smoking' level is "Yes",
# and find 'creatinine' statistics by 'hblood_pressure' and 'death_event'
heartfailure2 %>%
filter(smoking == "Yes") %>%
group_by(hblood_pressure, death_event) %>%
describe(creatinine)
#> # A tibble: 4 × 28
#> described_variables hblood_pressure death_event n na mean sd
#> <chr> <fct> <fct> <int> <int> <dbl> <dbl>
#> 1 creatinine No No 50 0 1.18 0.557
#> 2 creatinine No Yes 15 0 1.44 0.584
#> 3 creatinine Yes No 16 0 1.16 0.683
#> 4 creatinine Yes Yes 14 0 2.09 2.26
#> # ℹ 21 more variables: se_mean <dbl>, IQR <dbl>, skewness <dbl>,
#> # kurtosis <dbl>, p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>, p20 <dbl>,
#> # p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>, p70 <dbl>,
#> # p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>, p100 <dbl>
# }