diagnose_outlier() produces outlier information for diagnosing the quality of numerical data.

diagnose_outlier(.data, ...)

# S3 method for data.frame
diagnose_outlier(.data, ...)

# S3 method for grouped_df
diagnose_outlier(.data, ...)

Arguments

.data

a data.frame or a tbl_df or a grouped_df.

...

one or more unquoted expressions separated by commas. You can treat variable names like they are positions. Positive values select variables; negative values drop variables. If the first expression is negative, diagnose_outlier() will automatically start with all variables. These arguments are automatically quoted and evaluated in a context where column names represent column positions. They support unquoting and splicing.

Value

an object of tbl_df.

Details

The scope of the diagnosis is to provide outlier information. If the number of outliers is small and the difference between the averages including outliers and the averages not including them is large, it is necessary to eliminate or replace the outliers.

Outlier Diagnostic information

The information derived from the numerical data diagnosis is as follows.

  • variables : variable names

  • outliers_cnt : number of outliers

  • outliers_ratio : percent of outliers

  • outliers_mean : arithmetic average of outliers

  • with_mean : arithmetic average including outliers

  • without_mean : arithmetic average excluding outliers

See vignette("diagonosis") for an introduction to these concepts.

Examples

# \donttest{
# Diagnosis of numerical variables
diagnose_outlier(heartfailure)
#>           variables outliers_cnt outliers_ratio outliers_mean    with_mean
#> 1               age            0      0.0000000           NaN     60.82943
#> 2        cpk_enzyme           29      9.6989967  2.905414e+03    581.83946
#> 3 ejection_fraction            2      0.6688963  7.500000e+01     38.08361
#> 4         platelets           21      7.0234114  3.783857e+05 263358.02926
#> 5        creatinine           29      9.6989967  3.793103e+00      1.39388
#> 6            sodium            4      1.3377926  1.185000e+02    136.62542
#> 7              time            0      0.0000000           NaN    130.26087
#>   without_mean
#> 1 6.082943e+01
#> 2 3.322704e+02
#> 3 3.783502e+01
#> 4 2.546689e+05
#> 5 1.136185e+00
#> 6 1.368712e+02
#> 7 1.302609e+02

# Select the variable to diagnose
diagnose_outlier(heartfailure, cpk_enzyme, sodium)
#>    variables outliers_cnt outliers_ratio outliers_mean with_mean without_mean
#> 1 cpk_enzyme           29       9.698997      2905.414  581.8395     332.2704
#> 2     sodium            4       1.337793       118.500  136.6254     136.8712
diagnose_outlier(heartfailure, -cpk_enzyme, -sodium)
#>           variables outliers_cnt outliers_ratio outliers_mean    with_mean
#> 1               age            0      0.0000000           NaN     60.82943
#> 2 ejection_fraction            2      0.6688963  7.500000e+01     38.08361
#> 3         platelets           21      7.0234114  3.783857e+05 263358.02926
#> 4        creatinine           29      9.6989967  3.793103e+00      1.39388
#> 5              time            0      0.0000000           NaN    130.26087
#>   without_mean
#> 1 6.082943e+01
#> 2 3.783502e+01
#> 3 2.546689e+05
#> 4 1.136185e+00
#> 5 1.302609e+02
diagnose_outlier(heartfailure, "cpk_enzyme", "sodium")
#>    variables outliers_cnt outliers_ratio outliers_mean with_mean without_mean
#> 1 cpk_enzyme           29       9.698997      2905.414  581.8395     332.2704
#> 2     sodium            4       1.337793       118.500  136.6254     136.8712
diagnose_outlier(heartfailure, 5)
#> # A tibble: 1 × 6
#>   variables     outliers_cnt outliers_ratio outliers_mean with_mean without_mean
#>   <chr>                <int>          <dbl>         <dbl>     <dbl>        <dbl>
#> 1 ejection_fra…            2          0.669            75      38.1         37.8

# Using pipes ---------------------------------
library(dplyr)

# Diagnosis of all numerical variables
heartfailure %>%
  diagnose_outlier()
#>           variables outliers_cnt outliers_ratio outliers_mean    with_mean
#> 1               age            0      0.0000000           NaN     60.82943
#> 2        cpk_enzyme           29      9.6989967  2.905414e+03    581.83946
#> 3 ejection_fraction            2      0.6688963  7.500000e+01     38.08361
#> 4         platelets           21      7.0234114  3.783857e+05 263358.02926
#> 5        creatinine           29      9.6989967  3.793103e+00      1.39388
#> 6            sodium            4      1.3377926  1.185000e+02    136.62542
#> 7              time            0      0.0000000           NaN    130.26087
#>   without_mean
#> 1 6.082943e+01
#> 2 3.322704e+02
#> 3 3.783502e+01
#> 4 2.546689e+05
#> 5 1.136185e+00
#> 6 1.368712e+02
#> 7 1.302609e+02
# Positive values select variables
heartfailure %>%
  diagnose_outlier(cpk_enzyme, sodium)
#>    variables outliers_cnt outliers_ratio outliers_mean with_mean without_mean
#> 1 cpk_enzyme           29       9.698997      2905.414  581.8395     332.2704
#> 2     sodium            4       1.337793       118.500  136.6254     136.8712
# Negative values to drop variables
heartfailure %>%
  diagnose_outlier(-cpk_enzyme, -sodium)
#>           variables outliers_cnt outliers_ratio outliers_mean    with_mean
#> 1               age            0      0.0000000           NaN     60.82943
#> 2 ejection_fraction            2      0.6688963  7.500000e+01     38.08361
#> 3         platelets           21      7.0234114  3.783857e+05 263358.02926
#> 4        creatinine           29      9.6989967  3.793103e+00      1.39388
#> 5              time            0      0.0000000           NaN    130.26087
#>   without_mean
#> 1 6.082943e+01
#> 2 3.783502e+01
#> 3 2.546689e+05
#> 4 1.136185e+00
#> 5 1.302609e+02
# Positional values select variables
heartfailure %>%
  diagnose_outlier(5)
#> # A tibble: 1 × 6
#>   variables     outliers_cnt outliers_ratio outliers_mean with_mean without_mean
#>   <chr>                <int>          <dbl>         <dbl>     <dbl>        <dbl>
#> 1 ejection_fra…            2          0.669            75      38.1         37.8
# Negative positions drop variables
heartfailure %>%
  diagnose_outlier(-1, -5)
#>    variables outliers_cnt outliers_ratio outliers_mean    with_mean
#> 1 cpk_enzyme           29       9.698997  2.905414e+03    581.83946
#> 2  platelets           21       7.023411  3.783857e+05 263358.02926
#> 3 creatinine           29       9.698997  3.793103e+00      1.39388
#> 4     sodium            4       1.337793  1.185000e+02    136.62542
#> 5       time            0       0.000000           NaN    130.26087
#>   without_mean
#> 1 3.322704e+02
#> 2 2.546689e+05
#> 3 1.136185e+00
#> 4 1.368712e+02
#> 5 1.302609e+02

# Using pipes & dplyr -------------------------
# outliers_ratio is more than 1%
heartfailure %>%
  diagnose_outlier()  %>%
  filter(outliers_ratio > 1)
#>    variables outliers_cnt outliers_ratio outliers_mean    with_mean
#> 1 cpk_enzyme           29       9.698997  2.905414e+03    581.83946
#> 2  platelets           21       7.023411  3.783857e+05 263358.02926
#> 3 creatinine           29       9.698997  3.793103e+00      1.39388
#> 4     sodium            4       1.337793  1.185000e+02    136.62542
#>   without_mean
#> 1 3.322704e+02
#> 2 2.546689e+05
#> 3 1.136185e+00
#> 4 1.368712e+02
  
# Using group_by ------------------------------
# Calculate the diagnosis of all variables by 'death_event' using group_by()
heartfailure %>%
  group_by(death_event) %>% 
  diagnose_outlier() 
#> # A tibble: 14 × 8
#>    variables      death_event data_cnt outliers_cnt outliers_ratio outliers_mean
#>    <chr>          <fct>          <int>        <int>          <dbl>         <dbl>
#>  1 age            No               203            1          0.493         90   
#>  2 age            Yes               96            0          0            NaN   
#>  3 cpk_enzyme     No               203           20          9.85        2439.  
#>  4 cpk_enzyme     Yes               96            9          9.38        3942   
#>  5 ejection_frac… No               203            3          1.48          53   
#>  6 ejection_frac… Yes               96            7          7.29          62.4 
#>  7 platelets      No               203           13          6.40      433469.  
#>  8 platelets      Yes               96            2          2.08      559000   
#>  9 creatinine     No               203           21         10.3            2.64
#> 10 creatinine     Yes               96            8          8.33           5.82
#> 11 sodium         No               203            5          2.46         128   
#> 12 sodium         Yes               96            3          3.12         120.  
#> 13 time           No               203            0          0            NaN   
#> 14 time           Yes               96            2          2.08         238   
#> # ℹ 2 more variables: with_mean <dbl>, without_mean <dbl>
# }