binning() converts a numeric variable into a categorical variable.
binning(
x,
nbins,
type = c("quantile", "equal", "pretty", "kmeans", "bclust"),
ordered = TRUE,
labels = NULL,
approxy.lab = TRUE
)
numeric. numeric vector for binning.
integer. number of intervals (bins). Optional; if missing, the number of bins is determined with nclass.Sturges.
character. binning method. Choose from "quantile", "equal", "pretty", "kmeans" and "bclust". "quantile" sets breaks at equally spaced quantiles, so that each bin holds roughly the same number of observations. "equal" sets breaks at equal-width intervals. "pretty" chooses a number of breaks not necessarily equal to nbins using the base::pretty function. "kmeans" uses the stats::kmeans function to generate the breaks. "bclust" uses the e1071::bclust function to generate the breaks using bagged clustering. "kmeans" and "bclust" are implemented via the classInt::classIntervals() function.
logical. whether to build an ordered factor or not.
character. the label names to use for each of the bins.
logical. If TRUE, breaks with large values are approximated to pretty numbers. If FALSE, the original breaks obtained by type are used.
An object of class "bins". The attributes of the bins class are as follows.
class : "bins"
type : binning type, "quantile", "equal", "pretty", "kmeans", "bclust".
breaks : break points used for binning, i.e. the boundaries of the intervals into which x is cut.
levels : levels of binned value.
raw : raw data, numeric vector corresponding to x argument.
This function is useful when used with the mutate/transmute function of the dplyr package.
See vignette("transformation") for an introduction to these concepts.
# \donttest{
# Generate data for the example
heartfailure2 <- heartfailure
heartfailure2[sample(seq(NROW(heartfailure2)), 20), "platelets"] <- NA
# Binning the platelets variable. default type argument is "quantile"
bin <- binning(heartfailure2$platelets)
# Print bins class object
bin
#> binned type: quantile
#> number of bins: 10
#> x
#> [25100,153000] (153000,196400] (196400,221000] (221000,237000]
#> 30 26 31 27
#> (237000,262000] (262000,265933.3] (265933.3,289866.7] (289866.7,323400]
#> 26 27 28 28
#> (323400,376200] (376200,850000] <NA>
#> 28 28 20
# Using labels argument
bin <- binning(heartfailure2$platelets, nbins = 4,
labels = c("LQ1", "UQ1", "LQ3", "UQ3"))
bin
#> binned type: quantile
#> number of bins: 4
#> x
#> LQ1 UQ1 LQ3 UQ3 <NA>
#> 70 70 70 69 20
# Using another type argument
bin <- binning(heartfailure2$platelets, nbins = 5, type = "equal")
bin
#> binned type: equal
#> number of bins: 5
#> x
#> [25100,190080] (190080,355060] (355060,520040] (520040,685020] (685020,850000]
#> 50 191 34 2 2
#> <NA>
#> 20
bin <- binning(heartfailure2$platelets, nbins = 5, type = "pretty")
bin
#> binned type: pretty
#> number of bins: 5
#> x
#> [0,2e+05] (2e+05,4e+05] (4e+05,6e+05] (6e+05,8e+05] (8e+05,1e+06]
#> 58 202 17 1 1
#> <NA>
#> 20
# "kmeans" and "bclust" are implemented via the classInt::classIntervals() function,
# so the classInt package must be installed to use them.
if (requireNamespace("classInt", quietly = TRUE)) {
bin <- binning(heartfailure2$platelets, nbins = 5, type = "kmeans")
bin
bin <- binning(heartfailure2$platelets, nbins = 5, type = "bclust")
bin
} else {
cat("If you want to use this feature, you need to install the 'classInt' package.\n")
}
#> binned type: bclust
#> number of bins: 5
#> x
#> [25100,112000] (112000,241500] (241500,437500] (437500,642500] (642500,850000]
#> 9 106 151 11 2
#> <NA>
#> 20
x <- sample(1:1000, size = 50) * 12345679
bin <- binning(x)
bin
#> binned type: quantile
#> number of bins: 7
#> x
#> [1.728395e+08,1.930041e+09] (1.930041e+09,3.915344e+09]
#> 7 7
#> (3.915344e+09,5.285714e+09] (5.285714e+09,6.550265e+09]
#> 7 8
#> (6.550265e+09,8.548501e+09] (8.548501e+09,1.044386e+10]
#> 7 7
#> (1.044386e+10,1.207407e+10]
#> 7
bin <- binning(x, approxy.lab = FALSE)
bin
#> binned type: quantile
#> number of bins: 7
#> x
#> [172839506,1930041150] (1930041150,3915343911] (3915343911,5285714280]
#> 7 7 7
#> (5285714280,6550264544] (6550264544,8548500873] (8548500873,10443856545]
#> 8 7 7
#> (10443856545,12074074062]
#> 7
# extract binned results
extract(bin)
#> [1] (3915343911,5285714280] (1930041150,3915343911]
#> [3] [172839506,1930041150] (5285714280,6550264544]
#> [5] (5285714280,6550264544] (1930041150,3915343911]
#> [7] (8548500873,10443856545] [172839506,1930041150]
#> [9] (8548500873,10443856545] (3915343911,5285714280]
#> [11] [172839506,1930041150] (5285714280,6550264544]
#> [13] (8548500873,10443856545] (1930041150,3915343911]
#> [15] (3915343911,5285714280] (8548500873,10443856545]
#> [17] [172839506,1930041150] (3915343911,5285714280]
#> [19] (10443856545,12074074062] [172839506,1930041150]
#> [21] (6550264544,8548500873] (8548500873,10443856545]
#> [23] (6550264544,8548500873] (6550264544,8548500873]
#> [25] (1930041150,3915343911] [172839506,1930041150]
#> [27] (5285714280,6550264544] (1930041150,3915343911]
#> [29] (5285714280,6550264544] (10443856545,12074074062]
#> [31] (10443856545,12074074062] (5285714280,6550264544]
#> [33] (5285714280,6550264544] (3915343911,5285714280]
#> [35] (8548500873,10443856545] (10443856545,12074074062]
#> [37] (10443856545,12074074062] (6550264544,8548500873]
#> [39] (8548500873,10443856545] (5285714280,6550264544]
#> [41] (10443856545,12074074062] (1930041150,3915343911]
#> [43] (6550264544,8548500873] (3915343911,5285714280]
#> [45] [172839506,1930041150] (3915343911,5285714280]
#> [47] (6550264544,8548500873] (10443856545,12074074062]
#> [49] (1930041150,3915343911] (6550264544,8548500873]
#> 7 Levels: [172839506,1930041150] < ... < (10443856545,12074074062]
# -------------------------
# Using pipes & dplyr
# -------------------------
library(dplyr)
#>
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#>
#> filter, lag
#> The following objects are masked from ‘package:base’:
#>
#> intersect, setdiff, setequal, union
# Compare binned frequency by death_event
heartfailure2 %>%
mutate(platelets_bin = binning(heartfailure2$platelets) %>%
extract()) %>%
group_by(death_event, platelets_bin) %>%
summarise(freq = n(), .groups = "drop") %>%
arrange(desc(freq)) %>%
head(10)
#> # A tibble: 10 × 3
#> death_event platelets_bin freq
#> <fct> <ord> <int>
#> 1 No (265933.3,289866.7] 22
#> 2 No (221000,237000] 21
#> 3 No (289866.7,323400] 21
#> 4 No (196400,221000] 20
#> 5 No (237000,262000] 20
#> 6 No (323400,376200] 19
#> 7 No (376200,850000] 18
#> 8 No (153000,196400] 17
#> 9 No (262000,265933.3] 16
#> 10 No [25100,153000] 15
# }