Binning the Numeric Data

binning() converts a numeric variable into a categorical variable.

binning(
  x,
  nbins,
  type = c("quantile", "equal", "pretty", "kmeans", "bclust"),
  ordered = TRUE,
  labels = NULL,
  approxy.lab = TRUE
)

Arguments

x: numeric. numeric vector for binning.
nbins: integer. number of intervals (bins). optional; if missing, the number of bins is chosen with nclass.Sturges.
type: character. binning method. Choose from "quantile", "equal", "pretty", "kmeans" and "bclust". The "quantile" sets breaks at quantiles spaced at equal probability intervals. The "equal" sets breaks at equal-width intervals. The "pretty" chooses a number of breaks not necessarily equal to nbins using the base::pretty function. The "kmeans" uses the stats::kmeans function to generate the breaks. The "bclust" uses the e1071::bclust function to generate the breaks using bagged clustering. "kmeans" and "bclust" are implemented via the classInt::classIntervals() function.
ordered: logical. whether to build an ordered factor or not.
labels: character. the label names to use for each of the bins.
approxy.lab: logical. If TRUE, large number breaks are approximated to pretty numbers. If FALSE, the original breaks obtained by type are used.

Value

An object of class bins. The attributes of the bins class are as follows.

class : "bins"
type : binning type, "quantile", "equal", "pretty", "kmeans", "bclust".
breaks : breaks for binning. the cut points delimiting the intervals into which x is to be cut.
levels : levels of binned value.
raw : raw data, numeric vector corresponding to x argument.

Details

This function is useful when used with the mutate/transmute function of the dplyr package.

See vignette("transformation") for an introduction to these concepts.

Examples

# \donttest{
# Generate data for the example
heartfailure2 <- heartfailure
heartfailure2[sample(seq(NROW(heartfailure2)), 20), "platelets"] <- NA

# Binning the platelets variable. default type argument is "quantile"
bin <- binning(heartfailure2$platelets)
# Print bins class object
bin
#> binned type: quantile
#> number of bins: 10
#> x
#>      [25100,153000]     (153000,196400]     (196400,221000]     (221000,237000] 
#>                  30                  26                  31                  27 
#>     (237000,262000]   (262000,265933.3] (265933.3,289866.7]   (289866.7,323400] 
#>                  26                  27                  28                  28 
#>     (323400,376200]     (376200,850000]                <NA> 
#>                  28                  28                  20 

# Using labels argument
bin <- binning(heartfailure2$platelets, nbins = 4,
              labels = c("LQ1", "UQ1", "LQ3", "UQ3"))
bin
#> binned type: quantile
#> number of bins: 4
#> x
#>  LQ1  UQ1  LQ3  UQ3 <NA> 
#>   70   70   70   69   20 

# Using another type argument
bin <- binning(heartfailure2$platelets, nbins = 5, type = "equal")
bin
#> binned type: equal
#> number of bins: 5
#> x
#>  [25100,190080] (190080,355060] (355060,520040] (520040,685020] (685020,850000] 
#>              50             191              34               2               2 
#>            <NA> 
#>              20 
bin <- binning(heartfailure2$platelets, nbins = 5, type = "pretty")
bin
#> binned type: pretty
#> number of bins: 5
#> x
#>     [0,2e+05] (2e+05,4e+05] (4e+05,6e+05] (6e+05,8e+05] (8e+05,1e+06] 
#>            58           202            17             1             1 
#>          <NA> 
#>            20 
# "kmeans" and "bclust" are implemented via the classInt::classIntervals() function,
# so the classInt package must be installed to use them.
if (requireNamespace("classInt", quietly = TRUE)) {
  bin <- binning(heartfailure2$platelets, nbins = 5, type = "kmeans")
  bin
  bin <- binning(heartfailure2$platelets, nbins = 5, type = "bclust")
  bin
} else {
  cat("If you want to use this feature, you need to install the 'classInt' package.\n")
}
#> binned type: bclust
#> number of bins: 5
#> x
#>  [25100,112000] (112000,241500] (241500,437500] (437500,642500] (642500,850000] 
#>               9             106             151              11               2 
#>            <NA> 
#>              20 

x <- sample(1:1000, size = 50) * 12345679
bin <- binning(x)
bin
#> binned type: quantile
#> number of bins: 7
#> x
#> [1.728395e+08,1.930041e+09] (1.930041e+09,3.915344e+09] 
#>                           7                           7 
#> (3.915344e+09,5.285714e+09] (5.285714e+09,6.550265e+09] 
#>                           7                           8 
#> (6.550265e+09,8.548501e+09] (8.548501e+09,1.044386e+10] 
#>                           7                           7 
#> (1.044386e+10,1.207407e+10] 
#>                           7 
bin <- binning(x, approxy.lab = FALSE)
bin
#> binned type: quantile
#> number of bins: 7
#> x
#>    [172839506,1930041150]   (1930041150,3915343911]   (3915343911,5285714280] 
#>                         7                         7                         7 
#>   (5285714280,6550264544]   (6550264544,8548500873]  (8548500873,10443856545] 
#>                         8                         7                         7 
#> (10443856545,12074074062] 
#>                         7 

# extract binned results
extract(bin)
#>  [1] (3915343911,5285714280]   (1930041150,3915343911]  
#>  [3] [172839506,1930041150]    (5285714280,6550264544]  
#>  [5] (5285714280,6550264544]   (1930041150,3915343911]  
#>  [7] (8548500873,10443856545]  [172839506,1930041150]   
#>  [9] (8548500873,10443856545]  (3915343911,5285714280]  
#> [11] [172839506,1930041150]    (5285714280,6550264544]  
#> [13] (8548500873,10443856545]  (1930041150,3915343911]  
#> [15] (3915343911,5285714280]   (8548500873,10443856545] 
#> [17] [172839506,1930041150]    (3915343911,5285714280]  
#> [19] (10443856545,12074074062] [172839506,1930041150]   
#> [21] (6550264544,8548500873]   (8548500873,10443856545] 
#> [23] (6550264544,8548500873]   (6550264544,8548500873]  
#> [25] (1930041150,3915343911]   [172839506,1930041150]   
#> [27] (5285714280,6550264544]   (1930041150,3915343911]  
#> [29] (5285714280,6550264544]   (10443856545,12074074062]
#> [31] (10443856545,12074074062] (5285714280,6550264544]  
#> [33] (5285714280,6550264544]   (3915343911,5285714280]  
#> [35] (8548500873,10443856545]  (10443856545,12074074062]
#> [37] (10443856545,12074074062] (6550264544,8548500873]  
#> [39] (8548500873,10443856545]  (5285714280,6550264544]  
#> [41] (10443856545,12074074062] (1930041150,3915343911]  
#> [43] (6550264544,8548500873]   (3915343911,5285714280]  
#> [45] [172839506,1930041150]    (3915343911,5285714280]  
#> [47] (6550264544,8548500873]   (10443856545,12074074062]
#> [49] (1930041150,3915343911]   (6550264544,8548500873]  
#> 7 Levels: [172839506,1930041150] < ... < (10443856545,12074074062]

# -------------------------
# Using pipes & dplyr
# -------------------------
library(dplyr)
#> 
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#> 
#>     filter, lag
#> The following objects are masked from ‘package:base’:
#> 
#>     intersect, setdiff, setequal, union

# Compare binned frequency by death_event
heartfailure2 %>%
  mutate(platelets_bin = binning(heartfailure2$platelets) %>% 
           extract()) %>%
  group_by(death_event, platelets_bin) %>%
  summarise(freq = n(), .groups = "drop") %>%
  arrange(desc(freq)) %>%
  head(10)
#> # A tibble: 10 × 3
#>    death_event platelets_bin        freq
#>    <fct>       <ord>               <int>
#>  1 No          (265933.3,289866.7]    22
#>  2 No          (221000,237000]        21
#>  3 No          (289866.7,323400]      21
#>  4 No          (196400,221000]        20
#>  5 No          (237000,262000]        20
#>  6 No          (323400,376200]        19
#>  7 No          (376200,850000]        18
#>  8 No          (153000,196400]        17
#>  9 No          (262000,265933.3]      16
#> 10 No          [25100,153000]         15
# }

Arguments

Value

Details

See also

Examples