treatment_corr() diagnoses pairs of highly correlated variables, or removes one variable from each such pair.
treatment_corr(.data, corr_thres = 0.8, treat = TRUE, verbose = TRUE)
a data.frame or a tbl_df
.
numeric. Threshold for detecting variables: a pair of variables is flagged when their correlation is greater than the threshold.
logical. Set whether to remove the detected variables.
logical. Set whether to echo information to the console at runtime.
An object of data.frame or train_df. The return value is an object of the same type as the .data argument. However, several variables may be excluded because of the correlation between variables.
Pearson's correlation coefficient is computed for continuous variables, and Spearman's correlation coefficient is computed for categorical variables.
# Build a 100-row example data set: three numeric and four categorical columns.
# Every random draw is preceded by its own set.seed() call so that the example
# is reproducible.

# numerical variables
x1 <- 1:100
# x2 and x3 are random integer multiples of x1. rnorm(1) draws a single value,
# so the same constant offset is added to every element.
set.seed(12L)
x2 <- sample(1:3, size = 100, replace = TRUE) * x1 + rnorm(1)
set.seed(1234L)
x3 <- sample(1:2, size = 100, replace = TRUE) * x1 + rnorm(1)
# categorical variables
# x4 cycles through the first 20 letters five times.
x4 <- factor(rep(letters[1:20], times = 5))
# x5 and x6 shift each of the 20 letter positions by a random amount
# (1 to 6 and 1 to 3 positions, respectively) and repeat that pattern five
# times.
set.seed(100L)
x5 <- factor(
  rep(letters[1:20 + sample(1:6, size = 20, replace = TRUE)], times = 5)
)
set.seed(200L)
x6 <- factor(
  rep(letters[1:20 + sample(1:3, size = 20, replace = TRUE)], times = 5)
)
# x7 is sampled from five letters without reference to the other columns.
set.seed(300L)
x7 <- factor(sample(letters[1:5], size = 100, replace = TRUE))
exam <- data.frame(x1, x2, x3, x4, x5, x6, x7)
str(exam)
#> 'data.frame': 100 obs. of 7 variables:
#> $ x1: int 1 2 3 4 5 6 7 8 9 10 ...
#> $ x2: num 2.55 4.55 9.55 12.55 10.55 ...
#> $ x3: num 0.194 2.194 4.194 6.194 3.194 ...
#> $ x4: Factor w/ 20 levels "a","b","c","d",..: 1 2 3 4 5 6 7 8 9 10 ...
#> $ x5: Factor w/ 13 levels "c","e","f","g",..: 1 5 3 2 4 7 6 8 9 8 ...
#> $ x6: Factor w/ 15 levels "c","d","f","g",..: 1 2 3 4 3 5 6 7 8 9 ...
#> $ x7: Factor w/ 5 levels "a","b","c","d",..: 2 2 1 4 5 1 4 3 1 5 ...
head(exam)
#> x1 x2 x3 x4 x5 x6 x7
#> 1 1 2.554297 0.1939687 a c c b
#> 2 2 4.554297 2.1939687 b h d b
#> 3 3 9.554297 4.1939687 c f f a
#> 4 4 12.554297 6.1939687 d e g d
#> 5 5 10.554297 3.1939687 e g f e
#> 6 6 6.554297 10.1939687 f l h a
# default case
treatment_corr(exam)
#> * remove variables whose strong correlation (pearson >= 0.8)
#> - remove x1 : with x3 (0.825)
#> * remove variables whose strong correlation (spearman >= 0.8)
#> - remove x4 : with x5 (0.9649)
#> - remove x4 : with x6 (0.9928)
#> - remove x5 : with x6 (0.9485)
#> x2 x3 x6 x7
#> 1 2.554297 0.1939687 c b
#> 2 4.554297 2.1939687 d b
#> 3 9.554297 4.1939687 f a
#> 4 12.554297 6.1939687 g d
#> 5 10.554297 3.1939687 f e
#> 6 6.554297 10.1939687 h a
#> 7 7.554297 5.1939687 i d
#> 8 16.554297 6.1939687 j c
#> 9 27.554297 7.1939687 l a
#> 10 20.554297 18.1939687 m e
#> 11 11.554297 20.1939687 l d
#> 12 24.554297 22.1939687 n d
#> 13 13.554297 24.1939687 p c
#> 14 28.554297 12.1939687 q d
#> 15 30.554297 28.1939687 q d
#> 16 32.554297 30.1939687 q e
#> 17 51.554297 32.1939687 s a
#> 18 18.554297 16.1939687 t e
#> 19 19.554297 36.1939687 v a
#> 20 60.554297 38.1939687 v c
#> 21 42.554297 40.1939687 c d
#> 22 66.554297 42.1939687 d d
#> 23 46.554297 44.1939687 f a
#> 24 24.554297 46.1939687 g c
#> 25 75.554297 48.1939687 f d
#> 26 52.554297 50.1939687 h c
#> 27 27.554297 25.1939687 i a
#> 28 28.554297 54.1939687 j a
#> 29 29.554297 56.1939687 l b
#> 30 60.554297 58.1939687 m c
#> 31 93.554297 29.1939687 l c
#> 32 64.554297 62.1939687 n b
#> 33 99.554297 31.1939687 p e
#> 34 68.554297 32.1939687 q d
#> 35 105.554297 33.1939687 q a
#> 36 36.554297 70.1939687 q e
#> 37 37.554297 35.1939687 s d
#> 38 114.554297 74.1939687 t e
#> 39 39.554297 76.1939687 v e
#> 40 120.554297 38.1939687 v d
#> 41 123.554297 80.1939687 c a
#> 42 84.554297 40.1939687 d b
#> 43 129.554297 84.1939687 f d
#> 44 132.554297 86.1939687 g d
#> 45 45.554297 88.1939687 f b
#> 46 138.554297 44.1939687 h e
#> 47 47.554297 45.1939687 i c
#> 48 144.554297 46.1939687 j d
#> 49 98.554297 47.1939687 l c
#> 50 50.554297 98.1939687 m b
#> 51 153.554297 100.1939687 l b
#> 52 104.554297 102.1939687 n c
#> 53 53.554297 104.1939687 p c
#> 54 54.554297 106.1939687 q c
#> 55 55.554297 53.1939687 q a
#> 56 168.554297 110.1939687 q e
#> 57 114.554297 55.1939687 s b
#> 58 174.554297 56.1939687 t b
#> 59 59.554297 116.1939687 v c
#> 60 180.554297 58.1939687 v e
#> 61 183.554297 120.1939687 c b
#> 62 124.554297 122.1939687 d c
#> 63 126.554297 61.1939687 f c
#> 64 128.554297 126.1939687 g e
#> 65 130.554297 63.1939687 f e
#> 66 132.554297 64.1939687 h d
#> 67 67.554297 132.1939687 i d
#> 68 68.554297 66.1939687 j b
#> 69 207.554297 67.1939687 l a
#> 70 140.554297 138.1939687 m c
#> 71 213.554297 140.1939687 l a
#> 72 144.554297 70.1939687 n d
#> 73 73.554297 71.1939687 p a
#> 74 222.554297 72.1939687 q d
#> 75 75.554297 148.1939687 q c
#> 76 152.554297 150.1939687 q c
#> 77 154.554297 152.1939687 s e
#> 78 234.554297 76.1939687 t d
#> 79 158.554297 156.1939687 v d
#> 80 160.554297 78.1939687 v d
#> 81 162.554297 79.1939687 c d
#> 82 246.554297 162.1939687 d a
#> 83 166.554297 81.1939687 f e
#> 84 168.554297 166.1939687 g a
#> 85 255.554297 83.1939687 f b
#> 86 172.554297 170.1939687 h e
#> 87 87.554297 85.1939687 i b
#> 88 264.554297 174.1939687 j a
#> 89 267.554297 176.1939687 l d
#> 90 90.554297 88.1939687 m a
#> 91 91.554297 89.1939687 l b
#> 92 276.554297 90.1939687 n a
#> 93 279.554297 91.1939687 p d
#> 94 188.554297 186.1939687 q c
#> 95 190.554297 93.1939687 q e
#> 96 96.554297 190.1939687 q c
#> 97 97.554297 192.1939687 s a
#> 98 294.554297 96.1939687 t e
#> 99 198.554297 196.1939687 v d
#> 100 100.554297 198.1939687 v a
# not removing variables
treatment_corr(exam, treat = FALSE)
#> * remove variables whose strong correlation (pearson >= 0.8)
#> - remove x1 : with x3 (0.825)
#> * remove variables whose strong correlation (spearman >= 0.8)
#> - remove x4 : with x5 (0.9649)
#> - remove x4 : with x6 (0.9928)
#> - remove x5 : with x6 (0.9485)
# Set the threshold so that variables are detected when the correlation is greater than 0.9
treatment_corr(exam, corr_thres = 0.9, treat = FALSE)
#> * remove variables whose strong correlation (spearman >= 0.9)
#> - remove x4 : with x5 (0.9649)
#> - remove x4 : with x6 (0.9928)
#> - remove x5 : with x6 (0.9485)
# not verbose mode
treatment_corr(exam, verbose = FALSE)
#> x2 x3 x6 x7
#> 1 2.554297 0.1939687 c b
#> 2 4.554297 2.1939687 d b
#> 3 9.554297 4.1939687 f a
#> 4 12.554297 6.1939687 g d
#> 5 10.554297 3.1939687 f e
#> 6 6.554297 10.1939687 h a
#> 7 7.554297 5.1939687 i d
#> 8 16.554297 6.1939687 j c
#> 9 27.554297 7.1939687 l a
#> 10 20.554297 18.1939687 m e
#> 11 11.554297 20.1939687 l d
#> 12 24.554297 22.1939687 n d
#> 13 13.554297 24.1939687 p c
#> 14 28.554297 12.1939687 q d
#> 15 30.554297 28.1939687 q d
#> 16 32.554297 30.1939687 q e
#> 17 51.554297 32.1939687 s a
#> 18 18.554297 16.1939687 t e
#> 19 19.554297 36.1939687 v a
#> 20 60.554297 38.1939687 v c
#> 21 42.554297 40.1939687 c d
#> 22 66.554297 42.1939687 d d
#> 23 46.554297 44.1939687 f a
#> 24 24.554297 46.1939687 g c
#> 25 75.554297 48.1939687 f d
#> 26 52.554297 50.1939687 h c
#> 27 27.554297 25.1939687 i a
#> 28 28.554297 54.1939687 j a
#> 29 29.554297 56.1939687 l b
#> 30 60.554297 58.1939687 m c
#> 31 93.554297 29.1939687 l c
#> 32 64.554297 62.1939687 n b
#> 33 99.554297 31.1939687 p e
#> 34 68.554297 32.1939687 q d
#> 35 105.554297 33.1939687 q a
#> 36 36.554297 70.1939687 q e
#> 37 37.554297 35.1939687 s d
#> 38 114.554297 74.1939687 t e
#> 39 39.554297 76.1939687 v e
#> 40 120.554297 38.1939687 v d
#> 41 123.554297 80.1939687 c a
#> 42 84.554297 40.1939687 d b
#> 43 129.554297 84.1939687 f d
#> 44 132.554297 86.1939687 g d
#> 45 45.554297 88.1939687 f b
#> 46 138.554297 44.1939687 h e
#> 47 47.554297 45.1939687 i c
#> 48 144.554297 46.1939687 j d
#> 49 98.554297 47.1939687 l c
#> 50 50.554297 98.1939687 m b
#> 51 153.554297 100.1939687 l b
#> 52 104.554297 102.1939687 n c
#> 53 53.554297 104.1939687 p c
#> 54 54.554297 106.1939687 q c
#> 55 55.554297 53.1939687 q a
#> 56 168.554297 110.1939687 q e
#> 57 114.554297 55.1939687 s b
#> 58 174.554297 56.1939687 t b
#> 59 59.554297 116.1939687 v c
#> 60 180.554297 58.1939687 v e
#> 61 183.554297 120.1939687 c b
#> 62 124.554297 122.1939687 d c
#> 63 126.554297 61.1939687 f c
#> 64 128.554297 126.1939687 g e
#> 65 130.554297 63.1939687 f e
#> 66 132.554297 64.1939687 h d
#> 67 67.554297 132.1939687 i d
#> 68 68.554297 66.1939687 j b
#> 69 207.554297 67.1939687 l a
#> 70 140.554297 138.1939687 m c
#> 71 213.554297 140.1939687 l a
#> 72 144.554297 70.1939687 n d
#> 73 73.554297 71.1939687 p a
#> 74 222.554297 72.1939687 q d
#> 75 75.554297 148.1939687 q c
#> 76 152.554297 150.1939687 q c
#> 77 154.554297 152.1939687 s e
#> 78 234.554297 76.1939687 t d
#> 79 158.554297 156.1939687 v d
#> 80 160.554297 78.1939687 v d
#> 81 162.554297 79.1939687 c d
#> 82 246.554297 162.1939687 d a
#> 83 166.554297 81.1939687 f e
#> 84 168.554297 166.1939687 g a
#> 85 255.554297 83.1939687 f b
#> 86 172.554297 170.1939687 h e
#> 87 87.554297 85.1939687 i b
#> 88 264.554297 174.1939687 j a
#> 89 267.554297 176.1939687 l d
#> 90 90.554297 88.1939687 m a
#> 91 91.554297 89.1939687 l b
#> 92 276.554297 90.1939687 n a
#> 93 279.554297 91.1939687 p d
#> 94 188.554297 186.1939687 q c
#> 95 190.554297 93.1939687 q e
#> 96 96.554297 190.1939687 q c
#> 97 97.554297 192.1939687 s a
#> 98 294.554297 96.1939687 t e
#> 99 198.554297 196.1939687 v d
#> 100 100.554297 198.1939687 v a