R/correlate.R
, R/tbl_dbi.R
correlate.data.frame.Rd
The correlate() function computes the correlation coefficient for numerical or categorical data.
correlate(.data, ...)
# S3 method for data.frame
correlate(
.data,
...,
method = c("pearson", "kendall", "spearman", "cramer", "theil")
)
# S3 method for grouped_df
correlate(
.data,
...,
method = c("pearson", "kendall", "spearman", "cramer", "theil")
)
# S3 method for tbl_dbi
correlate(
.data,
...,
method = c("pearson", "kendall", "spearman", "cramer", "theil"),
in_database = FALSE,
collect_size = Inf
)
a data.frame or a grouped_df
or a tbl_dbi.
one or more unquoted expressions separated by commas. You can treat variable names like they are positions. Positive values select variables; negative values drop variables. If the first expression is negative, correlate() will automatically start with all variables. These arguments are automatically quoted and evaluated in a context where column names represent column positions. They support unquoting and splicing.
See vignette("EDA") for an introduction to these concepts.
a character string indicating which correlation coefficient (or covariance) is to be computed. The value can be abbreviated. For numerical variables, one of "pearson" (default), "kendall", or "spearman" can be used. For categorical variables, "cramer" and "theil" can be used. "cramer" computes Cramer's V statistic, "theil" computes Theil's U statistic.
Specifies whether to perform in-database operations. If TRUE, most operations are performed in the DBMS. If FALSE, table data is fetched into R and processed in memory. in_database = TRUE is not yet supported.
an integer. The number of data samples to fetch from the DBMS into R. Applies only if in_database = FALSE.
This function is useful when used with the group_by() function of the dplyr package.
If you want to compute the coefficients by level of the categorical data you are interested in,
rather than over all observations, you can pass a grouped_df
created with the group_by() function.
For numerical variables, this function computes the coefficients with the stats::cor() function using the use = "pairwise.complete.obs" option.
It also supports categorical variables, with Theil's U correlation coefficient and Cramer's V correlation coefficient.
It returns a data.frame with the following variables:
var1 : names of numerical variable
var2 : name of the corresponding numeric variable
coef_corr : Correlation coefficient
When method = "cramer", data.frame with the following variables is returned.
var1 : name of the categorical variable
var2 : name of the corresponding categorical variable
chisq : the value of the chi-squared test statistic
df : the degrees of freedom of the approximate chi-squared distribution of the test statistic
pval : the p-value for the test
coef_corr : Cramer's V correlation coefficient.
# \donttest{
# Correlation coefficients of all numerical variables
tab_corr <- correlate(heartfailure)
tab_corr
#> # A tibble: 42 × 3
#> var1 var2 coef_corr
#> <fct> <fct> <dbl>
#> 1 cpk_enzyme age -0.0814
#> 2 ejection_fraction age 0.0602
#> 3 platelets age -0.0525
#> 4 creatinine age 0.159
#> 5 sodium age -0.0459
#> 6 time age -0.224
#> 7 age cpk_enzyme -0.0814
#> 8 ejection_fraction cpk_enzyme -0.0441
#> 9 platelets cpk_enzyme 0.0245
#> 10 creatinine cpk_enzyme -0.0164
#> # … with 32 more rows
mat_corr <- summary(tab_corr)
#> * correlation type : generic
#> * variable type : numeric
#> * correlation method : pearson
#>
#> * Matrix of Correlation
#> age cpk_enzyme ejection_fraction platelets
#> age 1.00000000 -0.081406394 0.06019547 -0.05247529
#> cpk_enzyme -0.08140639 1.000000000 -0.04407955 0.02446339
#> ejection_fraction 0.06019547 -0.044079554 1.00000000 0.07217747
#> platelets -0.05247529 0.024463389 0.07217747 1.00000000
#> creatinine 0.15923697 -0.016408480 -0.01130247 -0.04119808
#> sodium -0.04591178 0.059550156 0.17590228 0.06212462
#> time -0.22426485 -0.009345653 0.04172924 0.01051391
#> creatinine sodium time
#> age 0.15923697 -0.04591178 -0.224264849
#> cpk_enzyme -0.01640848 0.05955016 -0.009345653
#> ejection_fraction -0.01130247 0.17590228 0.041729235
#> platelets -0.04119808 0.06212462 0.010513909
#> creatinine 1.00000000 -0.18909521 -0.149315418
#> sodium -0.18909521 1.00000000 0.087640000
#> time -0.14931542 0.08764000 1.000000000
mat_corr
#> age cpk_enzyme ejection_fraction platelets
#> age 1.00000000 -0.081406394 0.06019547 -0.05247529
#> cpk_enzyme -0.08140639 1.000000000 -0.04407955 0.02446339
#> ejection_fraction 0.06019547 -0.044079554 1.00000000 0.07217747
#> platelets -0.05247529 0.024463389 0.07217747 1.00000000
#> creatinine 0.15923697 -0.016408480 -0.01130247 -0.04119808
#> sodium -0.04591178 0.059550156 0.17590228 0.06212462
#> time -0.22426485 -0.009345653 0.04172924 0.01051391
#> creatinine sodium time
#> age 0.15923697 -0.04591178 -0.224264849
#> cpk_enzyme -0.01640848 0.05955016 -0.009345653
#> ejection_fraction -0.01130247 0.17590228 0.041729235
#> platelets -0.04119808 0.06212462 0.010513909
#> creatinine 1.00000000 -0.18909521 -0.149315418
#> sodium -0.18909521 1.00000000 0.087640000
#> time -0.14931542 0.08764000 1.000000000
plot(tab_corr)
# Select the variable to compute
correlate(heartfailure, "creatinine", "sodium")
#> # A tibble: 12 × 3
#> var1 var2 coef_corr
#> <fct> <fct> <dbl>
#> 1 creatinine age 0.159
#> 2 sodium age -0.0459
#> 3 creatinine cpk_enzyme -0.0164
#> 4 sodium cpk_enzyme 0.0596
#> 5 creatinine ejection_fraction -0.0113
#> 6 sodium ejection_fraction 0.176
#> 7 creatinine platelets -0.0412
#> 8 sodium platelets 0.0621
#> 9 sodium creatinine -0.189
#> 10 creatinine sodium -0.189
#> 11 creatinine time -0.149
#> 12 sodium time 0.0876
# Non-parametric correlation coefficient by kendall method
correlate(heartfailure, creatinine, method = "kendall")
#> # A tibble: 6 × 3
#> var1 var2 coef_corr
#> <fct> <fct> <dbl>
#> 1 creatinine age 0.191
#> 2 creatinine cpk_enzyme -0.0351
#> 3 creatinine ejection_fraction -0.130
#> 4 creatinine platelets -0.0357
#> 5 creatinine sodium -0.223
#> 6 creatinine time -0.110
# theil's U correlation coefficient (Uncertainty Coefficient)
tab_corr <- correlate(heartfailure, anaemia, hblood_pressure, method = "theil")
tab_corr
#> var1 var2 coef_corr
#> 1 hblood_pressure anaemia 0.0010925763
#> 2 anaemia diabetes 0.0001188941
#> 3 hblood_pressure diabetes 0.0001222019
#> 4 anaemia hblood_pressure 0.0010925763
#> 5 anaemia sex 0.0067211188
#> 6 hblood_pressure sex 0.0083558270
#> 7 anaemia smoking 0.0088747588
#> 8 hblood_pressure smoking 0.0024572165
#> 9 anaemia death_event 0.0033373058
#> 10 hblood_pressure death_event 0.0048836271
summary(tab_corr)
#> * correlation type : generic
#> * variable type : categorical
#> * correlation method : theil
#>
#> * Matrix of Correlation
#> anaemia diabetes hblood_pressure sex
#> anaemia 1.000000000 0.0001188941 0.001092576 0.006721119
#> hblood_pressure 0.001092576 0.0001222019 1.000000000 0.008355827
#> smoking death_event
#> anaemia 0.008874759 0.003337306
#> hblood_pressure 0.002457216 0.004883627
plot(tab_corr)
# Using dplyr::grouped_df
library(dplyr)
#>
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#>
#> filter, lag
#> The following objects are masked from ‘package:base’:
#>
#> intersect, setdiff, setequal, union
gdata <- group_by(heartfailure, smoking, death_event)
correlate(gdata)
#> # A tibble: 168 × 5
#> smoking death_event var1 var2 coef_corr
#> <fct> <fct> <fct> <fct> <dbl>
#> 1 No No cpk_enzyme age -0.0393
#> 2 No No ejection_fraction age 0.0749
#> 3 No No platelets age -0.0579
#> 4 No No creatinine age 0.199
#> 5 No No sodium age -0.0427
#> 6 No No time age -0.0193
#> 7 No No age cpk_enzyme -0.0393
#> 8 No No ejection_fraction cpk_enzyme -0.0819
#> 9 No No platelets cpk_enzyme 0.0610
#> 10 No No creatinine cpk_enzyme -0.0339
#> # … with 158 more rows
# Using pipes ---------------------------------
# Correlation coefficients of all numerical variables
heartfailure %>%
correlate()
#> # A tibble: 42 × 3
#> var1 var2 coef_corr
#> <fct> <fct> <dbl>
#> 1 cpk_enzyme age -0.0814
#> 2 ejection_fraction age 0.0602
#> 3 platelets age -0.0525
#> 4 creatinine age 0.159
#> 5 sodium age -0.0459
#> 6 time age -0.224
#> 7 age cpk_enzyme -0.0814
#> 8 ejection_fraction cpk_enzyme -0.0441
#> 9 platelets cpk_enzyme 0.0245
#> 10 creatinine cpk_enzyme -0.0164
#> # … with 32 more rows
# Non-parametric correlation coefficient by spearman method
heartfailure %>%
correlate(creatinine, sodium, method = "spearman")
#> # A tibble: 12 × 3
#> var1 var2 coef_corr
#> <fct> <fct> <dbl>
#> 1 creatinine age 0.271
#> 2 sodium age -0.101
#> 3 creatinine cpk_enzyme -0.0499
#> 4 sodium cpk_enzyme 0.0169
#> 5 creatinine ejection_fraction -0.178
#> 6 sodium ejection_fraction 0.162
#> 7 creatinine platelets -0.0510
#> 8 sodium platelets 0.0495
#> 9 sodium creatinine -0.300
#> 10 creatinine sodium -0.300
#> 11 creatinine time -0.161
#> 12 sodium time 0.0864
# ---------------------------------------------
# Correlation coefficient
# that eliminates redundant combination of variables
heartfailure %>%
correlate() %>%
filter(as.integer(var1) > as.integer(var2))
#> # A tibble: 21 × 3
#> var1 var2 coef_corr
#> <fct> <fct> <dbl>
#> 1 cpk_enzyme age -0.0814
#> 2 ejection_fraction age 0.0602
#> 3 platelets age -0.0525
#> 4 creatinine age 0.159
#> 5 sodium age -0.0459
#> 6 time age -0.224
#> 7 ejection_fraction cpk_enzyme -0.0441
#> 8 platelets cpk_enzyme 0.0245
#> 9 creatinine cpk_enzyme -0.0164
#> 10 sodium cpk_enzyme 0.0596
#> # … with 11 more rows
# Using pipes & dplyr -------------------------
# Compute the correlation coefficient of 'creatinine' variable by 'smoking'
# and 'death_event' variables. And extract only those whose absolute
# value of correlation coefficient is greater than or equal to 0.2
heartfailure %>%
group_by(smoking, death_event) %>%
correlate(creatinine) %>%
filter(abs(coef_corr) >= 0.2)
#> # A tibble: 7 × 5
#> smoking death_event var1 var2 coef_corr
#> <fct> <fct> <fct> <fct> <dbl>
#> 1 No Yes creatinine ejection_fraction 0.298
#> 2 Yes No creatinine ejection_fraction -0.201
#> 3 Yes No creatinine sodium -0.290
#> 4 Yes No creatinine time 0.246
#> 5 Yes Yes creatinine age 0.255
#> 6 Yes Yes creatinine sodium -0.286
#> 7 Yes Yes creatinine time -0.201
# Extract only those whose 'smoking' variable level is "Yes",
# and compute the correlation coefficient of 'creatinine' variable
# by 'hblood_pressure' and 'death_event' variables.
# Keep only negative correlation coefficients whose absolute value is greater than 0.5
heartfailure %>%
filter(smoking == "Yes") %>%
group_by(hblood_pressure, death_event) %>%
correlate(creatinine) %>%
filter(coef_corr < 0) %>%
filter(abs(coef_corr) > 0.5)
#> # A tibble: 1 × 5
#> hblood_pressure death_event var1 var2 coef_corr
#> <fct> <fct> <fct> <fct> <dbl>
#> 1 Yes Yes creatinine sodium -0.561
# }
# \donttest{
library(dplyr)
# connect DBMS
con_sqlite <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
# copy heartfailure to the DBMS with a table named TB_HEARTFAILURE
copy_to(con_sqlite, heartfailure, name = "TB_HEARTFAILURE", overwrite = TRUE)
# Using pipes ---------------------------------
# Correlation coefficients of all numerical variables
con_sqlite %>%
tbl("TB_HEARTFAILURE") %>%
correlate()
#> # A tibble: 42 × 3
#> var1 var2 coef_corr
#> <fct> <fct> <dbl>
#> 1 cpk_enzyme age -0.0814
#> 2 ejection_fraction age 0.0602
#> 3 platelets age -0.0525
#> 4 creatinine age 0.159
#> 5 sodium age -0.0459
#> 6 time age -0.224
#> 7 age cpk_enzyme -0.0814
#> 8 ejection_fraction cpk_enzyme -0.0441
#> 9 platelets cpk_enzyme 0.0245
#> 10 creatinine cpk_enzyme -0.0164
#> # … with 32 more rows
# Using pipes & dplyr -------------------------
# Compute the correlation coefficient of creatinine variable by 'hblood_pressure'
# and 'death_event' variables.
con_sqlite %>%
tbl("TB_HEARTFAILURE") %>%
group_by(hblood_pressure, death_event) %>%
correlate(creatinine)
#> # A tibble: 24 × 5
#> hblood_pressure death_event var1 var2 coef_corr
#> <chr> <chr> <fct> <fct> <dbl>
#> 1 No No creatinine age 0.164
#> 2 No No creatinine cpk_enzyme -0.0252
#> 3 No No creatinine ejection_fraction -0.0944
#> 4 No No creatinine platelets -0.0124
#> 5 No No creatinine sodium -0.252
#> 6 No No creatinine time -0.0382
#> 7 No Yes creatinine age -0.0200
#> 8 No Yes creatinine cpk_enzyme 0.0488
#> 9 No Yes creatinine ejection_fraction 0.170
#> 10 No Yes creatinine platelets -0.0202
#> # … with 14 more rows
# Disconnect DBMS
DBI::dbDisconnect(con_sqlite)
# }