整洁计算
tidyverse 代码之所以 “整洁、优雅”,是因为它内部采用了一套整洁计算框架。
- 数据屏蔽与整洁选择
- {{ }}(curly-curly 算符) : 若只是传递,可将 “冻结 + 注入” 合成一步
- enquo() 和!!(引用与反引用):不只是传递,冻结后还需要访问
{{}}运算符
- 自定义函数时,想要像 tidyverse 那样整洁传递变量名,需要用到{{ }}, 即用两个大括号将变量括起来:
library(tidyverse)
var_summary = function(data, var){
data %>%
summarise(n = n(), mean = mean({{var}}))
}
mtcars %>%
group_by(cyl) %>%
var_summary(mpg)
# A tibble: 3 × 3
cyl n mean
<dbl> <int> <dbl>
1 4 11 26.7
2 6 7 19.7
3 8 14 15.1
- 若想用字符串形式传递变量名,在访问数据时需要借助.data[[var]],这里.data 是代替数据集
var_summary = function(data, var){
data %>%
summarise(n = n(), mean = mean(.data[[var]]))
}
mtcars %>%
group_by(cyl) %>%
var_summary("mpg")
# A tibble: 3 × 3
cyl n mean
<dbl> <int> <dbl>
1 4 11 26.7
2 6 7 19.7
3 8 14 15.1
- 还可用于对列名向量做循环迭代,比如对因子型各列计算各水平值频数:
mtcars[, 9:10] %>%
names() %>%
map(~ count(mtcars, .data[[.x]]))
[[1]]
am n
1 0 19
2 1 13
[[2]]
gear n
1 3 15
2 4 12
3 5 5
- 将整洁选择作为函数参数传递,也需要用到 {{ }}
summarise_mean = function(data, vars){
data %>%
summarise(n = n(), across({{vars}}, mean))
}
mtcars %>%
group_by(cyl) %>%
summarise_mean(where(is.numeric))
# A tibble: 3 × 12
cyl n mpg disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 4 11 26.7 105. 82.6 4.07 2.29 19.1 0.909 0.727 4.09 1.55
2 6 7 19.7 183. 122. 3.59 3.12 18.0 0.571 0.429 3.86 3.43
3 8 14 15.1 353. 209. 3.23 4.00 16.8 0 0.143 3.29 3.5
- 若传递是多个列名构成的字符向量,则需要借助函数 all_of() 或 any_of()
vars = c("mpg", "vs")
mtcars %>% select(all_of(vars))
mtcars %>% select(!all_of(vars))
my_summarise = function(data, mean_var, sd_var) {
data %>%
summarise("mean_{{mean_var}}" := mean({{mean_var}}),
"sd_{{sd_var}}" := mean({{sd_var}}))
}
mtcars %>%
group_by(cyl) %>%
my_summarise(mpg, disp)
# A tibble: 3 × 3
cyl mean_mpg sd_disp
<dbl> <dbl> <dbl>
1 4 26.7 105.
2 6 19.7 183.
3 8 15.1 353.
my_summarise <- function(data, group_var, summarise_var) {
data %>%
group_by(across({{ group_var }})) %>%
summarise(across({{ summarise_var }}, mean,
.names = "mean_{.col}"
))
}
mtcars %>%
my_summarise(c(am, cyl), where(is.numeric))
`summarise()` has grouped output by 'am'. You can override using the `.groups`
argument.
# A tibble: 6 × 11
# Groups: am [2]
am cyl mean_mpg mean_disp mean_hp mean_…¹ mean_wt mean_…² mean_vs mean_…³
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 4 22.9 136. 84.7 3.77 2.94 21.0 1 3.67
2 0 6 19.1 205. 115. 3.42 3.39 19.2 1 3.5
3 0 8 15.0 358. 194. 3.12 4.10 17.1 0 3
4 1 4 28.1 93.6 81.9 4.18 2.04 18.4 0.875 4.25
5 1 6 20.6 155 132. 3.81 2.76 16.3 0 4.33
6 1 8 15.4 326 300. 3.88 3.37 14.6 0 5
# … with 1 more variable: mean_carb <dbl>, and abbreviated variable names
# ¹mean_drat, ²mean_qsec, ³mean_gear
var_summary <- function(data, var) {
data %>%
summarise(
n = n(),
!!enquo(var) := mean(.data[[var]])
)
}
mtcars %>%
group_by(cyl) %>%
var_summary("mpg")
# A tibble: 3 × 3
cyl n mpg
<dbl> <int> <dbl>
1 4 11 26.7
2 6 7 19.7
3 8 14 15.1
var_summary <- function(data, var) {
data %>%
summarise(
n = n(),
!!str_c("mean_", var) := mean(.data[[var]])
)
}
mtcars %>%
group_by(cyl) %>%
var_summary("mpg")
# A tibble: 3 × 3
cyl n mean_mpg
<dbl> <int> <dbl>
1 4 11 26.7
2 6 7 19.7
3 8 14 15.1
引用与反引用
引用与反引用将冻结和注入分成两步,在使用上更加灵活:
用 enquo() 让函数自动引用其参数
用!! 反引用该参数
以自定义计算分组均值函数为例:
grouped_mean <- function(data, summary_var, group_var) {
summary_var <- enquo(summary_var)
group_var <- enquo(group_var)
data %>%
group_by(!!group_var) %>%
summarise(mean = mean(!!summary_var))
}
grouped_mean(mtcars, mpg, cyl)
# A tibble: 3 × 2
cyl mean
<dbl> <dbl>
1 4 26.7
2 6 19.7
3 8 15.1
- 要想修改结果列名,可借助 as_label() 从引用中提取名字
grouped_mean <- function(data, summary_var, group_var) {
summary_var <- enquo(summary_var)
group_var <- enquo(group_var)
summary_nm <- str_c("mean_", as_label(summary_var))
group_nm <- str_c("group_", as_label(group_var))
data %>%
group_by(!!group_nm := !!group_var) %>%
summarise(!!summary_nm := mean(!!summary_var))
}
grouped_mean(mtcars, mpg, cyl)
# A tibble: 3 × 2
group_cyl mean_mpg
<dbl> <dbl>
1 4 26.7
2 6 19.7
3 8 15.1
- 要传递多个参数可以用特殊参数…。比如,还想让用于计算分组均值的group_var 可以是任意多个
grouped_mean <- function(.data, .summary_var, ...) {
summary_var <- enquo(.summary_var)
.data %>%
group_by(...) %>%
summarise(mean = mean(!!summary_var))
}
grouped_mean(mtcars, disp, cyl, am)
`summarise()` has grouped output by 'cyl'. You can override using the `.groups`
argument.
# A tibble: 6 × 3
# Groups: cyl [3]
cyl am mean
<dbl> <dbl> <dbl>
1 4 0 136.
2 4 1 93.6
3 6 0 205.
4 6 1 155
5 8 0 358.
6 8 1 326
- … 参数不需要做引用和反引用就能正确工作,但若要修改结果列名仍需要借助引用和反引用,但是要用 enques() 和!!!
grouped_mean <- function(.data, .summary_var, ...) {
summary_var <- enquo(.summary_var)
group_vars <- enquos(..., .named = TRUE)
summary_nm <- str_c("avg_", as_label(summary_var))
names(group_vars) <- str_c("groups_", names(group_vars))
.data %>%
group_by(!!!group_vars) %>%
summarise(!!summary_nm := mean(!!summary_var))
}
grouped_mean(mtcars, disp, cyl, am)
`summarise()` has grouped output by 'groups_cyl'. You can override using the
`.groups` argument.
# A tibble: 6 × 3
# Groups: groups_cyl [3]
groups_cyl groups_am avg_disp
<dbl> <dbl> <dbl>
1 4 0 136.
2 4 1 93.6
3 6 0 205.
4 6 1 155
5 8 0 358.
6 8 1 326
filter_fun = function(df, ...){
filter(df, ...)
}
mtcars %>%
filter_fun(mpg > 25 & disp > 90)
mpg cyl disp hp drat wt qsec vs am gear carb
Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.7 0 1 5 2
Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2
scatter_plot = function(df, x_var,y_var) {
x_var = enquo(x_var)
y_var = enquo(y_var)
ggplot(data = df, aes(x = !!x_var, y = !!y_var)) +
geom_point() +
theme_bw() +
# theme(plot.title = element_text(lineheight = 1, face = "b
geom_smooth() +
ggtitle(str_c(as_label(y_var), " vs. ", as_label(x_var)))
}
scatter_plot(mtcars, disp, hp)
鄂ICP备2022016232号-1