Introduction#
We’re delighted to announce the release of dplyr 0.8.2 on CRAN 🍉 !
This is a minor maintenance release in the 0.8.* series, addressing a collection of
issues since the
0.8.1
and
0.8.0
versions.
top_n() and top_frac()#
top_n()
has been around for a long time in
dplyr
, as a convenient wrapper around
filter()
and
min_rank()
,
to select top (or bottom) entries in each group of a tibble.
In this release,
top_n()
is no longer
limited to a constant number of entries per group: its n argument is now quoted
to be evaluated later in the context of the group.
Here is the top half of the countries in each continent, i.e. n() / 2, in terms of life expectancy in 2007.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
gapminder::gapminder %>%
filter(year == 2007) %>%
group_by(continent) %>%
top_n(n() / 2, lifeExp)
#> # A tibble: 70 x 6
#> # Groups: continent [5]
#> country continent year lifeExp pop gdpPercap
#> <fct> <fct> <int> <dbl> <int> <dbl>
#> 1 Algeria Africa 2007 72.3 33333216 6223.
#> 2 Argentina Americas 2007 75.3 40301927 12779.
#> 3 Australia Oceania 2007 81.2 20434176 34435.
#> 4 Austria Europe 2007 79.8 8199783 36126.
#> 5 Bahrain Asia 2007 75.6 708573 29796.
#> 6 Belgium Europe 2007 79.4 10392226 33693.
#> 7 Benin Africa 2007 56.7 8078314 1441.
#> 8 Canada Americas 2007 80.7 33390141 36319.
#> 9 Chile Americas 2007 78.6 16284741 13172.
#> 10 China Asia 2007 73.0 1318683096 4959.
#> # … with 60 more rows
|
top_frac()
is a new convenience shortcut for
the top n percent, e.g. the top 50% of countries in each continent by life expectancy:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
gapminder::gapminder %>%
filter(year == 2007) %>%
group_by(continent) %>%
top_frac(0.5, lifeExp)
#> # A tibble: 70 x 6
#> # Groups: continent [5]
#> country continent year lifeExp pop gdpPercap
#> <fct> <fct> <int> <dbl> <int> <dbl>
#> 1 Algeria Africa 2007 72.3 33333216 6223.
#> 2 Argentina Americas 2007 75.3 40301927 12779.
#> 3 Australia Oceania 2007 81.2 20434176 34435.
#> 4 Austria Europe 2007 79.8 8199783 36126.
#> 5 Bahrain Asia 2007 75.6 708573 29796.
#> 6 Belgium Europe 2007 79.4 10392226 33693.
#> 7 Benin Africa 2007 56.7 8078314 1441.
#> 8 Canada Americas 2007 80.7 33390141 36319.
#> 9 Chile Americas 2007 78.6 16284741 13172.
#> 10 China Asia 2007 73.0 1318683096 4959.
#> # … with 60 more rows
|
tbl_vars() and group_cols()#
tbl_vars()
now returns a dplyr_sel_vars
object that keeps track of the grouping variables. This information powers
group_cols()
, which can now be used
in every function that uses tidy selection of columns.
Functions in the tidyverse and beyond may start to use the
tbl_vars()
/
group_cols()
duo,
starting from
tidyr
and this
pull request:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
# pak::pkg_install("tidyverse/tidyr#668")
iris %>%
group_by(Species) %>%
tidyr::gather("flower_att", "measurement", -group_cols())
#> # A tibble: 600 x 3
#> # Groups: Species [3]
#> Species flower_att measurement
#> <fct> <chr> <dbl>
#> 1 setosa Sepal.Length 5.1
#> 2 setosa Sepal.Length 4.9
#> 3 setosa Sepal.Length 4.7
#> 4 setosa Sepal.Length 4.6
#> 5 setosa Sepal.Length 5
#> 6 setosa Sepal.Length 5.4
#> 7 setosa Sepal.Length 4.6
#> 8 setosa Sepal.Length 5
#> 9 setosa Sepal.Length 4.4
#> 10 setosa Sepal.Length 4.9
#> # … with 590 more rows
|
group_split(), group_map(), group_modify()#
group_split()
always keeps
a ptype attribute to track the prototype of the splits.
1
2
3
4
5
6
7
8
9
10
|
mtcars %>%
group_by(cyl) %>%
filter(hp < 0) %>%
group_split()
#> list()
#> attr(,"ptype")
#> # A tibble: 0 x 11
#> # … with 11 variables: mpg <dbl>, cyl <dbl>, disp <dbl>, hp <dbl>,
#> # drat <dbl>, wt <dbl>, qsec <dbl>, vs <dbl>, am <dbl>, gear <dbl>,
#> # carb <dbl>
|
group_map()
and
group_modify()
benefit from this in the edge case where there are no groups.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
mtcars %>%
group_by(cyl) %>%
filter(hp < 0) %>%
group_map(~.x)
#> list()
#> attr(,"ptype")
#> # A tibble: 0 x 10
#> # … with 10 variables: mpg <dbl>, disp <dbl>, hp <dbl>, drat <dbl>,
#> # wt <dbl>, qsec <dbl>, vs <dbl>, am <dbl>, gear <dbl>, carb <dbl>
mtcars %>%
group_by(cyl) %>%
filter(hp < 0) %>%
group_modify(~.x)
#> # A tibble: 0 x 11
#> # Groups: cyl [0]
#> # … with 11 variables: cyl <dbl>, mpg <dbl>, disp <dbl>, hp <dbl>,
#> # drat <dbl>, wt <dbl>, qsec <dbl>, vs <dbl>, am <dbl>, gear <dbl>,
#> # carb <dbl>
|
Thanks#
Thanks to all contributors for this release.
@abirasathiy
,
@ajkroeg
,
@alejandroschuler
,
@anuj2054
,
@arider2
,
@arielfuentes
,
@artidata
,
@BenPVD
,
@bkmontgom
,
@brodieG
,
@cderv
,
@clanker
,
@clemenshug
,
@CSheehan1
,
@danielecook
,
@dannyparsons
,
@daskandalis
,
@davidbaniadam
,
@DavisVaughan
,
@deliciouslytyped
,
@earowang
,
@fkatharina
,
@hadley
,
@Hardervidertsie
,
@iago-pssjd
,
@IndrajeetPatil
,
@jackdolgin
,
@jangorecki
,
@jimhester
,
@jjesusfilho
,
@jonjhitchcock
,
@jxu
,
@krlmlr
,
@laresbernardo
,
@lionel-
,
@LukeGoodsell
,
@madmark81
,
@MarkusBerroth
,
@matheus-donato
,
@mattfidler
,
@MatthieuStigler
,
@md0u80c9
,
@michaelhogersosis
,
@MikeJohnPage
,
@MJL9588
,
@moodymudskipper
,
@mwillumz
,
@Nelson-Gon
,
@qdread
,
@randomgambit
,
@rcorty
,
@romainfrancois
,
@romatik
,
@spressi
,
@sstoeckl
,
@stephLH
,
@urskalbitzer
,
@vpanfilov
, and
@ZahraEconomist
.