# in the console ONCE: install.packages("tidyverse")
# load the tidyverse library (or just dplyr)
# library(dplyr)
library(tidyverse)
Data frame manipulation with dplyr
Working with data frames using dplyr and the tidyverse
Let’s install and load the tidyverse package. Add some chunk options (#| warning: false
) to suppress noisy loading messages.
Use read_csv()
(a tidyverse version of read.csv()
) to load the gapminder dataset as a “tibble”
# use read_csv() to load gapminder
<- read_csv("data/gapminder.csv") gapminder
Rows: 1704 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): country, continent
dbl (4): year, lifeExp, pop, gdpPercap
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# look at gapminder (without head())
gapminder
# A tibble: 1,704 × 6
country continent year lifeExp pop gdpPercap
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
6 Afghanistan Asia 1977 38.4 14880372 786.
7 Afghanistan Asia 1982 39.9 12881816 978.
8 Afghanistan Asia 1987 40.8 13867957 852.
9 Afghanistan Asia 1992 41.7 16317921 649.
10 Afghanistan Asia 1997 41.8 22227415 635.
# ℹ 1,694 more rows
The dplyr functions that we’re going to use
select()
: extract columns from your data framefilter()
: filter to rows of your data frame based on a conditionmutate()
: add columns or modify columns in your data framesummarize()
: aggregate information in your columnsgroup_by()
: perform an operation separately for each entry in a categorical column
Select()
We can use the select() function to extract specific named columns from our data frame
# use select() to look at just the country, year, lifeExp columns
select(gapminder, country, year, lifeExp)
# A tibble: 1,704 × 3
country year lifeExp
<chr> <dbl> <dbl>
1 Afghanistan 1952 28.8
2 Afghanistan 1957 30.3
3 Afghanistan 1962 32.0
4 Afghanistan 1967 34.0
5 Afghanistan 1972 36.1
6 Afghanistan 1977 38.4
7 Afghanistan 1982 39.9
8 Afghanistan 1987 40.8
9 Afghanistan 1992 41.7
10 Afghanistan 1997 41.8
# ℹ 1,694 more rows
We can also remove columns:
# use select() to remove continent, year, and pop columns
select(gapminder, -continent, -year, -pop)
# A tibble: 1,704 × 3
country lifeExp gdpPercap
<chr> <dbl> <dbl>
1 Afghanistan 28.8 779.
2 Afghanistan 30.3 821.
3 Afghanistan 32.0 853.
4 Afghanistan 34.0 836.
5 Afghanistan 36.1 740.
6 Afghanistan 38.4 786.
7 Afghanistan 39.9 978.
8 Afghanistan 40.8 852.
9 Afghanistan 41.7 649.
10 Afghanistan 41.8 635.
# ℹ 1,694 more rows
Select can also help you rename columns
# use select() to rename lifeExp to life_exp and gdpPercap to gdp_per_cap
# (keeping all other columns)
select(gapminder, country, continent, year, life_exp = lifeExp, pop, gdp_per_cap = gdpPercap)
# A tibble: 1,704 × 6
country continent year life_exp pop gdp_per_cap
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
6 Afghanistan Asia 1977 38.4 14880372 786.
7 Afghanistan Asia 1982 39.9 12881816 978.
8 Afghanistan Asia 1987 40.8 13867957 852.
9 Afghanistan Asia 1992 41.7 16317921 649.
10 Afghanistan Asia 1997 41.8 22227415 635.
# ℹ 1,694 more rows
# use rename() to do the same thing more succinctly
rename(gapminder, life_exp = lifeExp, gdp_per_cap = gdpPercap)
# A tibble: 1,704 × 6
country continent year life_exp pop gdp_per_cap
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
6 Afghanistan Asia 1977 38.4 14880372 786.
7 Afghanistan Asia 1982 39.9 12881816 978.
8 Afghanistan Asia 1987 40.8 13867957 852.
9 Afghanistan Asia 1992 41.7 16317921 649.
10 Afghanistan Asia 1997 41.8 22227415 635.
# ℹ 1,694 more rows
The pipe |>
(formerly %>%
)
The pipe greatly improves the readability of our code.
The pipe syntax is: object |> function()
. This places the object to the left of |>
into the first argument of the function to the right of |>
.
The following two pieces of code are equivalent:
# apply head() to gapminder directly
head(gapminder)
# A tibble: 6 × 6
country continent year lifeExp pop gdpPercap
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
6 Afghanistan Asia 1977 38.4 14880372 786.
# apply head() to gapminder using the pipe
|> head() gapminder
# A tibble: 6 × 6
country continent year lifeExp pop gdpPercap
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
6 Afghanistan Asia 1977 38.4 14880372 786.
Another example:
# apply select to gapminder, year, and pop without the pipe
select(gapminder, year, pop)
# A tibble: 1,704 × 2
year pop
<dbl> <dbl>
1 1952 8425333
2 1957 9240934
3 1962 10267083
4 1967 11537966
5 1972 13079460
6 1977 14880372
7 1982 12881816
8 1987 13867957
9 1992 16317921
10 1997 22227415
# ℹ 1,694 more rows
# apply select to gapminder, year, and pop with the pipe
|> select(year, pop) |> head() gapminder
# A tibble: 6 × 2
year pop
<dbl> <dbl>
1 1952 8425333
2 1957 9240934
3 1962 10267083
4 1967 11537966
5 1972 13079460
6 1977 14880372
Note: the pipe |>
is now a part of the R programming language. Previously, you needed to load the magrittr, dplyr, or tidyverse libraries to access the pipe and it had a different symbol: %>%
(there are very minor differences in functionality). This still works, but it is now recommended that you use the newer “native” pipe syntax: |>
.
Exercise
- Use the pipe to remove the continent column from gapminder. Save the output as an object called
gapminder_tmp
.
<- gapminder |> select(-continent) gapminder_tmp
- Modify your code by adding another pipe to rename the gdpPercap column as “gdp_per_cap”. You do not need to define a new object.
<- gapminder |>
gapminder_tmp # remove the continent column
select(-continent) |>
# rename gdp per cap
rename(gdp_per_cap = gdpPercap)
- Try to do this using the base R syntax (this is kind of hard!)
# base R
<- gapminder[, -2]
gapminder_tmp $gdp_per_cap <- gapminder_tmp$gdpPercap
gapminder_tmp<- gapminder_tmp[, -5]
gapminder_tmp gapminder_tmp
# A tibble: 1,704 × 5
country year lifeExp pop gdp_per_cap
<chr> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan 1952 28.8 8425333 779.
2 Afghanistan 1957 30.3 9240934 821.
3 Afghanistan 1962 32.0 10267083 853.
4 Afghanistan 1967 34.0 11537966 836.
5 Afghanistan 1972 36.1 13079460 740.
6 Afghanistan 1977 38.4 14880372 786.
7 Afghanistan 1982 39.9 12881816 978.
8 Afghanistan 1987 40.8 13867957 852.
9 Afghanistan 1992 41.7 16317921 649.
10 Afghanistan 1997 41.8 22227415 635.
# ℹ 1,694 more rows
Filtering rows using filter()
The filter function lets you filter to specific rows based on a condition.
Let’s filter to just the data from Australia.
Recall that you can apply logical operations to a vector as follows:
# asking which entries in the country column are equal to "Australia"
head(gapminder$country == "Australia", 100)
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[97] FALSE FALSE FALSE FALSE
We can ask similar questions inside the filter function to use them to subset our rows. Note: no $
extraction required!
# Filtering just to Australia (without the pipe)
filter(gapminder, country == "Australia")
# A tibble: 12 × 6
country continent year lifeExp pop gdpPercap
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Australia Oceania 1952 69.1 8691212 10040.
2 Australia Oceania 1957 70.3 9712569 10950.
3 Australia Oceania 1962 70.9 10794968 12217.
4 Australia Oceania 1967 71.1 11872264 14526.
5 Australia Oceania 1972 71.9 13177000 16789.
6 Australia Oceania 1977 73.5 14074100 18334.
7 Australia Oceania 1982 74.7 15184200 19477.
8 Australia Oceania 1987 76.3 16257249 21889.
9 Australia Oceania 1992 77.6 17481977 23425.
10 Australia Oceania 1997 78.8 18565243 26998.
11 Australia Oceania 2002 80.4 19546792 30688.
12 Australia Oceania 2007 81.2 20434176 34435.
We can provide multiple conditions using a comma
# filter just to Australia and the years after 1970 (without the pipe)
filter(gapminder, country == "Australia", year > 1970)
# A tibble: 8 × 6
country continent year lifeExp pop gdpPercap
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Australia Oceania 1972 71.9 13177000 16789.
2 Australia Oceania 1977 73.5 14074100 18334.
3 Australia Oceania 1982 74.7 15184200 19477.
4 Australia Oceania 1987 76.3 16257249 21889.
5 Australia Oceania 1992 77.6 17481977 23425.
6 Australia Oceania 1997 78.8 18565243 26998.
7 Australia Oceania 2002 80.4 19546792 30688.
8 Australia Oceania 2007 81.2 20434176 34435.
We can write this code with the pipe
# filter just to Australia and the years after 1970 with the pipe
|> filter(country == "Australia", year > 1970) gapminder
# A tibble: 8 × 6
country continent year lifeExp pop gdpPercap
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Australia Oceania 1972 71.9 13177000 16789.
2 Australia Oceania 1977 73.5 14074100 18334.
3 Australia Oceania 1982 74.7 15184200 19477.
4 Australia Oceania 1987 76.3 16257249 21889.
5 Australia Oceania 1992 77.6 17481977 23425.
6 Australia Oceania 1997 78.8 18565243 26998.
7 Australia Oceania 2002 80.4 19546792 30688.
8 Australia Oceania 2007 81.2 20434176 34435.
Let’s combine filter and select
Filter to the continent of Africa, where the year is equal to 1992
Return just the country and lifeExp columns, where I rename lifeExp to be life_exp
# filter to the continent of Africa for the year 1992
# and then select just the country and lifeExp columns (with renamed life_exp)
|>
gapminder filter(continent == "Africa",
== 1992) |>
year select(country,
life_exp = lifeExp)
# A tibble: 52 × 2
country life_exp
<chr> <dbl>
1 Algeria 67.7
2 Angola 40.6
3 Benin 53.9
4 Botswana 62.7
5 Burkina Faso 50.3
6 Burundi 44.7
7 Cameroon 54.3
8 Central African Republic 49.4
9 Chad 51.7
10 Comoros 57.9
# ℹ 42 more rows
The order of operations can be fairly important.
# swap the filter and select steps above
|>
gapminder select(country,
life_exp = lifeExp) |>
filter(continent == "Africa",
== 1992) year
Error in `filter()`:
ℹ In argument: `continent == "Africa"`.
Caused by error:
! object 'continent' not found
Next, try to extract the data for Australia and Italy.
# extract the data for Australia and Italy
|> filter((country == "Australia") | (country == "Italy")) gapminder
# A tibble: 24 × 6
country continent year lifeExp pop gdpPercap
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Australia Oceania 1952 69.1 8691212 10040.
2 Australia Oceania 1957 70.3 9712569 10950.
3 Australia Oceania 1962 70.9 10794968 12217.
4 Australia Oceania 1967 71.1 11872264 14526.
5 Australia Oceania 1972 71.9 13177000 16789.
6 Australia Oceania 1977 73.5 14074100 18334.
7 Australia Oceania 1982 74.7 15184200 19477.
8 Australia Oceania 1987 76.3 16257249 21889.
9 Australia Oceania 1992 77.6 17481977 23425.
10 Australia Oceania 1997 78.8 18565243 26998.
# ℹ 14 more rows
Exercise
Filter gapminder to all countries on the Oceania continent for years 1987 and 1992 and select just the country, year, and gdpPercap columns (as gdp_per_cap).
Save the output in an object called gapminder_oceania.
<- gapminder |>
gapminder_oceania filter(continent == "Oceania", year %in% c(1987, 1992)) |>
select(country, year, gdp_per_cap = gdpPercap)
gapminder_oceania
# A tibble: 4 × 3
country year gdp_per_cap
<chr> <dbl> <dbl>
1 Australia 1987 21889.
2 Australia 1992 23425.
3 New Zealand 1987 19007.
4 New Zealand 1992 18363.
Adding and modifying columns using mutate()
Let’s use mutate to add a gdp
column to our data that is the product of pop
and gdpPercap
# use mutate to add a gdp column
|> mutate(gdp = pop * gdpPercap) gapminder
# A tibble: 1,704 × 7
country continent year lifeExp pop gdpPercap gdp
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779. 6567086330.
2 Afghanistan Asia 1957 30.3 9240934 821. 7585448670.
3 Afghanistan Asia 1962 32.0 10267083 853. 8758855797.
4 Afghanistan Asia 1967 34.0 11537966 836. 9648014150.
5 Afghanistan Asia 1972 36.1 13079460 740. 9678553274.
6 Afghanistan Asia 1977 38.4 14880372 786. 11697659231.
7 Afghanistan Asia 1982 39.9 12881816 978. 12598563401.
8 Afghanistan Asia 1987 40.8 13867957 852. 11820990309.
9 Afghanistan Asia 1992 41.7 16317921 649. 10595901589.
10 Afghanistan Asia 1997 41.8 22227415 635. 14121995875.
# ℹ 1,694 more rows
The code above hasn’t actually modified gapminder.
# print gapminder
gapminder
# A tibble: 1,704 × 6
country continent year lifeExp pop gdpPercap
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
6 Afghanistan Asia 1977 38.4 14880372 786.
7 Afghanistan Asia 1982 39.9 12881816 978.
8 Afghanistan Asia 1987 40.8 13867957 852.
9 Afghanistan Asia 1992 41.7 16317921 649.
10 Afghanistan Asia 1997 41.8 22227415 635.
# ℹ 1,694 more rows
Use mutate to round lifeExp to the nearest integer. Save the resulting df as gapminder_tmp
# use mutate to round lifeExp to the nearest integer.
# Save the resulting df as gapminder_tmp
<- gapminder |> mutate(lifeExp = round(lifeExp))
gapminder_tmp head(gapminder_tmp)
# A tibble: 6 × 6
country continent year lifeExp pop gdpPercap
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan Asia 1952 29 8425333 779.
2 Afghanistan Asia 1957 30 9240934 821.
3 Afghanistan Asia 1962 32 10267083 853.
4 Afghanistan Asia 1967 34 11537966 836.
5 Afghanistan Asia 1972 36 13079460 740.
6 Afghanistan Asia 1977 38 14880372 786.
Exercise
Modify gapminder_tmp
using mutate() so that it contains a “expected year of death” column, e.g., deathExp = year + lifeExp
. Overwrite your gapminder_tmp
object with this new version.
<- gapminder_tmp |> mutate(deathExp = year + lifeExp)
gapminder_tmp gapminder_tmp
# A tibble: 1,704 × 7
country continent year lifeExp pop gdpPercap deathExp
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan Asia 1952 29 8425333 779. 1981
2 Afghanistan Asia 1957 30 9240934 821. 1987
3 Afghanistan Asia 1962 32 10267083 853. 1994
4 Afghanistan Asia 1967 34 11537966 836. 2001
5 Afghanistan Asia 1972 36 13079460 740. 2008
6 Afghanistan Asia 1977 38 14880372 786. 2015
7 Afghanistan Asia 1982 40 12881816 978. 2022
8 Afghanistan Asia 1987 41 13867957 852. 2028
9 Afghanistan Asia 1992 42 16317921 649. 2034
10 Afghanistan Asia 1997 42 22227415 635. 2039
# ℹ 1,694 more rows
Summarizing data frames using summarize()
Let’s use summarize to compute the average lifeExp in the entire dataset
# compute the average lifeExp value
|> summarize(mean_life_exp = mean(lifeExp)) gapminder
# A tibble: 1 × 1
mean_life_exp
<dbl>
1 59.5
Combining summarize() with group_by()
Let’s compute the average life expectancy for each continent
# compute the average lifeExp value for each continent
|>
gapminder group_by(continent) |>
summarize(mean_life_exp = mean(lifeExp))
# A tibble: 5 × 2
continent mean_life_exp
<chr> <dbl>
1 Africa 48.9
2 Americas 64.7
3 Asia 60.1
4 Europe 71.9
5 Oceania 74.3
Exercise
Use group_by() and summarize() to compute the maximum population for each country.
|>
gapminder group_by(country) |>
summarize(max_pop = max(pop))
# A tibble: 142 × 2
country max_pop
<chr> <dbl>
1 Afghanistan 31889923
2 Albania 3600523
3 Algeria 33333216
4 Angola 12420476
5 Argentina 40301927
6 Australia 20434176
7 Austria 8199783
8 Bahrain 708573
9 Bangladesh 150448339
10 Belgium 10392226
# ℹ 132 more rows
Use group_by() with mutate()
Let’s add a mean_life_exp
column to our data frame for each continent
# create a mean_life_exp column containing the average lifeExp value for each continent
|>
gapminder group_by(continent) |>
mutate(mean_life_exp = mean(lifeExp))
# A tibble: 1,704 × 7
# Groups: continent [5]
country continent year lifeExp pop gdpPercap mean_life_exp
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779. 60.1
2 Afghanistan Asia 1957 30.3 9240934 821. 60.1
3 Afghanistan Asia 1962 32.0 10267083 853. 60.1
4 Afghanistan Asia 1967 34.0 11537966 836. 60.1
5 Afghanistan Asia 1972 36.1 13079460 740. 60.1
6 Afghanistan Asia 1977 38.4 14880372 786. 60.1
7 Afghanistan Asia 1982 39.9 12881816 978. 60.1
8 Afghanistan Asia 1987 40.8 13867957 852. 60.1
9 Afghanistan Asia 1992 41.7 16317921 649. 60.1
10 Afghanistan Asia 1997 41.8 22227415 635. 60.1
# ℹ 1,694 more rows
It’s really important to remember to ungroup()
your data frame after you have completed your group_by()
operation.
# try to summarize the mean gdpPercap value without ungrouping. Then ungroup!
|>
gapminder group_by(continent) |>
mutate(mean_life_exp = mean(lifeExp)) |>
ungroup() |>
summarize(mean(gdpPercap))
# A tibble: 1 × 1
`mean(gdpPercap)`
<dbl>
1 7215.
Exercise
Filter the gapminder
object to the countries in Asia after (and including) the year
2000, then compute the average lifeExp
for each country.
|>
gapminder filter(continent == "Asia", year >= 2000) |>
group_by(country) |>
summarize(mean_life_exp = mean(lifeExp))
# A tibble: 33 × 2
country mean_life_exp
<chr> <dbl>
1 Afghanistan 43.0
2 Bahrain 75.2
3 Bangladesh 63.0
4 Cambodia 58.2
5 China 72.5
6 Hong Kong, China 81.9
7 India 63.8
8 Indonesia 69.6
9 Iran 70.2
10 Iraq 58.3
# ℹ 23 more rows
Group_by multiple columns simultaneously
Let’s compute the average lifeExp for each continent-year combination
# compute the average life expectancy for each continent-year combination
|>
gapminder group_by(continent, year) |>
summarize(mean_life_exp = mean(lifeExp))
`summarise()` has grouped output by 'continent'. You can override using the
`.groups` argument.
# A tibble: 60 × 3
# Groups: continent [5]
continent year mean_life_exp
<chr> <dbl> <dbl>
1 Africa 1952 39.1
2 Africa 1957 41.3
3 Africa 1962 43.3
4 Africa 1967 45.3
5 Africa 1972 47.5
6 Africa 1977 49.6
7 Africa 1982 51.6
8 Africa 1987 53.3
9 Africa 1992 53.6
10 Africa 1997 53.6
# ℹ 50 more rows
Count
A really handy function for summarizing categorical (character/factor) variables is the count()
function.
# count the number of times each continent appears
|>
gapminder count(continent)
# A tibble: 5 × 2
continent n
<chr> <int>
1 Africa 624
2 Americas 300
3 Asia 396
4 Europe 360
5 Oceania 24
Arrange
You can arrange the rows of your data frame in ascending or descending order of the values of a column in your data using the arrange()
function.
# arrange the rows of gapminder in ascending order of lifeExp
|>
gapminder arrange(lifeExp)
# A tibble: 1,704 × 6
country continent year lifeExp pop gdpPercap
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Rwanda Africa 1992 23.6 7290203 737.
2 Afghanistan Asia 1952 28.8 8425333 779.
3 Gambia Africa 1952 30 284320 485.
4 Angola Africa 1952 30.0 4232095 3521.
5 Sierra Leone Africa 1952 30.3 2143249 880.
6 Afghanistan Asia 1957 30.3 9240934 821.
7 Cambodia Asia 1977 31.2 6978607 525.
8 Mozambique Africa 1952 31.3 6446316 469.
9 Sierra Leone Africa 1957 31.6 2295678 1004.
10 Burkina Faso Africa 1952 32.0 4469979 543.
# ℹ 1,694 more rows
# arrange the rows of gapminder in descending order of lifeExp
|>
gapminder arrange(desc(lifeExp))
# A tibble: 1,704 × 6
country continent year lifeExp pop gdpPercap
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Japan Asia 2007 82.6 127467972 31656.
2 Hong Kong, China Asia 2007 82.2 6980412 39725.
3 Japan Asia 2002 82 127065841 28605.
4 Iceland Europe 2007 81.8 301931 36181.
5 Switzerland Europe 2007 81.7 7554661 37506.
6 Hong Kong, China Asia 2002 81.5 6762476 30209.
7 Australia Oceania 2007 81.2 20434176 34435.
8 Spain Europe 2007 80.9 40448191 28821.
9 Sweden Europe 2007 80.9 9031088 33860.
10 Israel Asia 2007 80.7 6426679 25523.
# ℹ 1,694 more rows
Exercise
Use summarize()
to compute the median lifeExp for each country, and then arrange the countries in descending order of maximum pop
value.
|>
gapminder group_by(country) |>
summarize(mean_life_exp = median(lifeExp),
max_pop = max(pop)) |>
arrange(desc(max_pop))
# A tibble: 142 × 3
country mean_life_exp max_pop
<chr> <dbl> <dbl>
1 China 64.7 1318683096
2 India 55.4 1110396331
3 United States 74.0 301139947
4 Indonesia 54.4 223547000
5 Brazil 62.4 190010647
6 Pakistan 55.1 169270617
7 Bangladesh 48.5 150448339
8 Nigeria 45.2 135031164
9 Japan 76.2 127467972
10 Mexico 66.2 108700891
# ℹ 132 more rows