gapminder <- read.csv("data/gapminder.csv")
Data frames with base R
Working with data frames using “Base R”
Let’s load the gapminder dataset
To extract individual entries from a data frame using base R
# extract the entry in the 3rd row and 4th column
gapminder[3, 4]
[1] 31.997
# extract the entries in the 3rd and 4th rows and 4th column
gapminder[c(3, 4), 4]
[1] 31.997 34.020
# extract the entries in the 3rd and 4th rows and 4th and 5th columns
gapminder[c(3, 4), c(4, 5)]
lifeExp pop
3 31.997 10267083
4 34.020 11537966
What type of object are these?
Extracting entire columns from a data frame
There are many ways to extract a single column from a data frame:
# Extract the 4th column
head(gapminder[, 4])
[1] 28.801 30.332 31.997 34.020 36.088 38.438
# extract the lifeExp column (using [,] syntax)
head(gapminder[, "lifeExp"])
[1] 28.801 30.332 31.997 34.020 36.088 38.438
# extract the lifeExp column (using $ syntax)
head(gapminder$lifeExp)
[1] 28.801 30.332 31.997 34.020 36.088 38.438
What type of object are these?
What do you think the output of the following code will be?
head(gapminder[3])
year
1 1952
2 1957
3 1962
4 1967
5 1972
6 1977
head(gapminder["year"])
year
1 1952
2 1957
3 1962
4 1967
5 1972
6 1977
A data frame can be thought of as a collection (technically a “list”) of vectors, so the third entry is the third vector (i.e., the third column).
Notice the difference in the output between these two ways of extracting the third column:
head(gapminder[, 3])
[1] 1952 1957 1962 1967 1972 1977
head(gapminder[3])
year
1 1952
2 1957
3 1962
4 1967
5 1972
6 1977
To extract the third vector/column directly, you can use double square brackets `[[]]`.
This is actually list notation.
# extract the third column with `[[]]` using both numbered indexing and named indexing
head(gapminder[[3]])
[1] 1952 1957 1962 1967 1972 1977
head(gapminder[["year"]])
[1] 1952 1957 1962 1967 1972 1977
Exercise
- Extract the `gdpPercap` entries for the fourth and fifth rows
gapminder[c(4, 5), "gdpPercap"]
[1] 836.1971 739.9811
gapminder[c(4, 5), 6]
[1] 836.1971 739.9811
gapminder[c(4, 5), ncol(gapminder)]
[1] 836.1971 739.9811
- Extract the entire `lifeExp` column in as many different ways as you can (you may want to just look at the `head()` of your outputs).
# 7 ways of extracting the lifeExp column
head(gapminder[, 4])
[1] 28.801 30.332 31.997 34.020 36.088 38.438
head(gapminder[, "lifeExp"])
[1] 28.801 30.332 31.997 34.020 36.088 38.438
head(gapminder$lifeExp)
[1] 28.801 30.332 31.997 34.020 36.088 38.438
head(gapminder[4])
lifeExp
1 28.801
2 30.332
3 31.997
4 34.020
5 36.088
6 38.438
head(gapminder["lifeExp"])
lifeExp
1 28.801
2 30.332
3 31.997
4 34.020
5 36.088
6 38.438
head(gapminder[[4]])
[1] 28.801 30.332 31.997 34.020 36.088 38.438
head(gapminder[["lifeExp"]])
[1] 28.801 30.332 31.997 34.020 36.088 38.438
Using logical indexing
Let’s create a logical vector, called `is_aus`, that is `TRUE` when `country` is “Australia” and `FALSE` otherwise.
# create vector is_aus
is_aus <- gapminder$country == "Australia"
# test that is_aus contains at least some TRUE values
head(is_aus, 100)
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[97] FALSE FALSE FALSE FALSE
sum(is_aus)
[1] 12
Use `is_aus` to filter to just the rows for Australia.
# use is_aus to filter to just the rows for Australia
gapminder[is_aus, ]
country continent year lifeExp pop gdpPercap
61 Australia Oceania 1952 69.120 8691212 10039.60
62 Australia Oceania 1957 70.330 9712569 10949.65
63 Australia Oceania 1962 70.930 10794968 12217.23
64 Australia Oceania 1967 71.100 11872264 14526.12
65 Australia Oceania 1972 71.930 13177000 16788.63
66 Australia Oceania 1977 73.490 14074100 18334.20
67 Australia Oceania 1982 74.740 15184200 19477.01
68 Australia Oceania 1987 76.320 16257249 21888.89
69 Australia Oceania 1992 77.560 17481977 23424.77
70 Australia Oceania 1997 78.830 18565243 26997.94
71 Australia Oceania 2002 80.370 19546792 30687.75
72 Australia Oceania 2007 81.235 20434176 34435.37
Removing columns using negative indexing
You can use negative indexing to remove columns
# remove the third column from gapminder (don't overwrite gapminder)
head(gapminder[-3])
country continent lifeExp pop gdpPercap
1 Afghanistan Asia 28.801 8425333 779.4453
2 Afghanistan Asia 30.332 9240934 820.8530
3 Afghanistan Asia 31.997 10267083 853.1007
4 Afghanistan Asia 34.020 11537966 836.1971
5 Afghanistan Asia 36.088 13079460 739.9811
6 Afghanistan Asia 38.438 14880372 786.1134
# if you wanted to update the gapminder dataset:
# gapminder <- gapminder[-3]
Adding columns
You can also use the above syntaxes to add new columns
# create a copy of gapminder called gapminder_tmp
gapminder_tmp <- gapminder
# add a new column to gapminder_tmp called gdp, which is the product of pop and gdpPercap
gapminder_tmp$gdp <- gapminder_tmp$gdpPercap * gapminder_tmp$pop
# look at the head of gapminder_tmp
head(gapminder_tmp)
country continent year lifeExp pop gdpPercap gdp
1 Afghanistan Asia 1952 28.801 8425333 779.4453 6567086330
2 Afghanistan Asia 1957 30.332 9240934 820.8530 7585448670
3 Afghanistan Asia 1962 31.997 10267083 853.1007 8758855797
4 Afghanistan Asia 1967 34.020 11537966 836.1971 9648014150
5 Afghanistan Asia 1972 36.088 13079460 739.9811 9678553274
6 Afghanistan Asia 1977 38.438 14880372 786.1134 11697659231
Exercise
Modify the `lifeExp` column of `gapminder_tmp` so that it is rounded to the nearest integer (use `round()`).
Challenge: do this JUST for the Australia rows. Check your output by looking at just the country and lifeExp columns for the first 100 rows.
Hint: to undo any changes to `gapminder_tmp`, reassign it to the original gapminder object: `gapminder_tmp <- gapminder`
# for all rows:
gapminder_tmp$lifeExp <- round(gapminder_tmp$lifeExp)
head(gapminder_tmp)
country continent year lifeExp pop gdpPercap gdp
1 Afghanistan Asia 1952 29 8425333 779.4453 6567086330
2 Afghanistan Asia 1957 30 9240934 820.8530 7585448670
3 Afghanistan Asia 1962 32 10267083 853.1007 8758855797
4 Afghanistan Asia 1967 34 11537966 836.1971 9648014150
5 Afghanistan Asia 1972 36 13079460 739.9811 9678553274
6 Afghanistan Asia 1977 38 14880372 786.1134 11697659231
# reset gapminder_tmp
gapminder_tmp <- gapminder
gapminder_tmp[gapminder_tmp$country == "Australia", ]$lifeExp <- round(gapminder_tmp[gapminder_tmp$country == "Australia", ]$lifeExp)
# look at just the country and lifeExp columns for the first 100 rows
head(gapminder_tmp, 100)[, c("country", "lifeExp")]
country lifeExp
1 Afghanistan 28.801
2 Afghanistan 30.332
3 Afghanistan 31.997
4 Afghanistan 34.020
5 Afghanistan 36.088
6 Afghanistan 38.438
7 Afghanistan 39.854
8 Afghanistan 40.822
9 Afghanistan 41.674
10 Afghanistan 41.763
11 Afghanistan 42.129
12 Afghanistan 43.828
13 Albania 55.230
14 Albania 59.280
15 Albania 64.820
16 Albania 66.220
17 Albania 67.690
18 Albania 68.930
19 Albania 70.420
20 Albania 72.000
21 Albania 71.581
22 Albania 72.950
23 Albania 75.651
24 Albania 76.423
25 Algeria 43.077
26 Algeria 45.685
27 Algeria 48.303
28 Algeria 51.407
29 Algeria 54.518
30 Algeria 58.014
31 Algeria 61.368
32 Algeria 65.799
33 Algeria 67.744
34 Algeria 69.152
35 Algeria 70.994
36 Algeria 72.301
37 Angola 30.015
38 Angola 31.999
39 Angola 34.000
40 Angola 35.985
41 Angola 37.928
42 Angola 39.483
43 Angola 39.942
44 Angola 39.906
45 Angola 40.647
46 Angola 40.963
47 Angola 41.003
48 Angola 42.731
49 Argentina 62.485
50 Argentina 64.399
51 Argentina 65.142
52 Argentina 65.634
53 Argentina 67.065
54 Argentina 68.481
55 Argentina 69.942
56 Argentina 70.774
57 Argentina 71.868
58 Argentina 73.275
59 Argentina 74.340
60 Argentina 75.320
61 Australia 69.000
62 Australia 70.000
63 Australia 71.000
64 Australia 71.000
65 Australia 72.000
66 Australia 73.000
67 Australia 75.000
68 Australia 76.000
69 Australia 78.000
70 Australia 79.000
71 Australia 80.000
72 Australia 81.000
73 Austria 66.800
74 Austria 67.480
75 Austria 69.540
76 Austria 70.140
77 Austria 70.630
78 Austria 72.170
79 Austria 73.180
80 Austria 74.940
81 Austria 76.040
82 Austria 77.510
83 Austria 78.980
84 Austria 79.829
85 Bahrain 50.939
86 Bahrain 53.832
87 Bahrain 56.923
88 Bahrain 59.923
89 Bahrain 63.300
90 Bahrain 65.593
91 Bahrain 69.052
92 Bahrain 70.750
93 Bahrain 72.601
94 Bahrain 73.925
95 Bahrain 74.795
96 Bahrain 75.635
97 Bangladesh 37.484
98 Bangladesh 39.348
99 Bangladesh 41.216
100 Bangladesh 43.453