# create three vectors containing: age, state, and diabetes status
# (one vector per variable; each vector holds a single type)
age_vec <- c(29, 35, 36, 21, 42, 39, 52, 35, 30, 44)
state_vec <- c("CA", "FL", "PA", "NY", "UT", "UT", "MT", "CO", "NV", "WY")
diabetes_vec <- c(TRUE, FALSE, FALSE, TRUE, TRUE,
                  FALSE, FALSE, TRUE, FALSE, FALSE)
Loading data
Creating a data set
Let’s create a dataset with three variables
If we only had a little bit of data, we could define a vector for each variable in our data
And we could place these three vectors into a single object called a “data frame”
# create a data frame called patient_data with data.frame
# with three columns: age, state, and diabetes
# (each column is built from one of the vectors defined earlier)
patient_data <- data.frame(age = age_vec,
                           state = state_vec,
                           diabetes = diabetes_vec)
# print out the data frame
patient_data
age state diabetes
1 29 CA TRUE
2 35 FL FALSE
3 36 PA FALSE
4 21 NY TRUE
5 42 UT TRUE
6 39 UT FALSE
7 52 MT FALSE
8 35 CO TRUE
9 30 NV FALSE
10 44 WY FALSE
You can look at a summary of the data frame by using the str() function:
# use str() to look at the data frame
str(patient_data)
'data.frame': 10 obs. of 3 variables:
$ age : num 29 35 36 21 42 39 52 35 30 44
$ state : chr "CA" "FL" "PA" "NY" ...
$ diabetes: logi TRUE FALSE FALSE TRUE TRUE FALSE ...
# what is the "class" of the data frame?
class(patient_data)
[1] "data.frame"
Each column in a data frame can have a different type, but each entry within a single column must be a single type (because each column corresponds to a vector).
CSV data files
CSV files are one of the simplest data formats.
CSV stands for “comma separated value”. In a CSV file:
Every entry in a row is separated by a comma
New rows are created by starting a new line
Take a look at the data/gapminder.csv file.
Loading CSV files
To load in a dataset (as a data frame) from a CSV file, we can use the read.csv() function:
# load the data/gapminder.csv file
# read.csv(file = "data/gapminder.csv")
# save it as gapminder
gapminder <- read.csv(file = "data/gapminder.csv")
The working directory
If R cannot find your file, you may be in the wrong working directory (the location in your computer where file-paths will start from).
If you opened an R project, then your working directory will be the location of the project folder.
To change the working directory, use the “Session > Set Working Directory” menu.
Let’s take a look at the gapminder data object
# print the gapminder object
# gapminder
Note that it prints out A LOT of data! Try to avoid printing entire datasets in your quarto document. Render your document to see why.
Summarizing a data frame
Instead of looking at the entire data frame, it is often easier to look at just the first few rows using the head()
function:
# use the head() function to look at gapminder
head(gapminder)
country continent year lifeExp pop gdpPercap
1 Afghanistan Asia 1952 28.801 8425333 779.4453
2 Afghanistan Asia 1957 30.332 9240934 820.8530
3 Afghanistan Asia 1962 31.997 10267083 853.1007
4 Afghanistan Asia 1967 34.020 11537966 836.1971
5 Afghanistan Asia 1972 36.088 13079460 739.9811
6 Afghanistan Asia 1977 38.438 14880372 786.1134
# look at the first 20 rows
head(gapminder, n = 20)
country continent year lifeExp pop gdpPercap
1 Afghanistan Asia 1952 28.801 8425333 779.4453
2 Afghanistan Asia 1957 30.332 9240934 820.8530
3 Afghanistan Asia 1962 31.997 10267083 853.1007
4 Afghanistan Asia 1967 34.020 11537966 836.1971
5 Afghanistan Asia 1972 36.088 13079460 739.9811
6 Afghanistan Asia 1977 38.438 14880372 786.1134
7 Afghanistan Asia 1982 39.854 12881816 978.0114
8 Afghanistan Asia 1987 40.822 13867957 852.3959
9 Afghanistan Asia 1992 41.674 16317921 649.3414
10 Afghanistan Asia 1997 41.763 22227415 635.3414
11 Afghanistan Asia 2002 42.129 25268405 726.7341
12 Afghanistan Asia 2007 43.828 31889923 974.5803
13 Albania Europe 1952 55.230 1282697 1601.0561
14 Albania Europe 1957 59.280 1476505 1942.2842
15 Albania Europe 1962 64.820 1728137 2312.8890
16 Albania Europe 1967 66.220 1984060 2760.1969
17 Albania Europe 1972 67.690 2263554 3313.4222
18 Albania Europe 1977 68.930 2509048 3533.0039
19 Albania Europe 1982 70.420 2780097 3630.8807
20 Albania Europe 1987 72.000 3075321 3738.9327
We can print out the column names:
# use colnames() to print out the column names
colnames(gapminder)
[1] "country" "continent" "year" "lifeExp" "pop" "gdpPercap"
We can ask how many rows and columns my data frame has:
# compute the number of rows (nrow)
nrow(gapminder)
[1] 1704
# compute the number of columns (ncol)
ncol(gapminder)
[1] 6
# look at the dimension of gapminder (dim)
dim(gapminder)
[1] 1704 6
Look at a summary
# use str() to look at a summary of gapminder
str(gapminder)
'data.frame': 1704 obs. of 6 variables:
$ country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
$ continent: chr "Asia" "Asia" "Asia" "Asia" ...
$ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
$ lifeExp : num 28.8 30.3 32 34 36.1 ...
$ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
$ gdpPercap: num 779 821 853 836 740 ...
# use summary() to look at a summary of gapminder
summary(gapminder)
country continent year lifeExp
Length:1704 Length:1704 Min. :1952 Min. :23.60
Class :character Class :character 1st Qu.:1966 1st Qu.:48.20
Mode :character Mode :character Median :1980 Median :60.71
Mean :1980 Mean :59.47
3rd Qu.:1993 3rd Qu.:70.85
Max. :2007 Max. :82.60
pop gdpPercap
Min. :6.001e+04 Min. : 241.2
1st Qu.:2.794e+06 1st Qu.: 1202.1
Median :7.024e+06 Median : 3531.8
Mean :2.960e+07 Mean : 7215.3
3rd Qu.:1.959e+07 3rd Qu.: 9325.5
Max. :1.319e+09 Max. :113523.1
Loading Excel data files into R
To load excel files, we need to install the readxl R package
R packages provide you with additional R functions.
You only ever need to install an R package ONCE. This is like installing an application on your computer.
# run in the console: install.packages("readxl")
But every time you want to use an R package in a new R session, you need to load the library using the library() function
# load the readxl R package
library(readxl)
Let’s load the gapminder excel dataset using a function from readxl.
# use read_excel() from readxl to load the data/gapminder.xls file
# and save it as gapminder_excel
gapminder_excel <- read_excel("data/gapminder.xls")
Note this will only load the first sheet. You can use the sheet argument to load other sheets.
# use the "sheet" argument to load in just the second sheet
# containing Australia's data
gapminder_excel_australia <- read_excel("data/gapminder.xls", sheet = 2)
Exercise
Load the world happiness dataset from the whr_2023.csv file. Save it as a variable called world_happiness. Then print out the first 10 rows and the column names, create a summary of the data, and report its dimension.
# load the data/whr_2023.csv file and save it as world_happiness
world_happiness <- read.csv("data/whr_2023.csv")
# look at first 10 rows
head(world_happiness, n = 10)
country_name year life_ladder log_GDP_per_capita social_support
1 Afghanistan 2005 NA NA NA
2 Afghanistan 2006 NA NA NA
3 Afghanistan 2007 NA NA NA
4 Afghanistan 2008 3.724 7.350 0.451
5 Afghanistan 2009 4.402 7.509 0.552
6 Afghanistan 2010 4.758 7.614 0.539
7 Afghanistan 2011 3.832 7.581 0.521
8 Afghanistan 2012 3.783 7.661 0.521
9 Afghanistan 2013 3.572 7.680 0.484
10 Afghanistan 2014 3.131 7.671 0.526
healthy_life_expectancy_at_birth freedom_to_make_life_choices generosity
1 NA NA NA
2 NA NA NA
3 NA NA NA
4 50.5 0.718 0.168
5 50.8 0.679 0.191
6 51.1 0.600 0.121
7 51.4 0.496 0.164
8 51.7 0.531 0.238
9 52.0 0.578 0.063
10 52.3 0.509 0.106
perceptions_of_corruption positive_affect negative_affect
1 NA NA NA
2 NA NA NA
3 NA NA NA
4 0.882 0.414 0.258
5 0.850 0.481 0.237
6 0.707 0.517 0.275
7 0.731 0.480 0.267
8 0.776 0.614 0.268
9 0.823 0.547 0.273
10 0.871 0.492 0.375
# column names
colnames(world_happiness)
[1] "country_name" "year"
[3] "life_ladder" "log_GDP_per_capita"
[5] "social_support" "healthy_life_expectancy_at_birth"
[7] "freedom_to_make_life_choices" "generosity"
[9] "perceptions_of_corruption" "positive_affect"
[11] "negative_affect"
# two possible summaries
str(world_happiness)
'data.frame': 2970 obs. of 11 variables:
$ country_name : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
$ year : int 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 ...
$ life_ladder : num NA NA NA 3.72 4.4 ...
$ log_GDP_per_capita : num NA NA NA 7.35 7.51 ...
$ social_support : num NA NA NA 0.451 0.552 0.539 0.521 0.521 0.484 0.526 ...
$ healthy_life_expectancy_at_birth: num NA NA NA 50.5 50.8 51.1 51.4 51.7 52 52.3 ...
$ freedom_to_make_life_choices : num NA NA NA 0.718 0.679 0.6 0.496 0.531 0.578 0.509 ...
$ generosity : num NA NA NA 0.168 0.191 0.121 0.164 0.238 0.063 0.106 ...
$ perceptions_of_corruption : num NA NA NA 0.882 0.85 0.707 0.731 0.776 0.823 0.871 ...
$ positive_affect : num NA NA NA 0.414 0.481 0.517 0.48 0.614 0.547 0.492 ...
$ negative_affect : num NA NA NA 0.258 0.237 0.275 0.267 0.268 0.273 0.375 ...
summary(world_happiness)
country_name year life_ladder log_GDP_per_capita
Length:2970 Min. :2005 Min. :1.281 Min. : 5.527
Class :character 1st Qu.:2009 1st Qu.:4.647 1st Qu.: 8.500
Mode :character Median :2014 Median :5.432 Median : 9.499
Mean :2014 Mean :5.479 Mean : 9.390
3rd Qu.:2018 3rd Qu.:6.309 3rd Qu.:10.373
Max. :2022 Max. :8.019 Max. :11.664
NA's :771 NA's :791
social_support healthy_life_expectancy_at_birth freedom_to_make_life_choices
Min. :0.2280 Min. : 6.72 Min. :0.2580
1st Qu.:0.7470 1st Qu.:59.12 1st Qu.:0.6562
Median :0.8360 Median :65.05 Median :0.7700
Mean :0.8107 Mean :63.29 Mean :0.7478
3rd Qu.:0.9050 3rd Qu.:68.50 3rd Qu.:0.8590
Max. :0.9870 Max. :74.47 Max. :0.9850
NA's :784 NA's :825 NA's :804
generosity perceptions_of_corruption positive_affect negative_affect
Min. :-0.3380 Min. :0.0350 Min. :0.1790 Min. :0.0830
1st Qu.:-0.1120 1st Qu.:0.6880 1st Qu.:0.5720 1st Qu.:0.2080
Median :-0.0230 Median :0.8000 Median :0.6630 Median :0.2610
Mean : 0.0001 Mean :0.7452 Mean :0.6521 Mean :0.2715
3rd Qu.: 0.0920 3rd Qu.:0.8690 3rd Qu.:0.7380 3rd Qu.:0.3230
Max. : 0.7030 Max. :0.9830 Max. :0.8840 Max. :0.7050
NA's :844 NA's :887 NA's :795 NA's :787
# the dimension
dim(world_happiness)
[1] 2970 11