# Load libraries ----------------------------------------------------------

library(tidyverse)

# This package provides access to the World Bank's World Development Indicators
# (WDI) database (https://data.worldbank.org/)
library(WDI)

# This package provides a helpful countrycode() function that converts country
# names ("United States") into country codes ("US" or "USA")
library(countrycode)


# Load raw data -----------------------------------------------------------

# This is a list of indicators I want to download from the World Bank. The
# cryptic names come from the URLs of the different pages at the World Bank's
# website. For instance, data for "school enrollment, primary" is available at
# https://data.worldbank.org/indicator/SE.PRM.NENR (I found that page by
# searching for primary school enrollment at data.worldbank.org). That last part
# of the URL (SE.PRM.NENR) is the magic ID code for the variable.
indicators <- c(
  "SE.PRM.NENR",     # School enrollment, primary (% net)
  "SP.DYN.LE00.IN",  # Life expectancy
  "EG.ELC.ACCS.ZS",  # Access to electricity
  "SH.DYN.AIDS.ZS",  # HIV prevalence
  "EN.ATM.CO2E.PC",  # CO2 emissions
  "SI.POV.DDAY",     # Extreme poverty (% earning less than $2/day)
  "NY.GDP.PCAP.KD"   # GDP per capita
)

# The WDI() function connects to the World Bank's server and downloads data for
# all the indicators defined in the indicators list above. I only want data from
# 2015 here, so I limit the years with start and end. The extra = TRUE argument
# means that it'll also include other helpful details like region, aid status,
# etc. Without it, it would only download the indicators we listed.
wdi_raw <- WDI(
  country = "all",
  indicator = indicators,
  extra = TRUE,
  start = 2015,
  end = 2015
)

# Data from the UN's World Happiness Report is available at Kaggle:
# https://www.kaggle.com/unsdsn/world-happiness
# You have to download the data onto your computer and load it with read_csv()
# If you're using an RStudio project, put it somewhere in your project folder,
# like in a subfolder named data
happiness_raw <- read_csv("data/2015.csv")
# happiness_raw <- read_csv("~/Downloads/2015.csv")  # Read directly from my downloads folder
# happiness_raw <- read_csv("static/data/2015.csv")  # Read from where it is in this website thing


# Clean and combine data --------------------------------------------------

# First we clean up the raw World Bank data. It includes rows for regions, like
# the Middle East, so we filter those out (they're helpfully marked as
# "Aggregates" in the income column). Note that filter() also drops any row
# whose income is NA. Then we rename some of the ugly World Bank codes to actual
# words. Only a subset of the downloaded indicators is kept here; HIV
# prevalence, CO2 emissions, and extreme poverty are not carried forward.
wdi_clean <- wdi_raw %>%
  filter(income != "Aggregates") %>%
  select(
    iso2c, country, year,
    school_enrollment = SE.PRM.NENR,
    life_expectancy = SP.DYN.LE00.IN,
    access_to_electricity = EG.ELC.ACCS.ZS,
    gdp_per_cap = NY.GDP.PCAP.KD,
    region, income
  )

# Then we clean the happiness data. In order to get the two datasets to combine,
# they have to have a shared column. The World Bank data has a column called
# "iso2c", which is a standard 2-character code for each country. We can use the
# countrycode() function here to convert country names into 2-character codes.
# We also remove Kosovo because it doesn't have an official ISO code. Any other
# name that countrycode() cannot match comes back as NA; we drop those rows too,
# because dplyr joins treat NA keys as equal to each other by default and would
# otherwise attach them to any World Bank row that is missing its code. Finally
# we rename some columns to make them easier to type out
happiness_clean <- happiness_raw %>%
  filter(Country != "Kosovo") %>%
  mutate(iso2c = countrycode(Country, "country.name", "iso2c")) %>%
  filter(!is.na(iso2c)) %>%
  select(
    iso2c,
    happiness_score = `Happiness Score`,
    happiness_se = `Standard Error`
  )

# Finally we join the two datasets based on the shared ISO column. This is a
# right join, so every World Bank country is kept, with NA happiness values for
# countries that are missing from the happiness data.
all_data <- happiness_clean %>%
  right_join(wdi_clean, by = "iso2c")


# Save data ---------------------------------------------------------------

# Then we save this data frame as a CSV file and we're done!
write_csv(all_data, "data/world_happiness.csv")
# write_csv(all_data, "~/Desktop/world_happiness.csv")  # On my desktop