Introduction to Data Science with R 102-104

# Instructor
# Mc Namara Chiwaye
# Data Scientist
# Install packages

# install.packages(“readr”)
# install.packages(c(“here”, “dplyr”, “ggplot2”))
# install.packages(c(“here”, “dplyr”, “ggplot2”, “readr”), dependencies = “Depends”)

# install.packages(“Rcpp”)
# install.packages(c(“readr”, “dplyr”))

# Load our packages —-

# Install packages —-

# install.packages(“readr”)
# install.packages(c(“here”, “dplyr”, “ggplot2”))
# install.packages(c(“here”, “dplyr”, “ggplot2”, “readr”), dependencies = “Depends”)
library(here)
library(readr) # for importing data
library(dplyr) # for manipulating the data
library(ggplot2) # for data visualization

# Import data and store it inside variables —-

# install.packages(“Rcpp”)
# install.packages(c(“readr”, “dplyr”))

# Load our packages —-

library(here)
library(readr) # for importing data
library(dplyr) # for manipulating the data
library(ggplot2) # for data visualization

# Import data and store it inside variables —-

flights <- read_csv(“data/raw/flights.csv”)
airports <- read_csv(“data/raw/airports.csv”)
airlines <- read_csv(“data/raw/airlines.csv”)
planes <- read_csv(“data/raw/planes.csv”)
weather <- read_csv(“data/raw/weather.csv”) # dbl – double – numeric data # dttm – date-time – date and time data # chr – character – text data # Basic data exploration —- nrow(airports) # Number of rows ncol(flights) # Number of columns dim(planes) # Number of dimensions glimpse(weather) # Gives you a detailed overview of the dataset (inside the dplyr package) summary(flights) # Distribution (summary statistics) of the columns in the data View(flights) # Data manipulation with the dplyr package —- # > filter: for filtering the data —-

# Flights to Dallas
filter(flights, dest == “DFW”)

# Flights that lasted more than 1 hour and 30 mins
filter(flights, air_time > 90)

# Flights that lasted less than 45 minutes
filter(flights, air_time < 45)

# Planes with less than or equal to 10 seats
filter(planes, seats <= 10) # Planes with more than 100 seats built by PIPER filter(planes, seats > 100, manufacturer == “PIPER”)
filter(planes, manufacturer == “PIPER”)

# Planes built after 2000 by BOEING with more than 150 seats
filter(planes, year >= 2000, manufacturer == “BOEING”, seats > 150)

# All summer flights (between June and August) that departed from JFK and that lasted at least 3 hours.
filter(flights, month >= 6, month <= 8, origin == “JFK”, air_time >= 180)

# All flights except the flights that happened in the month of January
filter(flights, month != 1)
View(filter(flights, month != 1))
flights_no_jan <- filter(flights, month != 1) View(flights_no_jan) # All flights operated by UA or AA filter(flights, carrier == “UA”, carrier == “AA”) # This is wrong! No flight can be operated by two carriers at the same time # We need to use the OR operator: | or the IN operator: %in% filter(flights, carrier == “UA” | carrier == “AA”) filter(flights, carrier %in% c(“UA”, “AA”)) # All summer flights (between June and August) that departed from JFK and that lasted at least 3 hours (with the %in% operator). filter(flights, month %in% c(6, 7, 8), origin == “JFK”, air_time >= 180)

# Weather conditions around Newark or LaGuardia in the winter months (december to february)
# when the temperature is below 40 degree and the wind speed is at least 15 miles per hour
filter(weather, origin %in% c(“EWR”, “LGA”), month %in% c(1, 2, 12), temp < 40, wind_speed >= 15)
filter(weather, origin == “EWR” | origin == “LGA”, month == 1 | month == 2 | month == 12, temp < 40, wind_speed >= 15)

# The AND operator is: &

# All airports with an altitude NOT greater 100
filter(airports, !alt > 100)

# All flights that did not depart from either JFK or EWR
View(filter(flights, !origin %in% c(“JFK”, “EWR”)))

# Calculate mean/average temperature in the month of January (Question from Kinley the chat)
weather %>%
filter(month == 1) %>%
summarize(mean_temp = mean(temp))

# > arrange: for rearranging the data —-

# Reorder the planes dataset from smallest to highest number of seats
planes_seats <- arrange(planes, seats) View(planes_seats) # Reorder the planes dataset from highest to smallest number of seats arrange(planes, -seats) arrange(planes, desc(seats)) # Reorder the planes dataset from oldest to newest plane arrange(planes, year) # Reorder the planes dataset from newest to oldest plane arrange(planes, -year) arrange(planes, desc(year)) # Reorder the airports dataset by altitude in ascending order arrange(airports, alt) # Reorder the airports dataset by altitude in descending order arrange(airports, -alt) arrange(airports, desc(alt)) # Reorder the airports dataset by airport in reverse alphabetical order arrange(airports, desc(name)) # Reorder the airlines dataset by carrier code in alphabetical order arrange(airlines, carrier) # Reorder the airlines dataset by carrier code in reverse alphabetical order arrange(airlines, desc(carrier)) # Reorder the flights dataset by scheduled departure time in chronological order arrange(flights, sched_dep_time) # Reorder the flights dataset by scheduled departure time in reverse chronological order arrange(flights, desc(sched_dep_time)) # Reorder the planes by year and seats arrange(planes, year, seats) arrange(planes, seats, year) # Reorder the flights dataset by carrier and scheduled arrival time in reverse chronological order arrange(flights, carrier, desc(sched_arr_time)) # > filter + arrange —-

# Which flights operated by United Airlines (UA) in the month of December arrived in Dallas the earliest?
ua_dfw_12 <- filter(flights, carrier == “UA”, month == 12, dest == “DFW”)
arrange(ua_dfw_12, arr_time)

arrange(filter(flights, carrier == “UA”, month == 12, dest == “DFW”), arr_time)

# Which carrier between AA and DL had the slowest flight departing from JFK on March 31?
ua_aa_jfk <- filter(flights, carrier %in% c(“AA”, “UA”), origin == “JFK”, month == 3, day == 31)
arrange(ua_aa_jfk, desc(air_time))

# What is the highest wind speed around LaGuardia (LGA) in the first week of September when the temperature is above 75 degrees?
lga_9 <- filter(weather, origin == “LGA”, month == 9, day <= 7, temp > 75)
arrange(lga_9, desc(wind_speed))

# What is the highest wind speed in the weather dataset
arrange(weather, desc(wind_speed))

# > mutate: for creating new data —-

# Convert air_time in the flights dataset to hours
mutate(flights, air_time = air_time / 60)
mutate(flights, air_time_hours = air_time / 60)

# Convert distance to kilometers
mutate(flights, distance = distance * 1.609344)
mutate(flights, distance_km = distance * 1.609344)

# Compute departure delay
flights2 <- mutate(flights, dep_delay = as.numeric(dep_time – sched_dep_time) / 60)
View(flights2)

# Compute arrival delay
flights3 <- mutate(flights, arr_delay = as.numeric(arr_time – sched_arr_time) / 60)

# Compute several delay metrics
flights4 <- mutate(flights, dep_delay = as.numeric(dep_time – sched_dep_time) / 60, arr_delay = as.numeric(arr_time – sched_arr_time) / 60, sched_air_time = as.numeric(sched_arr_time – sched_dep_time) / 60, air_time_made_up = sched_air_time – air_time) flights4[, “air_time_made_up”] # > filter + arrange + mutate —-

# Which flight to LAX operated by American Airlines had the largest departure delay in the month of May?
aa_lax <- filter(flights, carrier == “AA”, dest == “LAX”, month == 5)
aa_lax2 <- mutate(aa_lax, dep_delay = as.numeric(dep_time – sched_dep_time) / 60)
arrange(aa_lax2, desc(dep_delay))

# Which flight departing from LGA and operated by UA in the first half of the year was the shortest in hours?
ua_lga <- filter(flights, origin == “LGA”, carrier == “UA”, month <= 6)
ua_lga2 <- mutate(ua_lga, air_time = air_time / 60)
arrange(ua_lga2, air_time)

# Which flight departing from Newark had the largest total delay? (total delay = departure delay + arrival delay)
ewr_data <- filter(flights, origin == “EWR”)
ewr_data2 <- mutate(ewr_data,
dep_delay = as.numeric(dep_time – sched_dep_time) / 60,
arr_delay = as.numeric(arr_time – sched_arr_time) / 60,
total_delay = dep_delay + arr_delay)
ewr_data3 <- arrange(ewr_data2, desc(total_delay)) ewr_data3[, “total_delay”] # ~ The pipe operator: %>% —-

# Which flight to LAX operated by American Airlines had the largest departure delay in the month of May?
aa_lax_pipe <- flights %>%
filter(carrier == “AA”, dest == “LAX”, month == 5) %>%
mutate(dep_delay = as.numeric(dep_time – sched_dep_time) / 60) %>%
arrange(desc(dep_delay))

# Which flight departing from LGA and operated by UA in the first half of the year was the shortest in hours?
ua_lga <- filter(flights, origin == “LGA”, carrier == “UA”, month <= 6)
ua_lga2 <- mutate(ua_lga, air_time = air_time / 60)
arrange(ua_lga2, air_time)

ua_lga_pipe <- flights %>%
filter(origin == “LGA”, carrier == “UA”, month <= 6) %>%
mutate(air_time = air_time / 60) %>%
arrange(air_time)

# Which flight departing from Newark had the largest total delay? (total delay = departure delay + arrival delay)
ewr_data <- filter(flights, origin == “EWR”)
ewr_data2 <- mutate(ewr_data,
dep_delay = as.numeric(dep_time – sched_dep_time) / 60,
arr_delay = as.numeric(arr_time – sched_arr_time) / 60,
total_delay = dep_delay + arr_delay)
ewr_data3 <- arrange(ewr_data2, desc(total_delay)) flights %>%
filter(origin == “EWR”) %>%
mutate(dep_delay = as.numeric(dep_time – sched_dep_time) / 60,
arr_delay = as.numeric(arr_time – sched_arr_time) / 60,
total_delay = dep_delay + arr_delay) %>%
arrange(desc(total_delay))

# All planes that were manufactured before 1990
filter(planes, year < 1990) planes %>%
filter(year < 1990) # ~ A few shortcuts in RStudio # Ctrl/Cmd + Enter: Run code and move pointer to the next line # Alt + Enter: Run code and leave pointer on the same line # Ctrl/Cmd + L: Clear console # Ctrl/Cmd + Shift + C: Comment/uncomment text # Ctrl/Cmd + I: Format code # Ctrl/Cmd + Shift + R: New code section # Ctrl + 1: Move to script # Ctrl + 2: Move to console # Ctrl/Cmd + Shift + M: pipe operator (%>%)

# > summarize: for summarizing data —-

# Compute the total number of seats of all planes
planes %>%
summarize(total_seats = sum(seats))

# Average temperature in NYC (all 3 weather stations)
weather %>%
summarize(avg_temp = mean(temp, na.rm = TRUE))

# Highest humidity level in NYC (all 3 weather stations)
weather %>%
summarize(max_humid = max(humid, na.rm = TRUE))

# Lowest and highest airports and the height between them
airports %>%
summarize(max_alt = max(alt, na.rm = TRUE),
min_alt = min(alt, na.rm = TRUE)) %>%
mutate(height = max_alt – min_alt,
height_m = height * 0.3048)

# Useful summary functions: sum(), prod(), min(), max(), mean(), median(), var(), sd()

# Lesson 2 —————————————————————-

# > ungroup(): for ungrouping variables (opposite of group_by())—-

# Compute the flight count and percentage of each carrier out of each airport

carrier_origin <- flights %>%
group_by(carrier, origin) %>%
summarize(n_flights = n()) %>%
mutate(pct_flights = 100 * n_flights / sum(n_flights))

carrier_origin %>% # already grouped by carrier
summarize(total_pct = sum(pct_flights))

carrier_origin %>%
arrange(carrier, desc(n_flights))

# Compute the flight and percentage of all flights out of each airport operated by each carrier

flights %>%
group_by(origin, carrier) %>%
summarize(n_flights = n()) %>%
mutate(pct_flights = 100 * n_flights / sum(n_flights))

# Compute count and overall percentages of flights operated by each carrier out of each airport

flights %>%
group_by(carrier, origin) %>%
summarize(n_flights = n()) %>%
ungroup() %>%
mutate(pct_flights = 100 * n_flights / sum(n_flights)) %>%
arrange(desc(pct_flights))

# Compute the flight and overall percentage of all flights out of each airport operated by each carrier

flights %>%
group_by(origin, carrier) %>%
summarize(n_flights = n()) %>%
ungroup() %>%
mutate(pct_flights = 100 * n_flights / sum(n_flights)) %>%
arrange(desc(pct_flights))

# > select(): for selecting specific columns —-

# Select specific columns
flights %>%
select(year, month, day)

# Specify columns we do not want
flights %>%
select(-arr_time, -carrier)

# Select a sequence of columns
flights %>%
select(dep_time:origin)

# Discard a sequence of columns
flights %>%
select(-(dep_time:origin))

# Select columns that contain a specific string of characters
flights %>%
select(contains(“time”))

flights %>%
select(contains(“time”), -air_time)

flights %>%
select(contains(“time”), dest:distance)

flights %>%
select(matches(“time”))

# Select columns that start/end with a specific string of characters
flights %>%
select(starts_with(“dep”))

flights %>%
select(starts_with(“arr”))

flights %>%
select(ends_with(“time”))

flights %>%
select(ends_with(“e”))

# The everything() function
flights %>%
select(origin, air_time, distance, everything())

flights %>%
select(origin, distance, everything(), -contains(“time”))

# Dont do this
flights %>%
select(origin, distance, -contains(“time”), everything())
# > case_when():

# Split flight distance into 4 categories
flights %>%
select(distance) %>%
mutate(distance_label = case_when(
distance <= 500 ~ “close”, distance > 500 & distance <= 2000 ~ “fairly close”, distance > 2000 & distance <= 5000 ~ “far”, distance > 5000 ~ “very far”)) %>%
count(distance_label)

# Split temperature into 4 categories
weather %>%
select(temp) %>%
mutate(cold_level = case_when(
temp < 30 ~ “very cold”, temp >= 30 & temp < 50 ~ “cold”, temp >= 50 & temp < 65 ~ “bearable”, TRUE ~ “Others” )) %>%
group_by(cold_level) %>%
summarize(n = n())

# Split temperature into 2 categories (we can use the ifelse function also)
weather %>%
select(temp) %>%
mutate(cold_level = case_when(
temp >= 70 & temp <= 85 ~ “life is good”, TRUE ~ “life’s not so good” )) weather %>%
select(temp) %>%
mutate(cold_level = ifelse(temp >= 70 & temp <= 85, “life is good”, “life’s not so good”)) # The conditions here are based on 2 variables weather %>%
select(temp, wind_speed) %>%
mutate(
cold_status = ifelse(temp < 50, “cold”, “not cold”), wind_status = ifelse(wind_speed > 16, “windy”, “not windy”),
state_of_life = case_when(
cold_status == “not cold” & wind_status == “not windy” ~ “life’s good”,
cold_status == “cold” & wind_status == “windy” ~ “life’s not good”,
TRUE ~ “it’s okay”
)
)

# > Practicing with joins

left_join(flights, planes, by = “tailnum”) %>%
summarise(total_passengers = sum(seats, na.rm = TRUE))

# Which carrier transported the most passengers in December?

flights %>%
select(month, carrier, tailnum) %>%
filter(month == 12) %>%
left_join(planes, by = “tailnum”) %>%
count(carrier, wt = seats, sort = TRUE, name = “n_passengers”)

# Which planes were not used to fly people out of NYC?

anti_join(planes, flights, by = “tailnum”)

# Were there illegal flights out of NYC?
anti_join(flights, planes, by = “tailnum”) %>%
select(carrier, tailnum) %>%
distinct(carrier, tailnum) %>%
count(carrier, sort = TRUE)

# Certification Course DataCamp ——————————————-

#DATA MANIPULATION WITH DPLYR
#Import dataset
#Counties dataset
library(dplyr)

counties <- readRDS(“C:/Users/User/Desktop/INTERMIDIATE R/Data Manipulation with Dplyr/counties.rds”) # Inspect dim(counties) nrow(counties) ncol(counties) glimpse(counties) #Select four columns namely state, county, population, unemployment # Some important shortcut # run highlight the code or ctrl + Enter # %>% ctrl + shift + m
# hash skip line
# assignment <- alt + – #select, arrange, summarize, groupby, mutate select(counties, state, county, population, unemployment) # Alternatively counties %>%
select(state, county, population, unemployment)

counties %>% select(population, metro, everything())# select helper function
counties %>% select(contains(‘pop’))
counties %>% select(starts_with(‘un’))
counties %>% select(ends_with(‘ent’))

#Creating a new table
counties_selected <- counties %>%
select(state, county, population, unemployment)

counties_selected

View(counties_selected)

#Arrange
counties_selected %>%
arrange(population)

#Arrange: descending
counties_selected %>%
arrange(desc(population)) # desc highest value

#Filter
filter(counties, state ==’Texas’)

# Filter and arrange
counties_selected %>%
arrange(desc(population)) %>%
filter(state == “New York”)

counties_selected %>%
arrange(desc(population)) %>%
filter(unemployment < 6) #Combining conditions counties_selected %>%
arrange(desc(population)) %>%
filter(state == “New York”,
unemployment < 6)

#Mutate
counties_selected <- counties %>%
select(state, county, population, unemployment)
counties_selected

#Total number of unemployed people
population * unemployment / 100

#Mutate
counties_selected %>%
mutate(unemployed_population = population * unemployment / 100)

counties_selected %>%
mutate(unemployed_population = population * unemployment / 100) %>%
arrange(desc(unemployed_population))

#Aggregate within groups
counties %>%
group_by(state) %>%
summarize(total_pop = sum(population),
average_unemployment = sum(unemployment))

#Arrange
counties %>%
group_by(state) %>%
summarize(total_pop = sum(population),
average_unemployment = mean(unemployment)) %>%
arrange(desc(average_unemployment))

#Metro column
counties %>%
select(state, metro, county, population)

#Group by
counties %>%
group_by(state, metro) %>%
summarize(total_pop = sum(population))

#Ungroup
counties %>%
group_by(state, metro) %>%
summarize(total_pop = sum(population)) %>%
ungroup()

# The top_n verb
# top_n
counties_selected <- counties %>%
select(state, county, population, unemployment, income)

# For each state what the county with the highest population
counties_selected %>%
group_by(state) %>%
top_n(n =1,wt = population)

counties_selected %>%
group_by(state) %>%
top_n(2, population)
# Highest unemployment
counties_selected %>%
group_by(state) %>%
top_n(n =1, wt = unemployment)

# Number of observations
counties_selected %>%
group_by(state) %>%
top_n(n = 3, wt =unemployment)

counties_selected %>%
group_by(state) %>%
top_n(n = 3, wt =income)

# CHAPTER 3
# Select
counties %>%
select(state, county, population, unemployment)

# Select a range
counties %>%
select(state, county, drive:work_at_home)

# Select and arrange
counties %>%
select(state, county, drive:work_at_home) %>%
arrange(drive)

# Contains
counties %>%
select(state, county, contains(“work”))

# Starts with
counties %>%
select(state, county, starts_with(“income”))

# Other helpers
# contains()
# starts_with()
# ends_with()
# last_col()

# For more:
# ?select_helpers

# Removing a variable
counties %>%
select(-census_id)
# A tibble: 3,138 x 39

# The rename verb
#Select columns
counties_selected <- counties %>%
select(state, county, population, unemployment)
counties_selected

# Rename a column
counties_selected %>%
rename(unemployment_rate = unemployment)

#Combine verbs
counties_selected %>%
select(state, county, population, unemployment_rate = unemployment)

#Compare verbs
Select
counties %>%
select(state, county, population, unemployment_rate = unemployment)
#Rename
counties %>%
select(state, county, population, unemployment) %>%
rename(unemployment_rate = unemployment)

# The transmute verb

# Transmute
# Combination: select & mutate
# Returns a subset of columns that are transformed and changed

# Select and calculate

counties %>%
transmute(state, county, fraction_men = men/population)

# The babynames data
# import data
babynames <- readRDS(“C:/Users/User/Desktop/INTERMIDIATE R/Data Manipulation with Dplyr/babynames.rds”) #The babynames data babynames # Frequency of a name babynames %>%
filter(name == “Amy”)

#Amy plot
library(ggplot2)
babynames_filtered <- babynames %>%
filter(name == “Amy”)

ggplot(data, aes()) +
geom_line()

ggplot(data =babynames_multiple, aes(x = year, y = number))+ geom_line()

#Filter for multiple names
babynames_multiple <- babynames %>%
filter(name %in% c(“Amy”, “Christopher”))

#When was each name most common?
babynames %>%
group_by(name) %>%
top_n(1,number)

# Grouped mutates
# Review: group_by() and summarize()
babynames %>%
group_by(year) %>%
summarize(year_total = sum(number))

# Combining group_by() and mutate()
babynames %>%
group_by(year) %>%
mutate(year_total = sum(number))

# ungroup()
babynames %>%
group_by(year) %>%
mutate(year_total = sum(number)) %>%
ungroup()

# Add the fraction column
babynames %>%
group_by(year) %>%
mutate(year_total = sum(number)) %>%
ungroup() %>%
mutate(fraction = number / year_total)

# Window functions

v <- c(1, 3, 6, 14)
v

lag(v)
# Compare consecutive steps
v – lag(v)
library(dplyr)
# Changes in popularity of a name
babynames_fraction <- babynames %>%
group_by(year) %>%
mutate(year_total = sum(number)) %>%
ungroup() %>%
mutate(fraction = number / year_total)

# Matthew
babynames_fraction %>%
filter(name == “Matthew”) %>%
arrange(year)

# Matthew over time
babynames_fraction %>%
filter(name == “Matthew”) %>%
arrange(year) %>%
mutate(difference = fraction – lag(fraction))

# Biggest jump in popularity
babynames_fraction %>%
filter(name == “Matthew”) %>%
arrange(year) %>%
mutate(difference = fraction – lag(fraction)) %>%
arrange(desc(difference))

# Changes within every name
babynames_fraction %>%
arrange(name, year) %>%
mutate(difference = fraction – lag(fraction)) %>%
group_by(name) %>%
arrange(desc(difference))

# Congratulations!
# D ATA M A N I P U L AT I O N W I T H D P LY R
#
# Summary
# select()
# filter()
# mutate()
# arrange()
# count()
# group_by()
# summarize()

Print Friendly, PDF & Email

Leave a Reply

Your email address will not be published. Required fields are marked *