# Setup ----
# Loads the tidyverse (dplyr verbs, ggplot2, the %>% pipe) and the
# nycflights13 example data used by the rest of this script.
#
# The workspace is intentionally NOT cleared with rm(list = ls()): that would
# silently delete whatever objects the person running the script already has.
# Start a fresh R session instead if a clean environment is needed.
library(tidyverse)

# Use an HTTPS CRAN mirror so package downloads are not fetched in clear text.
options(repos = c(CRAN = "https://cran.rstudio.com"))

# Install nycflights13 only when it is missing, so re-running the script does
# not hit the network and reinstall the package every time.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library(nycflights13)

# Print a preview of the flights dataset shipped with the package.
nycflights13::flights
# Ordering rows with arrange() ----
# Sort by year, then month, then day (all ascending).
flights %>%
  arrange(year, month, day)

# Wrap a column in desc() to sort it in descending order instead.
flights %>%
  arrange(desc(arr_delay))

# Picking columns with select() ----
help("select")

# Name the columns to keep.
flights %>%
  select(year, month, day)

# A range keeps every column from year through day, inclusive.
flights %>%
  select(year:day)

# Negating the range keeps everything except year through day.
flights %>%
  select(-(year:day))

# Renaming with rename() ----
# The syntax is new_name = old_name; all other columns are left untouched.
flights %>%
  rename(tail_num = tailnum)
# Adding new variables with mutate() ----
# Work with flights exactly as nycflights13 provides it. It is deliberately not
# converted with data.frame(): doing so shadows the package dataset with a
# global copy and makes every unassigned result below print thousands of rows
# instead of a short preview.
head(flights)

# Keep a narrow subset so the newly created columns are easy to spot.
# ends_with("delay") selects every column whose name ends in "delay"
# (dep_delay and arr_delay in this dataset).
flightsml <- select(
  flights,
  year:day,
  ends_with("delay"),
  distance,
  air_time
)
head(flightsml)

# delay: arrival delay minus departure delay.
# speed: distance per unit of air_time, scaled by 60 (miles per hour, assuming
#        distance is in miles and air_time in minutes -- confirm in ?flights).
mutate(
  flightsml,
  delay = arr_delay - dep_delay,
  speed = distance / air_time * 60
)

# Open the dataset documentation for column definitions and units.
?flights
# Chaining steps with the pipe ----
# The pipe turns the analysis into a series of imperative statements: group,
# then summarise, then filter. A good way to read %>% aloud is "then".
#
# Goal: explore the relationship between distance flown and average arrival
# delay for each destination.
delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    # na.rm = TRUE: otherwise a single missing value makes the mean NA.
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  # Drop destinations with 20 or fewer flights, and exclude "HNL".
  filter(count > 20, dest != "HNL")

# Pass the data to ggplot() exactly once. Piping `delays` in AND listing it as
# an argument supplied the data frame twice, leaving a stray extra argument.
ggplot(delays, mapping = aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)
# It looks like delays increase with distance up to ~750 miles and then
# decrease.
# na.rm = TRUE
# The aggregation functions obey the usual rule of missing values: if there is
# any missing value in the input, the output will be a missing value.
# Fortunately, all aggregation functions have an na.rm argument, which removes
# the missing values prior to computation.

# (Blog page footer, not part of the script: "No comments: / Post a Comment")