# Session setup: load the tidyverse and the nycflights13 example data.
# The former rm(list = ls()) call was dropped: it only clears objects in the
# global environment (loaded packages and options survive), so it does not
# provide a clean session. Restart R for that instead.
library(tidyverse)

# Use the CRAN mirror over HTTPS rather than plain HTTP.
options(repos = c(CRAN = "https://cran.rstudio.com"))

# Install nycflights13 only when it is missing, instead of downloading and
# reinstalling it every time the script runs.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library(nycflights13)

# Print the flights data set.
nycflights13::flights
# Sort the rows by year, then month, then day.
flights %>% arrange(year, month, day)
# Wrap a column in desc() to sort it from largest to smallest.
flights %>% arrange(desc(arr_delay))
# Open the help page for select().
help("select")
# Keep only the columns named here.
flights %>% select(year, month, day)
# Keep every column from year through day.
flights %>% select(year:day)
# Keep everything except the columns from year through day.
flights %>% select(-(year:day))
# Rename tailnum to tail_num.
flights %>% rename(tail_num = tailnum)
# Add new variables with mutate().
# flights is deliberately left as it is: the former
# `flights <- data.frame(flights)` replaced it with a plain data frame, so
# the unassigned mutate() result below was printed at full length instead of
# as a short preview.
head(flights)
# ends_with("delay") picks every column whose name ends in "delay",
# such as dep_delay and arr_delay.
flightsml <- select(
  flights,
  year:day, ends_with("delay"), distance, air_time
)
head(flightsml)
# delay: arrival delay minus departure delay.
# speed: distance per unit of air_time, scaled by 60 (miles per hour if
# distance is in miles and air_time in minutes -- see ?flights to confirm).
mutate(
  flightsml,
  delay = arr_delay - dep_delay,
  speed = distance / air_time * 60
)
# Open the data set's help page for the column definitions.
?flights
# The pipe chains a series of imperative steps: group, then summarise, then
# filter. A good way to pronounce %>% when reading code is "then".
# This code explores the relationship between distance and average arrival
# delay for each destination.
delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  # Keep destinations with more than 20 flights, and leave out HNL.
  filter(count > 20, dest != "HNL")

# delays is supplied exactly once, as the data argument. The original piped
# it into ggplot() and also passed it positionally, which was redundant.
ggplot(delays, aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)
# It looks like delays increase with distance up to ~750 miles and then decrease.
# na.rm = TRUE
The aggregation functions obey the usual rule of missing values: if there’s any missing value in the input, the output will be a missing value. Fortunately, all aggregation functions have an na.rm argument, which removes the missing values prior to computation.
Blog nhằm nghiên cứu, trao đổi, học tập. Nội dung Blog gồm tập hợp những bài giảng, bài nghiên cứu, hướng dẫn biên soạn từ các giáo trình của các tác giả trong và ngoài nước.
Monday, 18 March 2019
Using dplyr for data visualization (Part 1: FILTER)
### Using package dplyr for data visualization ###
# Session setup: load the nycflights13 example data and the tidyverse.
# The former rm(list = ls()) call was dropped: it only clears objects in the
# global environment (loaded packages and options survive), so it does not
# provide a clean session. Restart R for that instead.

# Use the CRAN mirror over HTTPS rather than plain HTTP.
options(repos = c(CRAN = "https://cran.rstudio.com"))

# Install nycflights13 only when it is missing, instead of downloading and
# reinstalling it every time the script runs.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library(nycflights13)
library(tidyverse)
### Using flights data to practice ###
int stands for integers.
dbl stands for doubles, or real numbers.
chr stands for character vectors, or strings.
dttm stands for date-times (a date + a time).
lgl stands for logical: vectors that contain only TRUE or FALSE.
fctr stands for factors, which R uses to represent categorical variables with fixed possible values.
date stands for dates.
# Open the documentation for the flights data set.
help("flights")
# Print the data set, then only its first ten rows.
nycflights13::flights
flights %>% head(10)
### Using dplyr functions to manipulate the data ###
filter(): Pick observations by their values
arrange(): Reorder the rows
select(): Pick variables by their names
mutate(): Create new variables with functions of existing variables
summarize(): Collapse many values down to a single summary
group_by(): Changes the scope of each function from operating on the entire dataset to operating on it group-by-group
Filter:
# Flights on January 1st, then flights on February 2nd (printed, not saved).
flights %>% filter(month == 1, day == 1)
flights %>% filter(month == 2, day == 2)
# Save the January 1st flights and preview the first ten rows.
jan1 <- flights %>% filter(month == 1, day == 1)
jan1 %>% head(10)
# Save the December 25th flights and preview the first nine rows.
dec25 <- flights %>% filter(month == 12, day == 25)
dec25 %>% head(9)
# Comparison operators -- the standard suite: >, >=, <, <=, != (not equal),
# and == (equal).
# Boolean operators: & is "and", | is "or", and ! is "not".
# Flights that departed in November or December.
filter(flights, month == 11 | month == 12)
# x %in% y selects every row where x is one of the values in y, so this is
# a shorter way to write the same November-or-December filter.
nov_dec <- filter(flights, month %in% c(11, 12))
# Find flights that weren't delayed (on arrival or departure) by more than
# two hours. The comparison is strict (> 120): a flight delayed by exactly
# 120 is not "more than two hours" late, so it must be kept. The original
# `>= 120` dropped those flights.
filter(flights, !(arr_delay > 120 | dep_delay > 120))
# Exclude flights that were delayed by 45 or more on both arrival and
# departure.
filter(flights, !(arr_delay >= 45 & dep_delay >= 45))
# Flights that arrived or departed at least 3 early (a delay of -3 or less).
filter(flights, arr_delay <= -3 | dep_delay <= -3)
Subscribe to:
Comments (Atom)
