Basic Data Analytics: 2019

Monday, 16 December 2019

Using Vector AutoRegression Model for Multivariate Time series forecast

rm(list = ls())
#install.packages("vars")
library(dplyr)
library(vars)
library(tidyverse)
library(readxl)
library(urca)
library(car)
#install.packages(foreca)
#install.packages("forecast")
library(forecast)
dev.off()

# A function for black theme:
my_theme <- function(...) {
theme(
axis.line = element_blank(),
axis.text.x = element_text(color = "white", lineheight = 0.9),
axis.text.y = element_text(color = "white", lineheight = 0.9),
axis.ticks = element_line(color = "white", size = 0.2),
axis.title.x = element_text(color = "white", margin = margin(0, 10, 0, 0)),
axis.title.y = element_text(color = "white", angle = 90, margin = margin(0, 10, 0, 0)),
axis.ticks.length = unit(0.3, "lines"),
legend.background = element_rect(color = NA, fill = "gray10"),
legend.key = element_rect(color = "white", fill = "gray10"),
legend.key.size = unit(1.2, "lines"),
legend.key.height = NULL,
legend.key.width = NULL,
legend.text = element_text(color = "white"),
legend.title = element_text(face = "bold", hjust = 0, color = "white"),
legend.text.align = NULL,
legend.title.align = NULL,
legend.direction = "vertical",
legend.box = NULL,
panel.background = element_rect(fill = "gray10", color = NA),
panel.border = element_blank(),
panel.grid.major = element_line(color = "grey35"),
panel.grid.minor = element_line(color = "grey20"),
panel.spacing = unit(0.5, "lines"),
strip.background = element_rect(fill = "grey30", color = "grey10"),
strip.text.x = element_text(color = "white"),
strip.text.y = element_text(color = "white", angle = -90),
plot.background = element_rect(color = "gray10", fill = "gray10"),
plot.title = element_text(color = "white", hjust = 0, lineheight = 1.25,
margin = margin(2, 2, 2, 2)),
plot.subtitle = element_text(color = "white", hjust = 0, margin = margin(2, 2, 2, 2)),
plot.caption = element_text(color = "white", hjust = 0),
plot.margin = unit(rep(1, 4), "lines"))

}

#Loading data

data <- read.csv("D:/Techcombank/data1.csv", sep = ";", header = TRUE)

data.frame(data)

data %>% head()

data <- na.omit(data)

#plot mutivariate data plot
scatterplotMatrix(data[2:4])

dev.off()
plot(data$avg_bal, data$SLGD)

#calculating Correlations for Multivariate DAta

cor.test(data$avg_bal, data$SLGD)
cor.test(data$avg_bal, data$SLKH)
cor.test(data$SLKH, data$SLGD)

#Building a VAR Model
#Vector Autoregression (VAR) is a multivariate forecasting
#algorithm that is used when two or more time series influence each other

#1.Analyze the time series characteristics
#2.Test for causation amongst the time series
#3.Test for stationarity
#4.Transform the series to make it stationary, if needed
#5.Find optimal order (p)
#6.Prepare training and test datasets
#7.Train the model
#8.Roll back the transformations, if any.
#9.Evaluate the model using test set
#10.Forecast to future

#1.Analyze the time series characteristics

data %>% str()

data$MONTHID <- as.character(data$MONTHID)
data$time <- as.Date(data$Monthid,format = "%m/%d/%Y")

data$log.avgbal <- log(data$avg_bal)
data$log.SLGD <- log(data$SLGD)
data$log.SLHD <- log(data$SLHD)
data$log.SLKH <- log(data$SLKH)

data %>% head()

# Visualize time series:
data %>% ggplot(aes(time, avg_bal, series = "red")) +
geom_line() +
my_theme() +
scale_color_manual(values = c("red", "red")) +
labs(x = NULL, y = NULL,
title = "visualization")

#2.Test for causation amongst the time series
# AUTOMATICALLY SEARCH FOR THE MOST SIGNIFICANT RESULT
for (i in 1:4)
{
cat("LAG =", i)
print(causality(VAR(data, p = i, type = "const"), cause = "AVG_BAL")$Granger)
}

#3.Test for stationarity
##Perform Augmented Dicket Fuller

ur.df(data$avg_bal, type = "none") %>%
summary()

ur.df(data$SLGD, type = "none") %>%
summary()

ur.df(data$SLKH, type = "none") %>%
summary()

#test-statistics more than critical values => non-stationary

###difference

ur.df(diff(data$avg_bal, differences = 2), type = "none") %>%
summary()

ur.df(diff(data$SLGD, differences = 2), type = "none") %>%
summary()

ur.df(diff(data$SLHD, differences = 2), type = "none") %>%
summary()

ur.df(diff(data$SLKH, differences = 2), type = "none") %>%
summary()

dif.avgbal <- diff(data$avg_bal, differences = 2)
dif.SLGD <- diff(data$SLGD, differences = 2)
dif.SLHD <- diff(data$SLHD, differences = 2)
dif.SLKH <- diff(data$SLKH, differences = 2)

df2 <- data.frame(dif.avgbal = dif.avgbal, dif.SLGD = dif.SLGD, dif.SLKH = dif.SLKH)

df2 %>% head()

df2 <- ts(df2, frequency = 12, start = c(2017,3))

df2$Monthid <- as.Date(df2$Month, "%d-%m-%y")
str(df2)

df2 %>% str()

df2$Monthid <- as.Date(df2$Month, format = "%m/%d/%Y")

df2 %>% head()

dev.off()
df2 %>% ggplot(aes(Monthid, dif.SLKH)) +
geom_line()

VARselect(df2$dif.avgbal, lag.max = 5, type = "const")
VARselect(df2$dif.SLGD, lag.max = 5, type = "const")
VARselect(df2$dif.SLHD, lag.max = 5, type = "const")
VARselect(df2$dif.SLKH, lag.max = 5, type = "const")

##Plot ACF
acf(df2$dif.avgbal)
acf(df2$dif.SLGD)
acf(df2$dif.SLHD)
acf(df2$dif.SLKH)

#Var estimate
str(df2)

df <- cbind(df2$Monthid, df2$dif.avgbal, df2$dif.SLGD, df2$dif.SLKH)
df$month <- ts(start = c(2017,3), end = c(2019, 11), frequency = 1)

varp <- VAR(df, p = 2, type = "const", season = 4)
varp

lmp <- varp$varresult
summary(lmp)

#install.packages("stargazer")
library(stargazer)
stargazer(lmp$y1, lmp$y2, lmp$y3, type = "text", dep.var.labels.include = FALSE)

#plot residuals and their ACF and PACF
plot(varp$varresult)

#Granger causility
causality(varp, cause = "y1")
causality(varp, cause = "y2")
causality(varp, cause = "y3")
causality(varp, cause = "y4")

varp.f <- predict(varp, n.ahead = 6, ci = 0.95)
data.frame(varp.f)
par(mar=c(1,1,1,1))
plot(varp.f, names = "y1")
fanchart(varp.f, names = "y4")

varp.f %>% head()
#Generate impulse funtion
irf.avgbal <- irf(varp, impulse = "y1", response = "y4",
n.ahead = 12, boot = TRUE)
plot(irf.avgbal, ylab = "avgbal", main = "Shock from SLGD")

head(varp.f)

#undifference a time series

df3 <- diffinv(df2$dif.avgbal, xi = 0.5)
df3 %>% head()

#Backtest result

Result Accuracy: MAPE 0.0025

Monday, 18 March 2019

Using dplyr for data visulization (Part 2: ARRANGE, SELECT, RENAME, MUTATE)

rm(list= ls())
library(tidyverse)
options(repos = c(CRAN = "http://cran.rstudio.com"))
install.packages("nycflights13")
library(nycflights13)
nycflights13::flights
arrange(flights, year, month, day)

#Using desc() to reorder by a column in descending order#
arrange(flights, desc(arr_delay))
?select

#select a column by name#
select(flights, year, month, day)
#select all columns between year and day#
select(flights, year:day)
#or except for year to day#
select(flights, -(year:day))

#rename a variable#
rename(flights,tail_num = tailnum)

#Add new variable by Mutate() function#
flights <- data.frame(flights)
head(flights)
flightsml <- select(flights, year:day, ends_with("delay"), distance, air_time)
head(flightsml)
mutate(flightsml,delay = arr_delay - dep_delay, speed = distance/air_time*60)
?flights
#*ends_with: for the variable which ends by letter "delay", such as dep_delay, arr_delay...*#

#Using pipe function : this is a series of imperative statements: group, then summarize, then filter. As suggested by this reading, a good way to pronounce %>% when reading code is “then.”
#This code is to explore the relationship between the distance and average delay for each location.

delays <- flights %>%
group_by(dest) %>%
summarise(count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")

delays %>% ggplot(delays, mapping = aes(x = dist, y = delay))+
geom_point(aes(size = count), alpha = 1/3)+
geom_smooth(se = FALSE)
#It looks like delays increase with distance up to ~750 miles and then decrease.

# Na.rm = TRUE
The aggregation functions obey the usual rule of missing values: if there’s any missing value in the input, the output will be a missing value. Fortunately, all aggregation functions have an na.rm argument, which removes the missing values prior to computation.

Using dplyr for data visulization (Part1: FILTER)

###Using package dplyr for data visualization###

rm(list=ls())

options(repos = c(CRAN = "http://cran.rstudio.com"))

install.packages("nycflights13")

library(nycflights13)

library(tidyverse)

###Using flights data to practice###

int stands for integers.

dbl stands for doubles, or real numbers.

chr stands for character vectors, or strings.

dttm stands for date-times (a date + a time)

lgl stands for logical, vectors that contain only TRUE or FALSE

fctr stands for factors, which R uses to represent categorical variables with fixed possible values

date stands for dates

?flights

nycflights13::flights

head(flights, n= 10)

### Using dplyr function to manipulate the data ###

Filter (): Pick observations by their values

Arrange (): Reorder the rows

Select (): Pick variables by their names

Mutate(): Create new variables with functions of existing variables

Summarize (): Collapse many values down to a single summary

Group_by():changes the scope of each function from operating on the entire dataset to operating on it group-by-group

Filter:

filter(flights, month == 1, day == 1)

filter(flights, month == 2, day == 2)

jan1 <- filter(flights, month ==1, day ==1)

head(jan1, n =10)

dec25 <- filter(flights, month == 12, day ==25)

head(dec25, n =9)

### Comparision: the standard suite: >, >=, <, <=, != (not equal), and == (equal)###

###Boolean operators: & is “and,” | is “or,” and ! is “not.”

filter(flights, month == 11 | month ==12)

### x %in% y: This will select every row where x is one of the values in y

nov_dec <- filter(flights, month %in% c(11,12))

### find flights that weren’t delayed (on arrival or departure) by more than two hours

filter(flights, !(arr_delay >=120 | dep_delay >=120))

filter(flights, !(arr_delay >= 45 & dep_delay >=45))

filter(flights, (arr_delay <=-3 | dep_delay <=-3 ))

Sunday, 17 March 2019

Some basic code for Stock Analysis + Value at Risk (VaR)

rm(list = ls())

setwd("D:/R studio/R portable/Practice/Stock Analysis")

options(repos = c(CRAN = "http://cran.rstudio.com"))

library(readxl)

vnindex <- read_excel("Historical data.xlsx")

head(vnindex)

#Check the data#

str(vnindex)

class(vnindex)

#Delete volume column#

vnindex$`Vol,` <- NULL

#Check missing data by Amelia package#

library(Amelia)

amelia(vnindex, m = 5)

#Check the normality#

hist(vnindex$Price,probability=T, main="Histogram of normal

data",xlab="Approximately normally distributed data")

+ line(density(vnindex$Price))

#Change the variable names#

names(vnindex)[names(vnindex)=='ï..NgÃ.y'] <- 'Date'

names(vnindex)[names(vnindex) == 'Láº.n.cuá..i'] <- 'Price'

names(vnindex)[names(vnindex) == 'Má.Ÿ'] <- 'Open'

names(vnindex)[names(vnindex) == 'Cao'] <- 'High'

names(vnindex)[names(vnindex) == 'Tháº.p'] <- 'Low'

names(vnindex)[names(vnindex)== 'KL'] <- 'Volume'

names(vnindex)[names(vnindex) == 'X..Thay.Ä.á..i'] <- 'Change'

names(vnindex)[names(vnindex) == 'Volumn'] <- 'Volume'

plot(vnindex)

library(ggplot2)

str(vnindex)

class(vnindex$Date)

class(vnindex)

library(lubridate)

vnindex$Date <- as.Date(vnindex$Date, format= "%d/%m/%Y")

vnindex$Date

class(vnindex$Date)

head(vnindex)

#Draw line graph through plot function#

plot(vnindex$Date, vnindex$Price, type="l", xlab="Date",

ylab="Price" )

#Draw line graph by ggplot package align with pipe function %>%#

library(tidyverse)

vnindex %>% ggplot(aes(x = vnindex$Date, y = vnindex$Price)) +geom_line()

*Vẽ bằng ggplot cho kết quả đẹp hơn*

#Tach cot du lieu#

attach(vnindex)

#Tách cột giá đóng cửa#

closeprice <- rev(vnindex$Price)

head(closeprice)

#Tính lợi suất của chuỗi dữ liệu vnindex#

#Công thức lợi suất P(vnindex) = (p(n) - p(n-1))/p(n-1) Hoặc ln (P(n)/P(n-1))#

n <- length(closeprice)

#Cach 1:

vnreturn <- (closeprice[2:n]-closeprice[1:(n-1)])/closeprice[1:(n-1)]

#Cach 2:

lnvnreturn <- log((closeprice[2:n])/(closeprice[1:(n-1)])

#Load dữ liệu chứng khoán của thị trường thế giới#

install.packages("quantmod")

library(quantmod)

# Đọc dữ liệu DowJones từ nguồn finance.yahoo.com#

getSymbols('DJI', src = 'yahoo', from = "2018-01-01", to = "2019-03-01")

DJI

head(DJI, n = 10)

str(DJI)

class(DJI)

#Chuyển dạng data frame cho DownJone#

DJI <- data.frame(DJI)

#Vẽ đồ thị stock trong gói quantmod#

chartSeries(DJI, theme = chartTheme("white"), subset = 'last 5 months')

#Kiểm tra phân phối dữ liệu lợi suất vnindex#

hist(vnreturn, breaks = 12, col = "blue")

d <- density(vnreturn)

plot(d)

#Đo độ biến động volatility của một số cổ phiếu (trường hợp vnindex)#

Volatility là một phương pháp thống kê đo độ phân tán của các khoản thu hồi được của các Chứng khoán hoặc chỉ số thị trường. Volatility có thể được đo bằng cách sử dụng độ lệch chuẩn giữa các khoản thu hồi từ các loại chứng khoán tương tự hay chỉ số thị trường. Thông thường, nếu Chứng khoán có volatility (độ biến động) cao, thì sẽ bị rủi ro nhiều hơn. (1) Volatility là một biến số trong công thức định giá giá trị Hợp đồng quyền chọn chỉ ra phạm vi mà tài sản underlying (xem UNDERLYING) sẽ giao động trong khoảng thời gian giữa hiện tại và lúc đáo hạn hợp đồng quyền chọn. Volatility, được miêu tả là một hệ số phần trăm trong các công thức định giá giá trị hợp đồng quyền chọn (option pricing formulas), sinh ra từ các hoạt động giao dịch hàng ngày. Cách mà volatility được đo sẽ ảnh hưởng đến giá trị của hệ số được sử dụng. (2) Theo cách diễn đạt khác, volatility đặc trưng cho độ bất ổn hoặc là mức rủi ro trong giao động của giá trị Chứng khoán. Độ bất ổn càng cao đồng nghĩa với giá trị của chứng khoáncàng có nguy cơ bị giãn ra theo nhiều bậc giá trị. Điều này có nghĩa là giá cả của chứng khoán có thể bị thay đổi đột ngột chỉ trong một khoảng thời gian ngắn theo cả hai hướng (tăng đột ngột hoặc giảm đột ngột). Và ngược lại, độ Bất ổn thấp có nghĩa là giá trị của Chứng khoán không bị biến động đột ngột, mà chỉ thay đổi từ từ chắc chắn sau một quá trình. Một phép đo độ bất ổn tương đối của một loại Cổ phần cụ thể đối với thị trường là chỉ số bêta của nó. Chỉ số beta xấp xỉ độ Bất ổn chung của các khoản thu hồi của chứng khoán so với thu hồi của thị trường.

Ví dụ: Một Cổ phần có chỉ số beta giá trị là 1,1 có nghĩa là Chứng khoán sẽ thu hồi lại được 110% so với tổng các khoản thị trường thu hồi được sau một khoảng thời gian xác định. Ngược lại, chỉ số beta là 0,9 sẽ cho khoản thu hồi là 90% của tổng số thu hồi của thị trường.

n <- length(vnindex$Price)

volatility<-sqrt(252)*sqrt(var(log(vnindex$Price[2:n]/vnindex$Price[1:n-1])))

volatility

[1] 0.2141678

Độ biến động của VN-index trong năm qua không cao. (Theo quy ước độ biến đông nhỏ hơn 20% có thể xem là biến động ở mức thấp, nến lớn hơn 40% có thể xem là mức cao). Volatility của VN-index trong khoảng thời gian qua ở mức thấp, nó phản ánh đúng tình trạng ảm đạm của thị trường chứng khoán Việt Nam – hàn thử biểu của nền kinh tế đất nước.

#Value at Risk VaR#

Value at Risk (giá trị chịu rủi ro) của một tài sản (hoặc danh mục, chỉ số) là một con số, ký hiệu là VaR(τ,α), cho biết với độ tin cậy (1- α)100%, sau một chu kỳ thời gian τ thì lợi suất của tài sản sụt giảm không quá VaR(τ,α)100%.

Chu kỳ τ có thể là ngày, tháng, năm nhưng với các chu kỳ thời gian khác đều có thể quy đổi về chu kỳ ngày, α thường chọn là 0.01 hoặc 0.05; VaR cũng được tính cho chuỗi giá, tuy nhiên từ VaR của chuỗi lợi suất ta cũng tính được VaR của chuỗi giá. Có nhiều phương pháp để tính VaR như phương pháp lịch sử, phương pháp tham số, phương pháp mô phỏng,… sự khác nhau nằm ở các phương pháp để các định phân phối của chuỗi lợi suất.

Tính VaR(chu kỳ ngày, α = 0.05)

##Ta đã có chuỗi lợi suất của VN index vnreturn

> library(PerformanceAnalytics) ##Gọi package ra để sử dụng

##VaR lịch sử

> VaR(vnreturn, p=.95, method="historical")

VaR -0.02127722

##VaR tham số (giả thiết phân phối chuẩn)

> VaR(vnreturn, p=.95, method="gaussian")

VaR -0.02211347

##VaR Cornish-Fishe hiệu chỉnh (phương pháp được đánh giá là khắc phục được nhược điểm giả thiết “chuẩn” của chuỗi lợi suất)

> VaR(vnreturn, p=.95, method="modified")

VaR -0.01856129

#VaR của một số cổ phiếu: Ta dùng phương pháp VaR lịch sử để tính VaR theo ngày của một số cổ phiếu phiếu HAG, STB, VT1#

#Hệ số Beta#

Hệ số beta là đại lượng đo tốc độ thay đổi tương đối của một cổ phiếu (hoặc danh mục) với thị trường. Phần lớn hệ số beta là số dương – các cổ phiếu thường biến động cùng chiều thị trường – hệ số beta càng lớn thì tốc độ tăng/ giảm cùng chiều với thị trường càng lớn; khi hệ số beta âm, nó thể hiện cổ phiếu biến động ngược thị trường.

Sau đây, ta sẽ tính chỉ số beta của một số cổ phiếu trên sàn HOSE với chỉ số đại diện thị trường VN index, chuỗi dữ liệu 100 ngày giao dịch. Tính hệ số beta chuỗi lợi suất vnreturn và cổ phiếu HAG gồm 100 ngàygiao dịch:

beta<-cov(hagreturn,vnreturn)/var(vnreturn)

beta

Thursday, 24 January 2019

Machine Learning - Linear Regression

###################################################################

################Machine Learning - Linear Regression#####################

###################################################################

options(repos = c(CRAN = "http://cran.rstudio.com"))

library(readxl)

data <- read_excel("D:\\R studio\\R portable\\Practice\\ML - linear regression\\Bai-tap-thuc-hanh.xlsx")

head(data)

names(data)[names(data)=='Chiều cao (cm)'] <- 'Height'

names(data)[names(data)=='Cân nặng (kg)'] <- 'Weight'

head(data, n=6)

#we need to split the data into a training set and a testing set.

#As their names imply, the training set is used to train and build the model,

#and then this model is tested on the testing set.

#Let’s say we want to have about 75% of the data in the training set and 25% of the data in the testing set

#First, we set the seed so that the results are reproducible

set.seed(123)

#we create a sequence whose length is equal to the number of rows of the dataset. These numbers act as the indices of the dataset. We randomly select 75% of the numbers in the sequence and store it in the variable split

split <- sample(seq_len(nrow(data)), size = floor(0.75 * nrow(data)))

#we copy all the rows which correspond to the indices in split into trainData and all the remaining rows into testData

train.data <- data[split,]

head(train.data)

View(train.data)

test.data <- data[-split,]

head(test.data)

#Building the prediction model

predictionmodel <- lm(Weight~Height, data = train.data )

The above function will try to predict Weight based on the variable Height. Since we are using all the variables present in the dataset, a shorter way to write the above command is: (this is very helpful when are are a large number of variables.)

predictionModel <- lm(Weight ~ ., data = trainData)

summary(predictionmodel)

This will help us determine which variables to include in the model. A linear regression can be represented by the equation: y_i = β_1 x_i1 + β_2 x_i2 + β_3 x_i3 + ⋯ + ε_i where y_i represents the outcome we’re prediction (PE), x_irepresent the various attributes (AT, V, AP, and RH), β represent their coefficients, and ε represents the constant term.

The first column in the summary, namely Estimate gives us these values. The first value corresponds to ε, and the rest of the values correspond to the various βvalues. If the coefficient for a particular attribute is 0 or close to 0, that means it has very little to no effect on the prediction, and hence, can be removed. The standard error column gives an estimation of how much the coefficients may vary from the estimate values. The t value is calculated by dividing the estimate by the standard error column. The last column gives a measure of how likely it is that the coefficient is 0 and is inversely proportional to the t value column. Hence, an attribute with a high absolute value of t, or a very low absolute value of Pr(>|t|) is desirable.

The easiest way to determine which variables are significant is by looking at the stars next to them. The scheme is explained at the bottom of the table. Variables with three stars are most significant, followed by two stars, and finally one. Variables with a period next to them may or may not be significant and are generally not included in prediction models, and variables with nothing next to them are not significant.

In our model, we can see that all our variables are highly significant, so we will leave our prediction model as it is. In case you are dealing with a dataset in which there are one or more variables which are non-significant, it is advisable to test the model by removing one variable at a time. This is because when two variables are highly correlated with each other, they may become non-significant to the model, but when one of them is removed, they could become significant. This is due to multicollinearity. You can find out more about multicollinearity here.

The easiest way to check the accuracy of a model is by looking at the R-squared value. The summary provides two R-squared values, namely Multiple R-squared, and Adjusted R-squared. The Multiple R-squared is calculated as follows:

Multiple R-squared = 1 – SSE/SST where:

SSE is the sum of square of residuals. Residual is the difference between the predicted value and the actual value, and can be accessed by predictionModel$residuals.
SST is the total sum of squares. It is calculated by summing the squares of difference between the actual value and the mean value.

For example, lets say that we have 5, 6, 7, and 8, and a model predicts the outcomes as 4.5, 6.3, 7.2, and 7.9. Then, SSE can be calculated as: SSE = (5 – 4.5) ^ 2 + (6 – 6.3) ^ 2 + (7 – 7.2) ^ 2 + (8 – 7.9) ^ 2; and SST can be calculated as: mean = (5 + 6 + 7 + 8) / 4 = 6.5; SST = (5 – 6.5) ^ 2 + (6 – 6.5) ^ 2 + (7 – 6.5) ^ 2 + (8 – 6.5) ^ 2

The Adjusted R-squared value is similar to the Multiple R-squared value, but it accounts for the number of variables. This means that the Multiple R-squared will always increase when a new variable is added to the prediction model, but if the variable is a non-significant one, the Adjusted R-squared value will decrease. For more info, refer here.

An R-squared value of 1 means that it is a perfect prediction model, and an R-squared value of 0 means that it is of no improvement over the baseline model (the baseline model just predicts the output to always be equal to the mean). From the summary, we can see that our R-squared value is 0.9284, which is very high.

#Testing the prediction model

prediction <- predict(predictionmodel, newdata = test.data)

head(prediction)

head(test.data$Weight)

#calculate the value of R-squared for the prediction model on the test data

SSE <- sum((test.data$Weight - prediction)^2)

SST <- sum((test.data$Weight- mean(test.data$Weight))^2)

1-SSE/SST

Tuesday, 15 January 2019

[R] Toán tử pipe %>%

Vài năm trở lại đây, với sự phổ biến của package dplyr, toán tử %>% trở nên quen thuộc với phần lớn những người sử dụng R. Tuy nhiên, %>% không chỉ gói gọn trong package dplyr, nó đã được sử dụng bên trong rất nhiều các packages khác, chủ yếu liên quan đến việc data visualization.
Trong lập trình, pipe là 1 khái niệm thường dùng để mô tả việc sử dụng đầu ra của một hàm xử lý này để sử dụng như đầu vào của một hàm xử lý khác (chain). Khi khối xử lý kéo dài thành một chuối liên tiếp nhau thì còn được gọi là chainning và nó được gợi nhớ đến việc ghép nối giữa các ống nước trong cuộc sống hàng ngày (pipe)?

Ví dụ:
Trong R, khi ta muốn thể hiện: plot(exp(cumsum(returns)))
có thể viết thành: returns |> cumsum |> exp |> plot
Hoặc một cách "khoa học" hơn: %|>%` = function(x, y) y(x)
-> Console: 1:10 %|>% cumsum %|>% plot

Sự xuất hiện của dplyr

dplyr được thiết kế để xử lý các tác vụ xử lý với data frame, 5 chức năng chính bao gồm: select, filter, mutate, summarise và arrange. dplyr trình làng toán tử đầu tiên %.% nhằm thay thế cho hàm chain (trong package ptools của Peter Melstrup). Quá trình thay đổi được thể hiện như sau:

+ Cách viết truyền thống:

library(hflights)
library(dplyr)
filter(
   summarise(
     select(
       group_by(hflights, Year, Month, DayofMonth), 
       Year:DayofMonth, ArrDelay, DepDelay
     ), 
     arr = mean(ArrDelay, na.rm = TRUE),  
     dep = mean(DepDelay, na.rm = TRUE)
   ), 
   arr > 30 | dep > 30
 )

+ Cách sử dụng hàm Chain (dễ nhìn và trực quan hơn):

chain(
   hflights,
   group_by(Year, Month, DayofMonth),
   select(Year:DayofMonth, ArrDelay, DepDelay),
   summarise(
     arr = mean(ArrDelay, na.rm = TRUE), 
     dep = mean(DepDelay, na.rm = TRUE)
   ),
   filter(arr > 30 | dep > 30)
 )
+ Dùng toán tử %.%
hflights %.%
   group_by(Year, Month, DayofMonth) %.%
   select(Year:DayofMonth, ArrDelay, DepDelay) %.%
   summarise(
     arr = mean(ArrDelay, na.rm = TRUE), 
     dep = mean(DepDelay, na.rm = TRUE)
   ) %.%
   filter(arr > 30 | dep > 30)

Tuy nhiên, kể từ ngày 14/4/2014, %>% được sử dụng làm toán tử pipe trong dplyr, thay hoàn toàn cho %.% và chain.