搜索
查看: 926|回复: 0

[R] 第18章

[复制链接]

31

主题

36

帖子

1175

积分

金牌会员

Rank: 6Rank: 6

积分
1175
发表于 2018-12-1 15:16:56 | 显示全部楼层 |阅读模式
本帖最后由 Lespremeto 于 2018-12-1 15:16 编辑

# 第18章 模型构建
# 准备工作
[AppleScript] 纯文本查看 复制代码
library(ggplot2)

library(tidyverse)

library(modelr)

options(na.action = na.warn)

library(nycflights13)

library(lubridate)


#2 为什么质量差的钻石更贵
# 最差的钻石颜色是J(微黄),最差的纯净度是H(肉眼可见内含物)
[AppleScript] 纯文本查看 复制代码
ggplot(diamonds, aes(cut, price)) +

  geom_boxplot()



ggplot(diamonds, aes(color, price)) +

  geom_boxplot()



ggplot(diamonds, aes(clarity, price)) +

  geom_boxplot()


#2.1 价格与质量
# 质量差的钻石更贵的原因——重量这一混淆变量
[AppleScript] 纯文本查看 复制代码
ggplot(diamonds, aes(carat, price)) +

  geom_hex(bins = 50)


# 先对钻石数据集进行调整
# 重点关注小于2.5克拉的钻石(全部数据的99.7%)
# 对重量和价格变量进行对数转换
[AppleScript] 纯文本查看 复制代码
diamonds2 <- diamonds %>%

  filter(carat <= 2.5) %>%

  mutate(lprice = log2(price), lcarat = log2(carat))

ggplot(diamonds2, aes(lcarat, lprice)) +

  geom_hex(bins = 50)


# 对数转换在示例中非常有用,可让模式变为线性
[AppleScript] 纯文本查看 复制代码
mod_diamond <- lm(lprice ~ lcarat, data = diamonds2)



grid <- diamonds2 %>%

  data_grid(carat = seq_range(carat, 20)) %>%

  mutate(lcarat = log2(carat)) %>%

  add_predictions(mod_diamond, "lprice") %>%

  mutate(price = 2 ^ lprice)



ggplot(diamonds2, aes(carat, price)) +

  geom_hex(bins = 50) +

  geom_line(data = grid, color = "red", size = 1)


# 检查一下残差,验证是否成功移除了强烈的线性模式
[AppleScript] 纯文本查看 复制代码
diamonds2 <- diamonds2 %>%

  add_residuals(mod_diamond, "lresid")

ggplot(diamonds2, aes(lcarat, lresid)) +

  geom_hex(bins = 50)


# 使用残差代替price重新绘图
[AppleScript] 纯文本查看 复制代码
ggplot(diamonds2, aes(cut, lresid)) + 

  geom_boxplot()

ggplot(diamonds2, aes(color, lresid)) + 

  geom_boxplot()

ggplot(diamonds2, aes(clarity, lresid)) + 

  geom_boxplot()


#2.2 更复杂的模型
[AppleScript] 纯文本查看 复制代码
mod_diamond2 <- lm(

  lprice ~ lcarat + color +cut +clarity,

  data = diamonds2

)



grid <- diamonds2 %>%

  data_grid(cut, .model = mod_diamond2) %>%

  add_predictions(mod_diamond2)

grid



ggplot(grid, aes(cut, pred)) + 

  geom_point()


# 对于连续变量,模型使用中位数,对于分类变量,模型使用最常见的值
[AppleScript] 纯文本查看 复制代码
diamonds2 <- diamonds2 %>%

  add_residuals(mod_diamond2, "lresid2")

ggplot(diamonds2, aes(lcarat, lresid2)) +

  geom_hex(bins = 50)



diamonds2 %>%

  filter(abs(lresid2) > 1) %>%

  add_predictions(mod_diamond2) %>%

  mutate(pred = round(2 ^ pred)) %>%

  select(price, pred, carat:table, x:z) %>%

  arrange(price)


#3 哪些因素影响了每日航班数量
# 计算每天的航班数量
[AppleScript] 纯文本查看 复制代码
daily <- flights %>%

  mutate(date = make_date(year, month, day)) %>%

  group_by(date) %>%

  summarize(n = n())

ggplot(daily, aes(date, n)) +

  geom_line()


#3.1 一周中的每一天
[AppleScript] 纯文本查看 复制代码
daily <- daily %>%

  mutate(wday = wday(date, label = T))

ggplot(daily, aes(wday, n)) +

  geom_boxplot()


# 去除强烈模式的方法是使用模型,首先拟合这个模型,并将预测值覆盖在原始数据上
[AppleScript] 纯文本查看 复制代码
mod <- lm(n ~ wday, data = daily)

grid <- daily %>%

  data_grid(wday) %>%

  add_predictions(mod, "n")



ggplot(daily, aes(wday, n)) +

  geom_boxplot() +

  geom_point(data = grid, color = "red", size = 4)


# 计算残差,对其可视化表示
[AppleScript] 纯文本查看 复制代码
daily <- daily %>%

  add_residuals(mod)

daily %>%

  ggplot(aes(date, resid)) +

  geom_ref_line(h = 0) +

  geom_line()

# 该图去除了大部分周内效应
# 我们的模型似乎从6月开始失效
[AppleScript] 纯文本查看 复制代码
ggplot(daily, aes(date, resid, color = wday)) +

  geom_ref_line(h = 0) +

  geom_line()


# 某几天的航班数量远少于预期
[AppleScript] 纯文本查看 复制代码
daily %>%

  filter(resid < -100)



# 从整年来看,似乎有某种更平滑的长期趋势
[AppleScript] 纯文本查看 复制代码
daily %>%

  ggplot(aes(date, resid)) +

  geom_ref_line(h = 0) +

  geom_line(color = "grey50") +

  geom_smooth(se = F, span = .2)


#3.2 季节性周六效应
[AppleScript] 纯文本查看 复制代码
daily  %>%

  filter(wday == "Sat") %>%

  ggplot(aes(date, n)) +

  geom_point() +

  geom_line() +

  scale_x_date(

    NULL,

    date_breaks = "1 month",

    date_labels = "%b"

  )


# 这种模式可能是暑假造成的
[AppleScript] 纯文本查看 复制代码
term <- function(date){

  cut(date,

      breaks = ymd(20130101, 20130605, 20130825, 20140101),

      labels = c("spring", "summer", "fall")

      )

}

daily <- daily %>%

  mutate(term = term(date))

daily %>%

  filter(wday == "Sat") %>%

  ggplot(aes(date, n, color = term)) +

  geom_point(alpha = 1/3) +

  geom_line() +

  scale_x_date(

    NULL,

    date_breaks = "1 month",

    date_labels = "%b"

  )



daily %>%

  ggplot(aes(wday, n, color =term)) +

  geom_boxplot()

# 不同学期间的差别非常大,因此应拟合去除学期周内效应的模型
[AppleScript] 纯文本查看 复制代码
mod1 <- lm(n ~ wday, data = daily)

mod2 <- lm(n ~ wday + term, data = daily)



daily %>%

  gather_residuals(without_term = mod1, with_term = mod2) %>%

  ggplot(aes(date, resid, color = model)) +

  geom_line(alpha = .75)

# 但效果差强人意,将模型预测值覆盖在原始数据上,就可看出问题
[AppleScript] 纯文本查看 复制代码
daily <- daily %>%

  data_grid(wday, term) %>%

  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +

  geom_boxplot() +

  geom_point(data = grid, color = "red") +

  facet_wrap( ~ term)



mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%

  add_residuals(mod3, "resid") %>%

  ggplot(aes(date, resid)) + 

  geom_hline(yintercept = 0, size = 2, color = "white") +

  geom_line()


# 计算出的变量
[AppleScript] 纯文本查看 复制代码
compute_vars <- function(data){

  data %>%

    mutate(

      term = term(date),

      wday =wday(date, label = T)

    )

}

# 另一种方法,直接在模型公式中进行转换
[AppleScript] 纯文本查看 复制代码
wday2 <- function(x)wday(x, label = T)

mod3 <- lm(n ~ wday2(date) * term(date), data = daily)


# 年度时间
# 使用自然样条法拟合一个年度平滑曲线
[AppleScript] 纯文本查看 复制代码
library(splines)

library(MASS)

mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)



daily %>%

  data_grid(wday, date = seq_range(date, n = 13)) %>%

  add_predictions(mod) %>%

  ggplot(aes(date, pred, color = wday)) +

  geom_line() +

  geom_point()



year geom smooth.png
wday n red.png
wday color term.png
term fitting.png
term fitting data.png
resid.png
re cut lresid.png
re color lresid.png
re clarity lresid.png
price carat hex.png
lresid2 hex.png
lcarat lprice.png
june geomrefline.png
hex line.png
flights resid.png
flights per day.png
date n.png
cut price.png
cut pred.png
color price.png
clarity price.png



上一篇:慧美——R for data science 第19章 使用purrr和broom处理多个模型
下一篇:第19章
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

QQ|手机版|小黑屋|生信技能树 ( 粤ICP备15016384号  

GMT+8, 2019-11-21 02:10 , Processed in 0.029839 second(s), 30 queries .

Powered by Discuz! X3.2

© 2001-2013 Comsenz Inc.