搜索
查看: 1239|回复: 0

[R] 第18章

[复制链接]

31

主题

36

帖子

1439

积分

金牌会员

Rank: 6Rank: 6

积分
1439
发表于 2018-12-1 15:16:56 | 显示全部楼层 |阅读模式
本帖最后由 Lespremeto 于 2018-12-1 15:16 编辑

# 第18章 模型构建
# 准备工作
[mw_shl_code=applescript,true]library(ggplot2)

library(tidyverse)

library(modelr)

options(na.action = na.warn)

library(nycflights13)

library(lubridate)

[/mw_shl_code]
#2 为什么质量差的钻石更贵
# 最差的钻石颜色是J(微黄),最差的纯净度是H(肉眼可见内含物)
[mw_shl_code=applescript,true]ggplot(diamonds, aes(cut, price)) +

  geom_boxplot()



ggplot(diamonds, aes(color, price)) +

  geom_boxplot()



ggplot(diamonds, aes(clarity, price)) +

  geom_boxplot()[/mw_shl_code]

#2.1 价格与质量
# 质量差的钻石更贵的原因——重量这一混淆变量
[mw_shl_code=applescript,true]ggplot(diamonds, aes(carat, price)) +

  geom_hex(bins = 50)[/mw_shl_code]

# 先对钻石数据集进行调整
# 重点关注小于2.5克拉的钻石(全部数据的99.7%)
# 对重量和价格变量进行对数转换
[mw_shl_code=applescript,true]diamonds2 <- diamonds %>%

  filter(carat <= 2.5) %>%

  mutate(lprice = log2(price), lcarat = log2(carat))

ggplot(diamonds2, aes(lcarat, lprice)) +

  geom_hex(bins = 50)[/mw_shl_code]

# 对数转换在示例中非常有用,可让模式变为线性
[mw_shl_code=applescript,true]mod_diamond <- lm(lprice ~ lcarat, data = diamonds2)



grid <- diamonds2 %>%

  data_grid(carat = seq_range(carat, 20)) %>%

  mutate(lcarat = log2(carat)) %>%

  add_predictions(mod_diamond, "lprice") %>%

  mutate(price = 2 ^ lprice)



ggplot(diamonds2, aes(carat, price)) +

  geom_hex(bins = 50) +

  geom_line(data = grid, color = "red", size = 1)[/mw_shl_code]

# 检查一下残差,验证是否成功移除了强烈的线性模式
[mw_shl_code=applescript,true]diamonds2 <- diamonds2 %>%

  add_residuals(mod_diamond, "lresid")

ggplot(diamonds2, aes(lcarat, lresid)) +

  geom_hex(bins = 50)[/mw_shl_code]

# 使用残差代替price重新绘图
[mw_shl_code=applescript,true]ggplot(diamonds2, aes(cut, lresid)) +

  geom_boxplot()

ggplot(diamonds2, aes(color, lresid)) +

  geom_boxplot()

ggplot(diamonds2, aes(clarity, lresid)) +

  geom_boxplot()[/mw_shl_code]

#2.2 更复杂的模型
[mw_shl_code=applescript,true]mod_diamond2 <- lm(

  lprice ~ lcarat + color +cut +clarity,

  data = diamonds2

)



grid <- diamonds2 %>%

  data_grid(cut, .model = mod_diamond2) %>%

  add_predictions(mod_diamond2)

grid



ggplot(grid, aes(cut, pred)) +

  geom_point()[/mw_shl_code]

# 对于连续变量,模型使用中位数,对于分类变量,模型使用最常见的值
[mw_shl_code=applescript,true]diamonds2 <- diamonds2 %>%

  add_residuals(mod_diamond2, "lresid2")

ggplot(diamonds2, aes(lcarat, lresid2)) +

  geom_hex(bins = 50)



diamonds2 %>%

  filter(abs(lresid2) > 1) %>%

  add_predictions(mod_diamond2) %>%

  mutate(pred = round(2 ^ pred)) %>%

  select(price, pred, carat:table, x:z) %>%

  arrange(price)[/mw_shl_code]

#3 哪些因素影响了每日航班数量
# 计算每天的航班数量
[mw_shl_code=applescript,true]daily <- flights %>%

  mutate(date = make_date(year, month, day)) %>%

  group_by(date) %>%

  summarize(n = n())

ggplot(daily, aes(date, n)) +

  geom_line()[/mw_shl_code]

#3.1 一周中的每一天
[mw_shl_code=applescript,true]daily <- daily %>%

  mutate(wday = wday(date, label = T))

ggplot(daily, aes(wday, n)) +

  geom_boxplot()[/mw_shl_code]

# 去除强烈模式的方法是使用模型,首先拟合这个模型,并将预测值覆盖在原始数据上
[mw_shl_code=applescript,true]mod <- lm(n ~ wday, data = daily)

grid <- daily %>%

  data_grid(wday) %>%

  add_predictions(mod, "n")



ggplot(daily, aes(wday, n)) +

  geom_boxplot() +

  geom_point(data = grid, color = "red", size = 4)[/mw_shl_code]

# 计算残差,对其可视化表示
[mw_shl_code=applescript,true]daily <- daily %>%

  add_residuals(mod)

daily %>%

  ggplot(aes(date, resid)) +

  geom_ref_line(h = 0) +

  geom_line()[/mw_shl_code]
# 该图去除了大部分周内效应
# 我们的模型似乎从6月开始失效
[mw_shl_code=applescript,true]ggplot(daily, aes(date, resid, color = wday)) +

  geom_ref_line(h = 0) +

  geom_line()[/mw_shl_code]

# 某几天的航班数量远少于预期
[mw_shl_code=applescript,true]daily %>%

  filter(resid < -100)[/mw_shl_code]


# 从整年来看,似乎有某种更平滑的长期趋势
[mw_shl_code=applescript,true]daily %>%

  ggplot(aes(date, resid)) +

  geom_ref_line(h = 0) +

  geom_line(color = "grey50") +

  geom_smooth(se = F, span = .2)[/mw_shl_code]

#3.2 季节性周六效应
[mw_shl_code=applescript,true]daily  %>%

  filter(wday == "Sat") %>%

  ggplot(aes(date, n)) +

  geom_point() +

  geom_line() +

  scale_x_date(

    NULL,

    date_breaks = "1 month",

    date_labels = "%b"

  )[/mw_shl_code]

# 这种模式可能是暑假造成的
[mw_shl_code=applescript,true]term <- function(date){

  cut(date,

      breaks = ymd(20130101, 20130605, 20130825, 20140101),

      labels = c("spring", "summer", "fall")

      )

}

daily <- daily %>%

  mutate(term = term(date))

daily %>%

  filter(wday == "Sat") %>%

  ggplot(aes(date, n, color = term)) +

  geom_point(alpha = 1/3) +

  geom_line() +

  scale_x_date(

    NULL,

    date_breaks = "1 month",

    date_labels = "%b"

  )



daily %>%

  ggplot(aes(wday, n, color =term)) +

  geom_boxplot()[/mw_shl_code]
# 不同学期间的差别非常大,因此应拟合去除学期周内效应的模型
[mw_shl_code=applescript,true]mod1 <- lm(n ~ wday, data = daily)

mod2 <- lm(n ~ wday + term, data = daily)



daily %>%

  gather_residuals(without_term = mod1, with_term = mod2) %>%

  ggplot(aes(date, resid, color = model)) +

  geom_line(alpha = .75)[/mw_shl_code]
# 但效果差强人意,将模型预测值覆盖在原始数据上,就可看出问题
[mw_shl_code=applescript,true]daily <- daily %>%

  data_grid(wday, term) %>%

  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +

  geom_boxplot() +

  geom_point(data = grid, color = "red") +

  facet_wrap( ~ term)



mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%

  add_residuals(mod3, "resid") %>%

  ggplot(aes(date, resid)) +

  geom_hline(yintercept = 0, size = 2, color = "white") +

  geom_line()[/mw_shl_code]

# 计算出的变量
[mw_shl_code=applescript,true]compute_vars <- function(data){

  data %>%

    mutate(

      term = term(date),

      wday =wday(date, label = T)

    )

}[/mw_shl_code]
# 另一种方法,直接在模型公式中进行转换
[mw_shl_code=applescript,true]wday2 <- function(x)wday(x, label = T)

mod3 <- lm(n ~ wday2(date) * term(date), data = daily)[/mw_shl_code]

# 年度时间
# 使用自然样条法拟合一个年度平滑曲线
[mw_shl_code=applescript,true]library(splines)

library(MASS)

mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)



daily %>%

  data_grid(wday, date = seq_range(date, n = 13)) %>%

  add_predictions(mod) %>%

  ggplot(aes(date, pred, color = wday)) +

  geom_line() +

  geom_point()[/mw_shl_code]


year geom smooth.png
wday n red.png
wday color term.png
term fitting.png
term fitting data.png
resid.png
re cut lresid.png
re color lresid.png
re clarity lresid.png
price carat hex.png
lresid2 hex.png
lcarat lprice.png
june geomrefline.png
hex line.png
flights resid.png
flights per day.png
date n.png
cut price.png
cut pred.png
color price.png
clarity price.png



上一篇:慧美——R for data science 第19章 使用purrr和broom处理多个模型
下一篇:第19章
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

QQ|手机版|小黑屋|生信技能树 ( 粤ICP备15016384号  

GMT+8, 2020-4-7 01:16 , Processed in 0.027022 second(s), 28 queries .

Powered by Discuz! X3.2

© 2001-2013 Comsenz Inc.