搜索
查看: 70|回复: 0

[R] 第18章

[复制链接]

24

主题

29

帖子

354

积分

中级会员

Rank: 3Rank: 3

积分
354
发表于 2018-12-1 15:16:56 | 显示全部楼层 |阅读模式
本帖最后由 Lespremeto 于 2018-12-1 15:16 编辑

# 第18章 模型构建
# 准备工作
[AppleScript] 纯文本查看 复制代码
library(ggplot2)

library(tidyverse)

library(modelr)

options(na.action = na.warn)

library(nycflights13)

library(lubridate)


#2 为什么质量差的钻石更贵
# 最差的钻石颜色是J(微黄),最差的纯净度是H(肉眼可见内含物)
[AppleScript] 纯文本查看 复制代码
ggplot(diamonds, aes(cut, price)) +

  geom_boxplot()



ggplot(diamonds, aes(color, price)) +

  geom_boxplot()



ggplot(diamonds, aes(clarity, price)) +

  geom_boxplot()


#2.1 价格与质量
# 质量差的钻石更贵的原因——重量这一混淆变量
[AppleScript] 纯文本查看 复制代码
ggplot(diamonds, aes(carat, price)) +

  geom_hex(bins = 50)


# 先对钻石数据集进行调整
# 重点关注小于2.5克拉的钻石(全部数据的99.7%)
# 对重量和价格变量进行对数转换
[AppleScript] 纯文本查看 复制代码
diamonds2 <- diamonds %>%

  filter(carat <= 2.5) %>%

  mutate(lprice = log2(price), lcarat = log2(carat))

ggplot(diamonds2, aes(lcarat, lprice)) +

  geom_hex(bins = 50)


# 对数转换在示例中非常有用,可让模式变为线性
[AppleScript] 纯文本查看 复制代码
mod_diamond <- lm(lprice ~ lcarat, data = diamonds2)



grid <- diamonds2 %>%

  data_grid(carat = seq_range(carat, 20)) %>%

  mutate(lcarat = log2(carat)) %>%

  add_predictions(mod_diamond, "lprice") %>%

  mutate(price = 2 ^ lprice)



ggplot(diamonds2, aes(carat, price)) +

  geom_hex(bins = 50) +

  geom_line(data = grid, color = "red", size = 1)


# 检查一下残差,验证是否成功移除了强烈的线性模式
[AppleScript] 纯文本查看 复制代码
diamonds2 <- diamonds2 %>%

  add_residuals(mod_diamond, "lresid")

ggplot(diamonds2, aes(lcarat, lresid)) +

  geom_hex(bins = 50)


# 使用残差代替price重新绘图
[AppleScript] 纯文本查看 复制代码
ggplot(diamonds2, aes(cut, lresid)) + 

  geom_boxplot()

ggplot(diamonds2, aes(color, lresid)) + 

  geom_boxplot()

ggplot(diamonds2, aes(clarity, lresid)) + 

  geom_boxplot()


#2.2 更复杂的模型
[AppleScript] 纯文本查看 复制代码
mod_diamond2 <- lm(

  lprice ~ lcarat + color +cut +clarity,

  data = diamonds2

)



grid <- diamonds2 %>%

  data_grid(cut, .model = mod_diamond2) %>%

  add_predictions(mod_diamond2)

grid



ggplot(grid, aes(cut, pred)) + 

  geom_point()


# 对于连续变量,模型使用中位数,对于分类变量,模型使用最常见的值
[AppleScript] 纯文本查看 复制代码
diamonds2 <- diamonds2 %>%

  add_residuals(mod_diamond2, "lresid2")

ggplot(diamonds2, aes(lcarat, lresid2)) +

  geom_hex(bins = 50)



diamonds2 %>%

  filter(abs(lresid2) > 1) %>%

  add_predictions(mod_diamond2) %>%

  mutate(pred = round(2 ^ pred)) %>%

  select(price, pred, carat:table, x:z) %>%

  arrange(price)


#3 哪些因素影响了每日航班数量
# 计算每天的航班数量
[AppleScript] 纯文本查看 复制代码
daily <- flights %>%

  mutate(date = make_date(year, month, day)) %>%

  group_by(date) %>%

  summarize(n = n())

ggplot(daily, aes(date, n)) +

  geom_line()


#3.1 一周中的每一天
[AppleScript] 纯文本查看 复制代码
daily <- daily %>%

  mutate(wday = wday(date, label = T))

ggplot(daily, aes(wday, n)) +

  geom_boxplot()


# 去除强烈模式的方法是使用模型,首先拟合这个模型,并将预测值覆盖在原始数据上
[AppleScript] 纯文本查看 复制代码
mod <- lm(n ~ wday, data = daily)

grid <- daily %>%

  data_grid(wday) %>%

  add_predictions(mod, "n")



ggplot(daily, aes(wday, n)) +

  geom_boxplot() +

  geom_point(data = grid, color = "red", size = 4)


# 计算残差,对其可视化表示
[AppleScript] 纯文本查看 复制代码
daily <- daily %>%

  add_residuals(mod)

daily %>%

  ggplot(aes(date, resid)) +

  geom_ref_line(h = 0) +

  geom_line()

# 该图去除了大部分周内效应
# 我们的模型似乎从6月开始失效
[AppleScript] 纯文本查看 复制代码
ggplot(daily, aes(date, resid, color = wday)) +

  geom_ref_line(h = 0) +

  geom_line()


# 某几天的航班数量远少于预期
[AppleScript] 纯文本查看 复制代码
daily %>%

  filter(resid < -100)



# 从整年来看,似乎有某种更平滑的长期趋势
[AppleScript] 纯文本查看 复制代码
daily %>%

  ggplot(aes(date, resid)) +

  geom_ref_line(h = 0) +

  geom_line(color = "grey50") +

  geom_smooth(se = F, span = .2)


#3.2 季节性周六效应
[AppleScript] 纯文本查看 复制代码
daily  %>%

  filter(wday == "Sat") %>%

  ggplot(aes(date, n)) +

  geom_point() +

  geom_line() +

  scale_x_date(

    NULL,

    date_breaks = "1 month",

    date_labels = "%b"

  )


# 这种模式可能是暑假造成的
[AppleScript] 纯文本查看 复制代码
term <- function(date){

  cut(date,

      breaks = ymd(20130101, 20130605, 20130825, 20140101),

      labels = c("spring", "summer", "fall")

      )

}

daily <- daily %>%

  mutate(term = term(date))

daily %>%

  filter(wday == "Sat") %>%

  ggplot(aes(date, n, color = term)) +

  geom_point(alpha = 1/3) +

  geom_line() +

  scale_x_date(

    NULL,

    date_breaks = "1 month",

    date_labels = "%b"

  )



daily %>%

  ggplot(aes(wday, n, color =term)) +

  geom_boxplot()

# 不同学期间的差别非常大,因此应拟合去除学期周内效应的模型
[AppleScript] 纯文本查看 复制代码
mod1 <- lm(n ~ wday, data = daily)

mod2 <- lm(n ~ wday + term, data = daily)



daily %>%

  gather_residuals(without_term = mod1, with_term = mod2) %>%

  ggplot(aes(date, resid, color = model)) +

  geom_line(alpha = .75)

# 但效果差强人意,将模型预测值覆盖在原始数据上,就可看出问题
[AppleScript] 纯文本查看 复制代码
daily <- daily %>%

  data_grid(wday, term) %>%

  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +

  geom_boxplot() +

  geom_point(data = grid, color = "red") +

  facet_wrap( ~ term)



mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%

  add_residuals(mod3, "resid") %>%

  ggplot(aes(date, resid)) + 

  geom_hline(yintercept = 0, size = 2, color = "white") +

  geom_line()


# 计算出的变量
[AppleScript] 纯文本查看 复制代码
compute_vars <- function(data){

  data %>%

    mutate(

      term = term(date),

      wday =wday(date, label = T)

    )

}

# 另一种方法,直接在模型公式中进行转换
[AppleScript] 纯文本查看 复制代码
wday2 <- function(x)wday(x, label = T)

mod3 <- lm(n ~ wday2(date) * term(date), data = daily)


# 年度时间
# 使用自然样条法拟合一个年度平滑曲线
[AppleScript] 纯文本查看 复制代码
library(splines)

library(MASS)

mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)



daily %>%

  data_grid(wday, date = seq_range(date, n = 13)) %>%

  add_predictions(mod) %>%

  ggplot(aes(date, pred, color = wday)) +

  geom_line() +

  geom_point()



本帖子中包含更多资源

您需要 登录 才可以下载或查看,没有帐号?立即注册

x



上一篇:慧美——R for data science 第19章 使用purrr和broom处理多个模型
下一篇:第19章
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

QQ|手机版|小黑屋|生信技能树    

GMT+8, 2018-12-12 05:24 , Processed in 0.030070 second(s), 27 queries .

Powered by Discuz! X3.2

© 2001-2013 Comsenz Inc.