CDS-102: Lab 13 Workbook

Name: Helena Gray

April 26, 2017

In [2]:
# Run this code block to load the Tidyverse package
.libPaths(new = "~/Rlibs")
library(tidyverse)
library(modelr)
# Load the save file that preloads the dataset and
# the model.sin function
load("lab13.RData")
Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ---------------------------------------------------
filter(): dplyr, stats
lag():    dplyr, stats
In [2]:
# To change the size of any plots, copy the code snippet
# below, uncomment it, and set the size of the width
# and height.
# Note: All subsequent figures will use the same size,
# unless you change the options() snippet and run it
# again.

# options(repr.plot.width=6, repr.plot.height=4)
In [4]:
# Test that dataset loaded
glimpse(t.data)
Observations: 8,036
Variables: 4
$ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
$ day   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18...
$ year  <int> 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1...
$ t.avg <dbl> 42.3, 39.8, 28.0, 32.3, 20.6, 24.4, 37.5, 36.7, 36.3, 32.5, 3...
In [5]:
# Test that model.sin loaded
print(model.sin(T = 1, n = 0, x = 1/3))
[1] 0.8660254

Lab Task 1

In [6]:
ndays <- nrow(t.data)
t.data <- add_column(t.data, .before=TRUE, number_days=1:ndays)
t.data.filtered <- filter(t.data, t.avg > -99)

Lab Task 2

In [7]:
ggplot(t.data.filtered) + geom_point(aes(x=number_days, y=t.avg), size=0.5)

Lab Task 3

In [8]:
 t.data.y9596<-filter(t.data.filtered, c(year==1995|year==1996))

ggplot(t.data.y9596) + geom_point(aes(x=number_days, y=t.avg), size=0.5)

In the northern hemisphere we tend to be hottest in July and coldest in January. The difference between January 1st and July 1st is roughly 180 days. The time it takes the Earth to make one rotation around the sun is roughly 365 days or a year.

In [10]:
model.T<-365
In [11]:
mod_n_0 <- lm(t.avg~model.sin(T=model.T, n=0, x=number_days),
data=t.data.filtered)

mod_n_1 <- lm(t.avg~model.sin(T=model.T, n=1, x=number_days),
data=t.data.filtered)

mod_n_2 <- lm(t.avg~model.sin(T=model.T, n=2, x=number_days),
data=t.data.filtered)

mod_n_3 <- lm(t.avg~model.sin(T=model.T, n=3, x=number_days),
data=t.data.filtered)

mod_n_4 <- lm(t.avg~model.sin(T=model.T, n=4, x=number_days),
data=t.data.filtered)

mod_n_5 <- lm(t.avg~model.sin(T=model.T, n=5, x=number_days),
data=t.data.filtered)

mod_n_5
mod_n_4
mod_n_3
mod_n_2
mod_n_1
mod_n_0
Call:
lm(formula = t.avg ~ model.sin(T = model.T, n = 5, x = number_days), 
    data = t.data.filtered)

Coefficients:
                                   (Intercept)  
                                         56.16  
model.sin(T = model.T, n = 5, x = number_days)  
                                         17.23  
Call:
lm(formula = t.avg ~ model.sin(T = model.T, n = 4, x = number_days), 
    data = t.data.filtered)

Coefficients:
                                   (Intercept)  
                                         56.18  
model.sin(T = model.T, n = 4, x = number_days)  
                                         21.57  
Call:
lm(formula = t.avg ~ model.sin(T = model.T, n = 3, x = number_days), 
    data = t.data.filtered)

Coefficients:
                                   (Intercept)  
                                         56.19  
model.sin(T = model.T, n = 3, x = number_days)  
                                         20.14  
Call:
lm(formula = t.avg ~ model.sin(T = model.T, n = 2, x = number_days), 
    data = t.data.filtered)

Coefficients:
                                   (Intercept)  
                                         56.18  
model.sin(T = model.T, n = 2, x = number_days)  
                                         13.29  
Call:
lm(formula = t.avg ~ model.sin(T = model.T, n = 1, x = number_days), 
    data = t.data.filtered)

Coefficients:
                                   (Intercept)  
                                        56.154  
model.sin(T = model.T, n = 1, x = number_days)  
                                         2.862  
Call:
lm(formula = t.avg ~ model.sin(T = model.T, n = 0, x = number_days), 
    data = t.data.filtered)

Coefficients:
                                   (Intercept)  
                                        56.143  
model.sin(T = model.T, n = 0, x = number_days)  
                                        -8.308  

Lab Task 5

In [12]:
grid <- data_grid(data=t.data.y9596,
number_days=seq_range(number_days, n=1000,
expand=0.05))
grid <- gather_predictions(grid, mod_n_0, mod_n_1, mod_n_2,
mod_n_3, mod_n_4,
mod_n_5, .pred="t.avg")
In [13]:
ggplot(t.data.y9596) +
geom_point(aes(number_days, t.avg)) +
geom_line(data=grid, aes(number_days, t.avg),
color="red", size=1) +
facet_wrap(~model)

Lab Task 6

In [14]:
summary(mod_n_0)$adj.r.squared
sd(summary(mod_n_0)$residuals)

summary(mod_n_1)$adj.r.squared
sd(summary(mod_n_1)$residuals)


summary(mod_n_2)$adj.r.squared
sd(summary(mod_n_2)$residuals)


summary(mod_n_3)$adj.r.squared
sd(summary(mod_n_3)$residuals)


summary(mod_n_4)$adj.r.squared
sd(summary(mod_n_4)$residuals)

summary(mod_n_5)$adj.r.squared
sd(summary(mod_n_5)$residuals)
0.118288503785666
16.0387712780537
0.0138997638112263
16.9616607978321
0.302097593503599
14.2693828523976
0.694422195979437
9.44210325198461
0.797717251700635
7.68222928794176
0.50978190146657
11.9592105773343

Lab Task 7

In [21]:
mod_n_3.5 <- lm(t.avg~model.sin(T=model.T, n=3.5, x=number_days),
data=t.data.filtered)


grid <- data_grid(data=t.data.y9596,
number_days=seq_range(number_days, n=1000,
expand=0.05))

grid <- add_predictions(grid, mod_n_3.5)
print(grid)
ggplot(t.data.y9596) +
geom_point(aes(number_days, t.avg)) +
geom_line(data=grid, aes(number_days, pred),
color="red", size=1) 

summary(mod_n_3.5)$adj.r.squared
sd(summary(mod_n_3.5)$residuals)

model.n<-mod_n_3.5 
# A tibble: 1,000 × 2
   number_days     pred
         <dbl>    <dbl>
1    -17.25000 37.88593
2    -16.48273 37.73637
3    -15.71547 37.59002
4    -14.94820 37.44693
5    -14.18093 37.30710
6    -13.41366 37.17056
7    -12.64640 37.03735
8    -11.87913 36.90748
9    -11.11186 36.78097
10   -10.34459 36.65784
# ... with 990 more rows
0.79871315879651
7.66329483397451

Lab Task 8

In [22]:
summary(model.n)
Call:
lm(formula = t.avg ~ model.sin(T = model.T, n = 3.5, x = number_days), 
    data = t.data.filtered)

Residuals:
    Min      1Q  Median      3Q     Max 
-30.710  -5.080  -0.176   4.880  31.957 

Coefficients:
                                                 Estimate Std. Error t value
(Intercept)                                      56.19194    0.08566   656.0
model.sin(T = model.T, n = 3.5, x = number_days) 21.58927    0.12114   178.2
                                                 Pr(>|t|)    
(Intercept)                                        <2e-16 ***
model.sin(T = model.T, n = 3.5, x = number_days)   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 7.664 on 8003 degrees of freedom
Multiple R-squared:  0.7987,	Adjusted R-squared:  0.7987 
F-statistic: 3.176e+04 on 1 and 8003 DF,  p-value: < 2.2e-16
In [23]:
A<- 56.19194
B<- 21.58927 
n<- 3.5
#T

Lab Task 9

In [25]:
grid.final <- data_grid(t.data.filtered,
number_days=seq_range(number_days, n=10000,
expand=0.01))
grid.final <- add_predictions(grid.final, mod_n_3.5)
t.data.with.resid <- add_residuals(t.data.filtered, mod_n_3.5)
In [26]:
ggplot(t.data.with.resid) +
geom_point(aes(number_days, t.avg), size=0.5) +
geom_line(data=grid.final,
mapping=aes(number_days, pred),
color="red", size=1)
In [27]:
ggplot(t.data.with.resid) +
geom_point(aes(number_days, resid)) +
geom_ref_line(h=0)
In [28]:
ggplot(t.data.with.resid) +
geom_histogram(aes(resid), binwidth=1,
color="cyan4", fill="cyan2")
In [ ]: