# Run this code block to load the Tidyverse package
.libPaths(new = "~/Rlibs")
library(tidyverse)
# Read the temperature records from "MDWASHDC_JAN1995_DEC2016.csv" (looked up
# relative to the working directory) and preview the first six rows.
dc.temps <- read.csv(file = "MDWASHDC_JAN1995_DEC2016.csv")
head(dc.temps, n = 6L)
The code below uses the summarise() function to generate a summary-statistics report (mean(), median(), min(), max(), and sd()) of the average temperature, grouped by month.
# Group the daily records by month, then collapse each group into one row of
# summary statistics for the daily average temperature (t.avg).
by_month <- group_by(dc.temps, month)
temps.table <- by_month %>%
  summarise(
    mean = mean(t.avg),
    max = max(t.avg),
    min = min(t.avg),
    med = median(t.avg),
    sd = sd(t.avg)
  )
temps.table
The code below plots a density-normalized histogram (an empirical Probability Mass Function, PMF) of the average daily temperatures in the full dataset for each month of the year, using the ggplot() and geom_histogram() functions. It uses the facet_wrap() function to arrange the result as a 12-panel plot, one panel per month.
# Density-normalized histogram of the daily average temperature (t.avg),
# drawn in 1-unit bins with one facet panel per month.
# after_stat(density) replaces the `..density..` notation, which ggplot2
# deprecated in 3.4.0 (after_stat() requires ggplot2 >= 3.3.0).
all.months.temps <- ggplot(dc.temps) +
  geom_histogram(
    mapping = aes(x = t.avg, y = after_stat(density)),
    binwidth = 1,
    fill = "cyan3",
    color = "cyan4"
  ) +
  facet_wrap(~month)

# Save a 5 x 4 copy of the figure to disk, then display it.
ggsave(
  "all.months.temps.png",
  plot = all.months.temps,
  device = "png",
  scale = 1,
  width = 5,
  height = 4
)
all.months.temps
The code below creates the normal distribution model for the month of June (all years) using the summary statistics computed in task 1 by generating the Probability Density Function (PDF). It then stores the computed values of the model in a new two-column tibble named jun.model.
# Keep only the June records (month == 6) from every year.
dc.temps.june <- filter(dc.temps, month == 6)

# Evaluate the normal PDF at each observed June temperature. The model
# parameters are the sample mean and standard deviation of the June data,
# computed here rather than pasted in as rounded literals so they always
# match the data that was loaded.
jun.pdf <- dnorm(
  x = dc.temps.june$t.avg,
  mean = mean(dc.temps.june$t.avg),
  sd = sd(dc.temps.june$t.avg)
)

# Two-column table: observed temperature and the model density at that value.
jun.model <- tibble(temps = dc.temps.june$t.avg, PDF = jun.pdf)
The code below creates a new plot containing the average daily temperature PMF histogram and the normal distribution model for June (all years). Note whether or not the model visually agrees with the histogram.
# Build a helper histogram of the June temperatures (0.5-wide bins) so that
# ggplot2 computes the per-bin counts and densities.
jun.ggplot <- ggplot(data = dc.temps.june) +
  geom_histogram(mapping = aes(x = t.avg), binwidth = .5, alpha = .5)

# Extract the computed bin table from the first layer of the built plot.
data.ggplot.full <- ggplot_build(jun.ggplot)
data.ggplot.table <- data.ggplot.full$data[[1]]
histogram.table <- tibble(
  x = data.ggplot.table$x,
  density = data.ggplot.table$density,
  frequency = data.ggplot.table$count
)

# Sample mean and standard deviation used as the normal-curve parameters.
mean.june <- mean(dc.temps.june$t.avg)
sd.june <- sd(dc.temps.june$t.avg)

# Plot-size options (repr.* names are read by the notebook display).
options(repr.plot.width = 6, repr.plot.height = 4)

# Bars show the binned densities; the red curve is the fitted normal PDF.
data.ggplot.june <- ggplot(data = histogram.table) +
  geom_col(mapping = aes(x = x, y = density), alpha = .5) +
  stat_function(
    fun = dnorm,
    args = list(mean = mean.june, sd = sd.june),
    color = "red"
  )

ggsave(
  "data.ggplot.june.png",
  plot = data.ggplot.june,
  device = "png",
  scale = 1,
  width = 5,
  height = 4
)
data.ggplot.june
The code below creates a qqplot for the average temperature distribution in June. A theoretical line is computed and included for comparison.
# The reference line passes through the first and third quartiles of the
# June sample (y) and of the standard normal distribution (x).
qq_y <- quantile(dc.temps.june$t.avg, probs = c(0.25, 0.75))
qq_x <- qnorm(p = c(0.25, 0.75))

# Slope = rise over run between the two quartile points; the intercept
# follows from the first quartile point.
qq_slope <- diff(qq_y) / diff(qq_x)
qq_int <- qq_y[1] - qq_slope * qq_x[1]

# Sample quantiles against normal quantiles, with the reference line on top.
qqplot.june <- ggplot(dc.temps.june) +
  geom_qq(mapping = aes(sample = t.avg), color = "cyan3") +
  geom_abline(slope = qq_slope, intercept = qq_int, color = "black")

ggsave(
  "qqplot.june.png",
  plot = qqplot.june,
  device = "png",
  scale = 1,
  width = 5,
  height = 4
)
qqplot.june
The code below creates a 12 panel series of qqplots (without theoretical lines) for each month (all years) using facet_wrap(). Note whether the trend for June applies to the other months.
# One Q-Q panel per month for the daily average temperature (no reference
# lines), laid out with facet_wrap().
all.months.qqplot <- ggplot(dc.temps) +
  geom_qq(mapping = aes(sample = t.avg), color = "cyan3") +
  facet_wrap(~month)

ggsave(
  "all.months.qqplot.png",
  plot = all.months.qqplot,
  device = "png",
  scale = 1,
  width = 5,
  height = 4
)
all.months.qqplot
The normal distribution model is used to compute the temperature at the 10th percentile (the 0.10 quantile) for the month of June (all years) using the qnorm() function.
# The top 90% of temperatures are the temperatures in the 10th percentile or higher
# Parameters of the June normal model, computed from the June records rather
# than typed in as rounded literals, so they stay in sync with the data.
june.mean <- mean(dc.temps.june$t.avg)
june.sd <- sd(dc.temps.june$t.avg)

# Temperature at the 0.10 quantile of the June normal model.
june.p10 <- qnorm(p = 0.10, mean = june.mean, sd = june.sd)
june.p10
The normal distribution model is used to compute the percentile of the temperature 83°F for the month of June (all years) using the pnorm() function.
# Lower-tail probability P(T <= 83) under the June normal model, i.e. the
# percentile of 83 expressed as a fraction.
pnorm(q = 83, mean = june.mean, sd = june.sd, lower.tail = TRUE)
For the month of June, what is the probability that any given day will have a temperature of 83°F or higher? The code below uses the pnorm() function with lower.tail = FALSE to find this upper-tail probability.
How cold are the coldest 10% of days? The qnorm() function is used to find the cutoff temperature at or below which the coldest 10% of June days fall (the 10th percentile), not their average.
# Upper-tail probability P(T > 83) under the June normal model.
pnorm(q = 83, mean = june.mean, sd = june.sd, lower.tail = FALSE)
# Temperature at the 0.10 quantile: the cutoff for the coldest 10% of days.
qnorm(p = 0.1, mean = june.mean, sd = june.sd, lower.tail = TRUE)
Report the mean for the month of March with a 68% and a 95% confidence interval.
# Report the June mean with +-1 sd (about 68%) and +-2 sd (about 95%) ranges.
# NOTE(review): the task text asks for March, but the statistics used here
# are the June values (june.mean, june.sd) -- confirm which month is intended.
# NOTE(review): mean +- k * sd describes the spread of individual days; a
# confidence interval for the mean itself would use sd / sqrt(n) -- confirm.
ci.95 <- 2 * june.sd

# The labels name June because that is what is computed; each message ends
# with a newline so the values printed below start on their own line.
cat("The 95% confidence interval for June is ", june.mean, "+-", ci.95, "\n")
cat("The 68% confidence interval for June is ", june.mean, "+-", june.sd, "\n")

# Upper and lower bounds of the 95% range, then of the 68% range.
june.mean + ci.95
june.mean - ci.95
june.mean + june.sd
june.mean - june.sd