Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7120
License: GPL3
1
"""This file contains code for use with "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2014 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function, division
9
10
import pandas
11
import numpy as np
12
import statsmodels.formula.api as smf
13
14
import thinkplot
15
import thinkstats2
16
import regression
17
import timeseries
18
19
20
def RunQuadraticModel(daily):
21
"""Runs a linear model of prices versus years.
22
23
daily: DataFrame of daily prices
24
25
returns: model, results
26
"""
27
daily['years2'] = daily.years**2
28
model = smf.ols('ppg ~ years + years2', data=daily)
29
results = model.fit()
30
return model, results
31
32
33
def PlotQuadraticModel(daily, name):
34
"""
35
"""
36
model, results = RunQuadraticModel(daily)
37
regression.SummarizeResults(results)
38
timeseries.PlotFittedValues(model, results, label=name)
39
thinkplot.Save(root='timeseries11',
40
title='fitted values',
41
xlabel='years',
42
xlim=[-0.1, 3.8],
43
ylabel='price per gram ($)')
44
45
timeseries.PlotResidualPercentiles(model, results)
46
thinkplot.Save(root='timeseries12',
47
title='residuals',
48
xlabel='years',
49
ylabel='price per gram ($)')
50
51
years = np.linspace(0, 5, 101)
52
thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)
53
timeseries.PlotPredictions(daily, years, func=RunQuadraticModel)
54
thinkplot.Save(root='timeseries13',
55
title='predictions',
56
xlabel='years',
57
xlim=[years[0]-0.1, years[-1]+0.1],
58
ylabel='price per gram ($)')
59
60
61
def PlotEwmaPredictions(daily, name):
62
"""
63
"""
64
65
# use EWMA to estimate slopes
66
filled = timeseries.FillMissing(daily)
67
filled['slope'] = pandas.ewma(filled.ppg.diff(), span=180)
68
filled[-1:]
69
70
# extract the last inter and slope
71
start = filled.index[-1]
72
inter = filled.ewma[-1]
73
slope = filled.slope[-1]
74
75
# reindex the DataFrame, adding a year to the end
76
dates = pandas.date_range(filled.index.min(),
77
filled.index.max() + np.timedelta64(365, 'D'))
78
predicted = filled.reindex(dates)
79
80
# generate predicted values and add them to the end
81
predicted['date'] = predicted.index
82
one_day = np.timedelta64(1, 'D')
83
predicted['days'] = (predicted.date - start) / one_day
84
predict = inter + slope * predicted.days
85
predicted.ewma.fillna(predict, inplace=True)
86
87
# plot the actual values and predictions
88
thinkplot.Scatter(daily.ppg, alpha=0.1, label=name)
89
thinkplot.Plot(predicted.ewma)
90
thinkplot.Save()
91
92
93
class SerialCorrelationTest(thinkstats2.HypothesisTest):
94
"""Tests serial correlations by permutation."""
95
96
def TestStatistic(self, data):
97
"""Computes the test statistic.
98
99
data: tuple of xs and ys
100
"""
101
series, lag = data
102
test_stat = abs(thinkstats2.SerialCorr(series, lag))
103
return test_stat
104
105
def RunModel(self):
106
"""Run the model of the null hypothesis.
107
108
returns: simulated data
109
"""
110
series, lag = self.data
111
permutation = series.reindex(np.random.permutation(series.index))
112
return permutation, lag
113
114
115
def TestSerialCorr(daily):
116
"""Tests serial correlations in daily prices and their residuals.
117
118
daily: DataFrame of daily prices
119
"""
120
# test the correlation between consecutive prices
121
series = daily.ppg
122
test = SerialCorrelationTest((series, 1))
123
pvalue = test.PValue()
124
print(test.actual, pvalue)
125
126
# test for serial correlation in residuals of the linear model
127
_, results = timeseries.RunLinearModel(daily)
128
series = results.resid
129
test = SerialCorrelationTest((series, 1))
130
pvalue = test.PValue()
131
print(test.actual, pvalue)
132
133
# test for serial correlation in residuals of the quadratic model
134
_, results = RunQuadraticModel(daily)
135
series = results.resid
136
test = SerialCorrelationTest((series, 1))
137
pvalue = test.PValue()
138
print(test.actual, pvalue)
139
140
141
def main(name):
142
transactions = timeseries.ReadData()
143
144
dailies = timeseries.GroupByQualityAndDay(transactions)
145
name = 'high'
146
daily = dailies[name]
147
148
PlotQuadraticModel(daily, name)
149
TestSerialCorr(daily)
150
PlotEwmaPredictions(daily, name)
151
152
153
if __name__ == '__main__':
154
import sys
155
main(*sys.argv)
156
157