Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News Sign UpSign In
| Download

Think Stats by Allen B. Downey Think Stats is an introduction to Probability and Statistics for Python programmers.

This is the accompanying code for this book.

Website: http://greenteapress.com/wp/think-stats-2e/

Views: 7089
License: GPL3
1
"""This file contains code for use with "Think Stats",
2
by Allen B. Downey, available from greenteapress.com
3
4
Copyright 2014 Allen B. Downey
5
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
6
"""
7
8
from __future__ import print_function, division
9
10
import pandas
11
import numpy as np
12
import statsmodels.formula.api as smf
13
import statsmodels.tsa.stattools as smtsa
14
15
import matplotlib.pyplot as pyplot
16
17
import thinkplot
18
import thinkstats2
19
20
FORMATS = ['png']
21
22
def ReadData():
23
"""Reads data about cannabis transactions.
24
25
http://zmjones.com/static/data/mj-clean.csv
26
27
returns: DataFrame
28
"""
29
transactions = pandas.read_csv('mj-clean.csv', parse_dates=[5])
30
return transactions
31
32
33
def tmean(series):
34
"""Computes a trimmed mean.
35
36
series: Series
37
38
returns: float
39
"""
40
t = series.values
41
n = len(t)
42
if n <= 3:
43
return t.mean()
44
trim = max(1, n//10)
45
return np.mean(sorted(t)[trim:n-trim])
46
47
48
def GroupByDay(transactions, func=np.mean):
49
"""Groups transactions by day and compute the daily mean ppg.
50
51
transactions: DataFrame of transactions
52
53
returns: DataFrame of daily prices
54
"""
55
groups = transactions[['date', 'ppg']].groupby('date')
56
daily = groups.aggregate(func)
57
58
daily['date'] = daily.index
59
start = daily.date[0]
60
one_year = np.timedelta64(1, 'Y')
61
daily['years'] = (daily.date - start) / one_year
62
63
return daily
64
65
66
def GroupByQualityAndDay(transactions):
67
"""Divides transactions by quality and computes mean daily price.
68
69
transaction: DataFrame of transactions
70
71
returns: map from quality to time series of ppg
72
"""
73
groups = transactions.groupby('quality')
74
dailies = {}
75
for name, group in groups:
76
dailies[name] = GroupByDay(group)
77
78
return dailies
79
80
81
def PlotDailies(dailies):
82
"""Makes a plot with daily prices for different qualities.
83
84
dailies: map from name to DataFrame
85
"""
86
thinkplot.PrePlot(rows=3)
87
for i, (name, daily) in enumerate(dailies.items()):
88
thinkplot.SubPlot(i+1)
89
title = 'price per gram ($)' if i == 0 else ''
90
thinkplot.Config(ylim=[0, 20], title=title)
91
thinkplot.Scatter(daily.ppg, s=10, label=name)
92
if i == 2:
93
pyplot.xticks(rotation=30)
94
else:
95
thinkplot.Config(xticks=[])
96
97
thinkplot.Save(root='timeseries1',
98
formats=FORMATS)
99
100
101
def RunLinearModel(daily):
102
"""Runs a linear model of prices versus years.
103
104
daily: DataFrame of daily prices
105
106
returns: model, results
107
"""
108
model = smf.ols('ppg ~ years', data=daily)
109
results = model.fit()
110
return model, results
111
112
113
def PlotFittedValues(model, results, label=''):
114
"""Plots original data and fitted values.
115
116
model: StatsModel model object
117
results: StatsModel results object
118
"""
119
years = model.exog[:, 1]
120
values = model.endog
121
thinkplot.Scatter(years, values, s=15, label=label)
122
thinkplot.Plot(years, results.fittedvalues, label='model')
123
124
125
def PlotResiduals(model, results):
126
"""Plots the residuals of a model.
127
128
model: StatsModel model object
129
results: StatsModel results object
130
"""
131
years = model.exog[:, 1]
132
thinkplot.Plot(years, results.resid, linewidth=0.5, alpha=0.5)
133
134
135
def PlotResidualPercentiles(model, results, index=1, num_bins=20):
136
"""Plots percentiles of the residuals.
137
138
model: StatsModel model object
139
results: StatsModel results object
140
index: which exogenous variable to use
141
num_bins: how many bins to divide the x-axis into
142
"""
143
exog = model.exog[:, index]
144
resid = results.resid.values
145
df = pandas.DataFrame(dict(exog=exog, resid=resid))
146
147
bins = np.linspace(np.min(exog), np.max(exog), num_bins)
148
indices = np.digitize(exog, bins)
149
groups = df.groupby(indices)
150
151
means = [group.exog.mean() for _, group in groups][1:-1]
152
cdfs = [thinkstats2.Cdf(group.resid) for _, group in groups][1:-1]
153
154
thinkplot.PrePlot(3)
155
for percent in [75, 50, 25]:
156
percentiles = [cdf.Percentile(percent) for cdf in cdfs]
157
label = '%dth' % percent
158
thinkplot.Plot(means, percentiles, label=label)
159
160
161
def SimulateResults(daily, iters=101, func=RunLinearModel):
162
"""Run simulations based on resampling residuals.
163
164
daily: DataFrame of daily prices
165
iters: number of simulations
166
func: function that fits a model to the data
167
168
returns: list of result objects
169
"""
170
_, results = func(daily)
171
fake = daily.copy()
172
173
result_seq = []
174
for _ in range(iters):
175
fake.ppg = results.fittedvalues + thinkstats2.Resample(results.resid)
176
_, fake_results = func(fake)
177
result_seq.append(fake_results)
178
179
return result_seq
180
181
182
def SimulateIntervals(daily, iters=101, func=RunLinearModel):
183
"""Run simulations based on different subsets of the data.
184
185
daily: DataFrame of daily prices
186
iters: number of simulations
187
func: function that fits a model to the data
188
189
returns: list of result objects
190
"""
191
result_seq = []
192
starts = np.linspace(0, len(daily), iters).astype(int)
193
194
for start in starts[:-2]:
195
subset = daily[start:]
196
_, results = func(subset)
197
fake = subset.copy()
198
199
for _ in range(iters):
200
fake.ppg = (results.fittedvalues +
201
thinkstats2.Resample(results.resid))
202
_, fake_results = func(fake)
203
result_seq.append(fake_results)
204
205
return result_seq
206
207
208
def GeneratePredictions(result_seq, years, add_resid=False):
209
"""Generates an array of predicted values from a list of model results.
210
211
When add_resid is False, predictions represent sampling error only.
212
213
When add_resid is True, they also include residual error (which is
214
more relevant to prediction).
215
216
result_seq: list of model results
217
years: sequence of times (in years) to make predictions for
218
add_resid: boolean, whether to add in resampled residuals
219
220
returns: sequence of predictions
221
"""
222
n = len(years)
223
d = dict(Intercept=np.ones(n), years=years, years2=years**2)
224
predict_df = pandas.DataFrame(d)
225
226
predict_seq = []
227
for fake_results in result_seq:
228
predict = fake_results.predict(predict_df)
229
if add_resid:
230
predict += thinkstats2.Resample(fake_results.resid, n)
231
predict_seq.append(predict)
232
233
return predict_seq
234
235
236
def GenerateSimplePrediction(results, years):
237
"""Generates a simple prediction.
238
239
results: results object
240
years: sequence of times (in years) to make predictions for
241
242
returns: sequence of predicted values
243
"""
244
n = len(years)
245
inter = np.ones(n)
246
d = dict(Intercept=inter, years=years, years2=years**2)
247
predict_df = pandas.DataFrame(d)
248
predict = results.predict(predict_df)
249
return predict
250
251
252
def PlotPredictions(daily, years, iters=101, percent=90, func=RunLinearModel):
253
"""Plots predictions.
254
255
daily: DataFrame of daily prices
256
years: sequence of times (in years) to make predictions for
257
iters: number of simulations
258
percent: what percentile range to show
259
func: function that fits a model to the data
260
"""
261
result_seq = SimulateResults(daily, iters=iters, func=func)
262
p = (100 - percent) / 2
263
percents = p, 100-p
264
265
predict_seq = GeneratePredictions(result_seq, years, add_resid=True)
266
low, high = thinkstats2.PercentileRows(predict_seq, percents)
267
thinkplot.FillBetween(years, low, high, alpha=0.3, color='gray')
268
269
predict_seq = GeneratePredictions(result_seq, years, add_resid=False)
270
low, high = thinkstats2.PercentileRows(predict_seq, percents)
271
thinkplot.FillBetween(years, low, high, alpha=0.5, color='gray')
272
273
274
def PlotIntervals(daily, years, iters=101, percent=90, func=RunLinearModel):
275
"""Plots predictions based on different intervals.
276
277
daily: DataFrame of daily prices
278
years: sequence of times (in years) to make predictions for
279
iters: number of simulations
280
percent: what percentile range to show
281
func: function that fits a model to the data
282
"""
283
result_seq = SimulateIntervals(daily, iters=iters, func=func)
284
p = (100 - percent) / 2
285
percents = p, 100-p
286
287
predict_seq = GeneratePredictions(result_seq, years, add_resid=True)
288
low, high = thinkstats2.PercentileRows(predict_seq, percents)
289
thinkplot.FillBetween(years, low, high, alpha=0.2, color='gray')
290
291
292
def Correlate(dailies):
293
"""Compute the correlation matrix between prices for difference qualities.
294
295
dailies: map from quality to time series of ppg
296
297
returns: correlation matrix
298
"""
299
df = pandas.DataFrame()
300
for name, daily in dailies.items():
301
df[name] = daily.ppg
302
303
return df.corr()
304
305
306
def CorrelateResid(dailies):
307
"""Compute the correlation matrix between residuals.
308
309
dailies: map from quality to time series of ppg
310
311
returns: correlation matrix
312
"""
313
df = pandas.DataFrame()
314
for name, daily in dailies.items():
315
_, results = RunLinearModel(daily)
316
df[name] = results.resid
317
318
return df.corr()
319
320
321
def TestCorrelateResid(dailies, iters=101):
322
"""Tests observed correlations.
323
324
dailies: map from quality to time series of ppg
325
iters: number of simulations
326
"""
327
328
t = []
329
names = ['high', 'medium', 'low']
330
for name in names:
331
daily = dailies[name]
332
t.append(SimulateResults(daily, iters=iters))
333
334
corr = CorrelateResid(dailies)
335
336
arrays = []
337
for result_seq in zip(*t):
338
df = pandas.DataFrame()
339
for name, results in zip(names, result_seq):
340
df[name] = results.resid
341
342
opp_sign = corr * df.corr() < 0
343
arrays.append((opp_sign.astype(int)))
344
345
print(np.sum(arrays))
346
347
348
def RunModels(dailies):
349
"""Runs linear regression for each group in dailies.
350
351
dailies: map from group name to DataFrame
352
"""
353
rows = []
354
for daily in dailies.values():
355
_, results = RunLinearModel(daily)
356
intercept, slope = results.params
357
p1, p2 = results.pvalues
358
r2 = results.rsquared
359
s = r'%0.3f (%0.2g) & %0.3f (%0.2g) & %0.3f \\'
360
row = s % (intercept, p1, slope, p2, r2)
361
rows.append(row)
362
363
# print results in a LaTeX table
364
print(r'\begin{tabular}{|c|c|c|}')
365
print(r'\hline')
366
print(r'intercept & slope & $R^2$ \\ \hline')
367
for row in rows:
368
print(row)
369
print(r'\hline')
370
print(r'\end{tabular}')
371
372
373
def FillMissing(daily, span=30):
374
"""Fills missing values with an exponentially weighted moving average.
375
376
Resulting DataFrame has new columns 'ewma' and 'resid'.
377
378
daily: DataFrame of daily prices
379
span: window size (sort of) passed to ewma
380
381
returns: new DataFrame of daily prices
382
"""
383
dates = pandas.date_range(daily.index.min(), daily.index.max())
384
reindexed = daily.reindex(dates)
385
386
ewma = pandas.ewma(reindexed.ppg, span=span)
387
388
resid = (reindexed.ppg - ewma).dropna()
389
fake_data = ewma + thinkstats2.Resample(resid, len(reindexed))
390
reindexed.ppg.fillna(fake_data, inplace=True)
391
392
reindexed['ewma'] = ewma
393
reindexed['resid'] = reindexed.ppg - ewma
394
return reindexed
395
396
397
def AddWeeklySeasonality(daily):
398
"""Adds a weekly pattern.
399
400
daily: DataFrame of daily prices
401
402
returns: new DataFrame of daily prices
403
"""
404
frisat = (daily.index.dayofweek==4) | (daily.index.dayofweek==5)
405
fake = daily.copy()
406
fake.ppg[frisat] += np.random.uniform(0, 2, frisat.sum())
407
return fake
408
409
410
def PrintSerialCorrelations(dailies):
411
"""Prints a table of correlations with different lags.
412
413
dailies: map from category name to DataFrame of daily prices
414
"""
415
filled_dailies = {}
416
for name, daily in dailies.items():
417
filled_dailies[name] = FillMissing(daily, span=30)
418
419
# print serial correlations for raw price data
420
for name, filled in filled_dailies.items():
421
corr = thinkstats2.SerialCorr(filled.ppg, lag=1)
422
print(name, corr)
423
424
rows = []
425
for lag in [1, 7, 30, 365]:
426
row = [str(lag)]
427
for name, filled in filled_dailies.items():
428
corr = thinkstats2.SerialCorr(filled.resid, lag)
429
row.append('%.2g' % corr)
430
rows.append(row)
431
432
print(r'\begin{tabular}{|c|c|c|c|}')
433
print(r'\hline')
434
print(r'lag & high & medium & low \\ \hline')
435
for row in rows:
436
print(' & '.join(row) + r' \\')
437
print(r'\hline')
438
print(r'\end{tabular}')
439
440
filled = filled_dailies['high']
441
acf = smtsa.acf(filled.resid, nlags=365, unbiased=True)
442
print('%0.3f, %0.3f, %0.3f, %0.3f, %0.3f' %
443
(acf[0], acf[1], acf[7], acf[30], acf[365]))
444
445
446
def SimulateAutocorrelation(daily, iters=1001, nlags=40):
447
"""Resample residuals, compute autocorrelation, and plot percentiles.
448
449
daily: DataFrame
450
iters: number of simulations to run
451
nlags: maximum lags to compute autocorrelation
452
"""
453
# run simulations
454
t = []
455
for _ in range(iters):
456
filled = FillMissing(daily, span=30)
457
resid = thinkstats2.Resample(filled.resid)
458
acf = smtsa.acf(resid, nlags=nlags, unbiased=True)[1:]
459
t.append(np.abs(acf))
460
461
high = thinkstats2.PercentileRows(t, [97.5])[0]
462
low = -high
463
lags = list(range(1, nlags+1))
464
thinkplot.FillBetween(lags, low, high, alpha=0.2, color='gray')
465
466
467
def PlotAutoCorrelation(dailies, nlags=40, add_weekly=False):
468
"""Plots autocorrelation functions.
469
470
dailies: map from category name to DataFrame of daily prices
471
nlags: number of lags to compute
472
add_weekly: boolean, whether to add a simulated weekly pattern
473
"""
474
thinkplot.PrePlot(3)
475
daily = dailies['high']
476
SimulateAutocorrelation(daily)
477
478
for name, daily in dailies.items():
479
480
if add_weekly:
481
daily = AddWeeklySeasonality(daily)
482
483
filled = FillMissing(daily, span=30)
484
485
acf = smtsa.acf(filled.resid, nlags=nlags, unbiased=True)
486
lags = np.arange(len(acf))
487
thinkplot.Plot(lags[1:], acf[1:], label=name)
488
489
490
def MakeAcfPlot(dailies):
491
"""Makes a figure showing autocorrelation functions.
492
493
dailies: map from category name to DataFrame of daily prices
494
"""
495
axis = [0, 41, -0.2, 0.2]
496
497
thinkplot.PrePlot(cols=2)
498
PlotAutoCorrelation(dailies, add_weekly=False)
499
thinkplot.Config(axis=axis,
500
loc='lower right',
501
ylabel='correlation',
502
xlabel='lag (day)')
503
504
thinkplot.SubPlot(2)
505
PlotAutoCorrelation(dailies, add_weekly=True)
506
thinkplot.Save(root='timeseries9',
507
axis=axis,
508
loc='lower right',
509
xlabel='lag (days)',
510
formats=FORMATS)
511
512
513
def PlotRollingMean(daily, name):
514
"""Plots rolling mean and EWMA.
515
516
daily: DataFrame of daily prices
517
"""
518
dates = pandas.date_range(daily.index.min(), daily.index.max())
519
reindexed = daily.reindex(dates)
520
521
thinkplot.PrePlot(cols=2)
522
thinkplot.Scatter(reindexed.ppg, s=15, alpha=0.1, label=name)
523
roll_mean = pandas.rolling_mean(reindexed.ppg, 30)
524
thinkplot.Plot(roll_mean, label='rolling mean')
525
pyplot.xticks(rotation=30)
526
thinkplot.Config(ylabel='price per gram ($)')
527
528
thinkplot.SubPlot(2)
529
thinkplot.Scatter(reindexed.ppg, s=15, alpha=0.1, label=name)
530
ewma = pandas.ewma(reindexed.ppg, span=30)
531
thinkplot.Plot(ewma, label='EWMA')
532
pyplot.xticks(rotation=30)
533
thinkplot.Save(root='timeseries10',
534
formats=FORMATS)
535
536
537
def PlotFilled(daily, name):
538
"""Plots the EWMA and filled data.
539
540
daily: DataFrame of daily prices
541
"""
542
filled = FillMissing(daily, span=30)
543
thinkplot.Scatter(filled.ppg, s=15, alpha=0.3, label=name)
544
thinkplot.Plot(filled.ewma, label='EWMA', alpha=0.4)
545
pyplot.xticks(rotation=30)
546
thinkplot.Save(root='timeseries8',
547
ylabel='price per gram ($)',
548
formats=FORMATS)
549
550
551
def PlotLinearModel(daily, name):
552
"""Plots a linear fit to a sequence of prices, and the residuals.
553
554
daily: DataFrame of daily prices
555
name: string
556
"""
557
model, results = RunLinearModel(daily)
558
PlotFittedValues(model, results, label=name)
559
thinkplot.Save(root='timeseries2',
560
title='fitted values',
561
xlabel='years',
562
xlim=[-0.1, 3.8],
563
ylabel='price per gram ($)',
564
formats=FORMATS)
565
566
PlotResidualPercentiles(model, results)
567
thinkplot.Save(root='timeseries3',
568
title='residuals',
569
xlabel='years',
570
ylabel='price per gram ($)',
571
formats=FORMATS)
572
573
#years = np.linspace(0, 5, 101)
574
#predict = GenerateSimplePrediction(results, years)
575
576
577
def main(name):
578
thinkstats2.RandomSeed(18)
579
transactions = ReadData()
580
581
dailies = GroupByQualityAndDay(transactions)
582
PlotDailies(dailies)
583
RunModels(dailies)
584
PrintSerialCorrelations(dailies)
585
MakeAcfPlot(dailies)
586
587
name = 'high'
588
daily = dailies[name]
589
590
PlotLinearModel(daily, name)
591
PlotRollingMean(daily, name)
592
PlotFilled(daily, name)
593
594
years = np.linspace(0, 5, 101)
595
thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)
596
PlotPredictions(daily, years)
597
xlim = years[0]-0.1, years[-1]+0.1
598
thinkplot.Save(root='timeseries4',
599
title='predictions',
600
xlabel='years',
601
xlim=xlim,
602
ylabel='price per gram ($)',
603
formats=FORMATS)
604
605
name = 'medium'
606
daily = dailies[name]
607
608
thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)
609
PlotIntervals(daily, years)
610
PlotPredictions(daily, years)
611
xlim = years[0]-0.1, years[-1]+0.1
612
thinkplot.Save(root='timeseries5',
613
title='predictions',
614
xlabel='years',
615
xlim=xlim,
616
ylabel='price per gram ($)',
617
formats=FORMATS)
618
619
620
if __name__ == '__main__':
621
import sys
622
main(*sys.argv)
623
624