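"""Download and generate the data sets used in the Dask tutorial.

Run this script from the tutorial's base directory so that the data/
directory is found (restore it with "git checkout data" if necessary).
"""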
import os
import numpy as np
import pandas as pd
from glob import glob
import tarfile
import urllib.request
import zipfile

here = os.path.dirname(__file__)

data_dir = os.path.abspath(os.path.join(here, 'data'))
if not os.path.exists(data_dir):
    raise OSError('data/ directory not found, aborting data preparation. '
                  'Restore it with "git checkout data" from the base '
                  'directory.')


def flights():
    flights_raw = os.path.join(data_dir, 'nycflights.tar.gz')
    flightdir = os.path.join(data_dir, 'nycflights')
    jsondir = os.path.join(data_dir, 'flightjson')

    if not os.path.exists(flights_raw):
        print("- Downloading NYC Flights dataset... ", end='', flush=True)
        url = "https://storage.googleapis.com/dask-tutorial-data/nycflights.tar.gz"
        urllib.request.urlretrieve(url, flights_raw)
        print("done", flush=True)

    if not os.path.exists(flightdir):
        print("- Extracting flight data... ", end='', flush=True)
        with tarfile.open(flights_raw, mode='r:gz') as flights:
            # Extract into data_dir so this works regardless of the current
            # working directory.
            flights.extractall(data_dir)
        print("done", flush=True)

    if not os.path.exists(jsondir):
        print("- Creating json data... ", end='', flush=True)
        os.mkdir(jsondir)
        for path in glob(os.path.join(data_dir, 'nycflights', '*.csv')):
            prefix = os.path.splitext(os.path.basename(path))[0]
            # Just take the first 10000 rows for the demo
            df = pd.read_csv(path).iloc[:10000]
            df.to_json(os.path.join(data_dir, 'flightjson', prefix + '.json'),
                       orient='records', lines=True)
        print("done", flush=True)

    print("** Finished! **")


def random_array():
    if os.path.exists(os.path.join(data_dir, 'random.hdf5')):
        return

    print("Create random data for array exercise")
    import h5py

    # Write one billion float32 samples in one-million-element blocks so the
    # whole array never has to fit in memory at once.
    with h5py.File(os.path.join(data_dir, 'random.hdf5'), mode='w') as f:
        dset = f.create_dataset('/x', shape=(1000000000,), dtype='f4')
        for i in range(0, 1000000000, 1000000):
            dset[i: i + 1000000] = np.random.exponential(size=1000000)


def accounts_csvs(num_files, n, k):
    from accounts import account_entries, account_params
    fn = os.path.join(data_dir, 'accounts.%d.csv' % (num_files - 1))

    if os.path.exists(fn):
        return

    print("Create CSV accounts for dataframe exercise")

    args = account_params(k)

    for i in range(num_files):
        df = account_entries(n, *args)
        df.to_csv(os.path.join(data_dir, 'accounts.%d.csv' % i),
                  index=False)


def accounts_json(num_files, n, k):
    from accounts import account_params, json_entries
    import json
    import gzip
    fn = os.path.join(data_dir, 'accounts.%02d.json.gz' % (num_files - 1))
    if os.path.exists(fn):
        return

    print("Create JSON accounts for bag exercise")

    args = account_params(k)

    for i in range(num_files):
        seq = json_entries(n, *args)
        fn = os.path.join(data_dir, 'accounts.%02d.json.gz' % i)
        with gzip.open(fn, 'wb') as f:
            f.write(os.linesep.join(map(json.dumps, seq)).encode())


def create_weather(growth=32):
    filenames = sorted(glob(os.path.join(data_dir, 'weather-small', '*.hdf5')))

    if not filenames:
        ws_dir = os.path.join(data_dir, 'weather-small')
        raise ValueError('Did not find any hdf5 files in {}'.format(ws_dir))

    if not os.path.exists(os.path.join(data_dir, 'weather-big')):
        os.mkdir(os.path.join(data_dir, 'weather-big'))

    if all(os.path.exists(fn.replace('small', 'big')) for fn in filenames):
        return

    from skimage.transform import resize
    import h5py

    print('Exploding weather data')
    for fn in filenames:
        with h5py.File(fn, mode='r') as f:
            x = f['/t2m'][:]

        # Upsample the temperature grid by `growth` along each axis.
        y = resize(x, (x.shape[0] * growth, x.shape[1] * growth), mode='constant')

        out_fn = os.path.join(data_dir, 'weather-big', os.path.split(fn)[-1])

        try:
            with h5py.File(out_fn, mode='a') as f:
                f.create_dataset('/t2m', data=y, chunks=(500, 500))
        except Exception:
            # The output dataset already exists; skip it.
            pass


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Downloads, generates and prepares data for the Dask tutorial.')
    parser.add_argument('--no-ssl-verify', dest='no_ssl_verify', action='store_true',
                        default=False, help='Disables SSL verification.')

    args = parser.parse_args()

    if args.no_ssl_verify:
        print("- Disabling SSL Verification... ", end='', flush=True)
        import ssl
        ssl._create_default_https_context = ssl._create_unverified_context
        print("done", flush=True)

    random_array()
    create_weather()
    accounts_csvs(3, 1000000, 500)
    accounts_json(50, 100000, 500)
    flights()