import os
import numpy as np
import pandas as pd
from glob import glob
import tarfile
import urllib.request
import zipfile
# Resolve the tutorial's data directory relative to this file so the script
# works no matter which directory it is launched from.
here = os.path.dirname(__file__)
data_dir = os.path.abspath(os.path.join(here, 'data'))

# Every preparation step writes into data/; bail out early if it is missing.
if not os.path.exists(data_dir):
    raise OSError(
        'data/ directory not found, aborting data preparation. '
        'Restore it with "git checkout data" from the base '
        'directory.'
    )
def flights():
    """Download, extract, and JSON-ify the NYC flights dataset.

    Three idempotent steps, each skipped when its output already exists:

    1. Download ``nycflights.tar.gz`` into ``data_dir``.
    2. Extract the archive into ``data_dir`` (producing ``nycflights/``).
    3. Write the first 10,000 rows of each CSV as line-delimited JSON
       under ``flightjson/``.
    """
    flights_raw = os.path.join(data_dir, 'nycflights.tar.gz')
    flightdir = os.path.join(data_dir, 'nycflights')
    jsondir = os.path.join(data_dir, 'flightjson')

    if not os.path.exists(flights_raw):
        print("- Downloading NYC Flights dataset... ", end='', flush=True)
        url = "https://storage.googleapis.com/dask-tutorial-data/nycflights.tar.gz"
        urllib.request.urlretrieve(url, flights_raw)
        print("done", flush=True)

    if not os.path.exists(flightdir):
        print("- Extracting flight data... ", end='', flush=True)
        # Bug fix: extract into the absolute data_dir, not the CWD-relative
        # 'data/' the original used — that broke the script when it was run
        # from any directory other than the repository root.  Also avoid
        # shadowing this function's name with the tarfile handle.
        with tarfile.open(flights_raw, mode='r:gz') as tar:
            tar.extractall(data_dir)
        print("done", flush=True)

    if not os.path.exists(jsondir):
        print("- Creating json data... ", end='', flush=True)
        os.mkdir(jsondir)
        for path in glob(os.path.join(flightdir, '*.csv')):
            prefix = os.path.splitext(os.path.basename(path))[0]
            # Only the first 10,000 rows per file are needed for the exercise.
            df = pd.read_csv(path).iloc[:10000]
            df.to_json(os.path.join(jsondir, prefix + '.json'),
                       orient='records', lines=True)
        print("done", flush=True)

    print("** Finished! **")
def random_array():
    """Create a 1-billion-element float32 exponential array in random.hdf5.

    No-op if ``data/random.hdf5`` already exists.  The dataset is filled in
    one-million-element chunks to keep peak memory usage small.
    """
    path = os.path.join(data_dir, 'random.hdf5')
    if os.path.exists(path):
        return

    print("Create random data for array exercise")
    import h5py

    total = 1000000000
    chunk = 1000000
    # Bug fix: h5py >= 3.0 no longer defaults the file mode to append —
    # the default is read-only, so the original h5py.File(path) call failed
    # on a file that does not exist yet.  'a' creates it if needed.
    with h5py.File(path, mode='a') as f:
        dset = f.create_dataset('/x', shape=(total,), dtype='f4')
        for i in range(0, total, chunk):
            dset[i: i + chunk] = np.random.exponential(size=chunk)
def accounts_csvs(num_files, n, k):
    """Write ``num_files`` CSV files of synthetic account data.

    Each file ``accounts.<i>.csv`` holds ``n`` entries generated from
    ``k`` account parameters.  Skips all work when the final output file
    already exists.
    """
    from accounts import account_entries, account_params

    last_fn = os.path.join(data_dir, 'accounts.%d.csv' % (num_files - 1))
    if os.path.exists(last_fn):
        return

    print("Create CSV accounts for dataframe exercise")
    params = account_params(k)
    for idx in range(num_files):
        frame = account_entries(n, *params)
        out_fn = os.path.join(data_dir, 'accounts.%d.csv' % idx)
        frame.to_csv(out_fn, index=False)
def accounts_json(num_files, n, k):
    """Write ``num_files`` gzipped line-delimited JSON account files.

    Each file ``accounts.<ii>.json.gz`` holds ``n`` records generated from
    ``k`` account parameters.  Skips all work when the final output file
    already exists.
    """
    from accounts import account_params, json_entries
    import gzip
    import json

    last_fn = os.path.join(data_dir, 'accounts.%02d.json.gz' % (num_files - 1))
    if os.path.exists(last_fn):
        return

    print("Create JSON accounts for bag exercise")
    params = account_params(k)
    for idx in range(num_files):
        records = json_entries(n, *params)
        payload = os.linesep.join(json.dumps(rec) for rec in records)
        out_fn = os.path.join(data_dir, 'accounts.%02d.json.gz' % idx)
        with gzip.open(out_fn, 'wb') as f:
            f.write(payload.encode())
def create_weather(growth=32):
    """Upsample the small weather HDF5 files into ``weather-big/``.

    Reads each ``/t2m`` dataset from ``weather-small/*.hdf5``, resizes it
    by ``growth`` along both axes, and writes the result to a same-named
    file in ``weather-big/``.  No-op once every output file exists.

    Parameters
    ----------
    growth : int, default 32
        Linear upscaling factor applied to each axis of the 2-D field.

    Raises
    ------
    ValueError
        If no input hdf5 files are found in ``weather-small/``.
    """
    filenames = sorted(glob(os.path.join(data_dir, 'weather-small', '*.hdf5')))
    if not filenames:
        ws_dir = os.path.join(data_dir, 'weather-small')
        raise ValueError('Did not find any hdf5 files in {}'.format(ws_dir))

    big_dir = os.path.join(data_dir, 'weather-big')
    if not os.path.exists(big_dir):
        os.mkdir(big_dir)

    if all(os.path.exists(fn.replace('small', 'big')) for fn in filenames):
        return

    from skimage.transform import resize
    import h5py

    print('Exploding weather data')
    for fn in filenames:
        with h5py.File(fn, mode='r') as f:
            x = f['/t2m'][:]

        # Bug fix: the original hard-coded 32 here, silently ignoring the
        # ``growth`` parameter.
        y = resize(x, (x.shape[0] * growth, x.shape[1] * growth),
                   mode='constant')

        out_fn = os.path.join(big_dir, os.path.split(fn)[-1])
        try:
            # 'a' mode: h5py >= 3 has no default file mode, so the old
            # bare h5py.File(out_fn) call failed for new files.
            with h5py.File(out_fn, mode='a') as f:
                f.create_dataset('/t2m', data=y, chunks=(500, 500))
        except (OSError, RuntimeError, ValueError):
            # Best effort: a dataset that already exists (partial previous
            # run) raises here — skip it rather than abort the whole run.
            # Narrowed from a bare ``except`` that also swallowed
            # KeyboardInterrupt and genuine programming errors.
            pass
if __name__ == '__main__':
    # Command-line entry point: run every preparation step in sequence.
    import argparse

    parser = argparse.ArgumentParser(
        description='Downloads, generates and prepares data for the Dask tutorial.')
    parser.add_argument('--no-ssl-verify', dest='no_ssl_verify',
                        action='store_true', default=False,
                        help='Disables SSL verification.')
    args = parser.parse_args()

    if args.no_ssl_verify:
        print("- Disabling SSL Verification... ", end='', flush=True)
        import ssl
        # Monkey-patch urllib's default HTTPS context to skip verification
        # (useful behind intercepting corporate proxies).
        ssl._create_default_https_context = ssl._create_unverified_context
        print("done", flush=True)

    random_array()
    create_weather()
    accounts_csvs(3, 1000000, 500)
    accounts_json(50, 100000, 500)
    flights()