Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
| Download

📚 The CoCalc Library - books, templates and other resources

Views: 96144
License: OTHER
1
import os
2
import gzip
3
import shutil
4
import struct
5
import urllib
6
7
# Set before the tensorflow import further down so the setting is already in
# the environment when TensorFlow loads.
# NOTE(review): level '2' presumably silences TensorFlow's C++ INFO and
# WARNING log output -- confirm against the TensorFlow version in use.
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
8
9
from matplotlib import pyplot as plt
10
import numpy as np
11
import tensorflow as tf
12
13
def huber_loss(labels, predictions, delta=14.0):
    """Compute the element-wise Huber loss between labels and predictions.

    With r = |labels - predictions| the loss is 0.5 * r**2 where r < delta
    and delta * r - 0.5 * delta**2 everywhere else.

    Args:
        labels: ground-truth value(s); a scalar or a tensor.
        predictions: predicted value(s), compatible with ``labels`` for
            subtraction.
        delta: threshold at which the loss switches from quadratic to linear.

    Returns:
        A tensor shaped like ``labels - predictions`` holding the loss.
    """
    residual = tf.abs(labels - predictions)
    quadratic = 0.5 * tf.square(residual)
    # Plain arithmetic on delta (instead of tf.square) keeps integer deltas
    # from being turned into an int tensor that cannot be scaled by 0.5.
    linear = delta * residual - 0.5 * delta * delta
    # tf.cond only accepts a scalar predicate and therefore fails on batched
    # input; tf.where selects per element and still handles the scalar case.
    return tf.where(residual < delta, quadratic, linear)
18
19
def safe_mkdir(path):
    """Create directory ``path`` (and any missing parents) if it is absent.

    Calling this on a directory that already exists is a no-op.

    Args:
        path: directory to create; may be nested, e.g. ``'data/mnist'``.

    Raises:
        OSError: if the directory cannot be created for a reason other than
            the path already existing (e.g. insufficient permissions).
    """
    try:
        # os.mkdir cannot create intermediate directories, and the previous
        # blanket ``except OSError`` hid that failure for nested paths.
        os.makedirs(path, exist_ok=True)
    except FileExistsError:
        # The path exists but is not a directory; stay quiet as before and
        # let the caller's subsequent file operation report the problem.
        pass
25
26
def read_birth_life_data(filename):
    """Read a tab-separated birth-rate / life-expectancy table.

    The first line is treated as a header and skipped. For every remaining
    non-blank line, the second and third tab-separated fields are parsed as
    floats (birth rate and life expectancy respectively).

    Args:
        filename: path to the text file (e.g. ``birth_life_2010.txt``).

    Returns:
        A tuple ``(data, n_samples)`` where ``data`` is a float32 NumPy array
        with one ``(birth, life)`` row per sample and ``n_samples`` is the
        number of rows.
    """
    # The context manager closes the handle, which the bare open() did not.
    with open(filename, 'r') as f:
        rows = f.readlines()[1:]

    pairs = []
    for row in rows:
        # Strip only the line terminator; slicing off the last character
        # corrupted the final value when the file had no trailing newline.
        row = row.rstrip('\r\n')
        if not row.strip():
            continue
        fields = row.split('\t')
        pairs.append((float(fields[1]), float(fields[2])))

    data = np.asarray(pairs, dtype=np.float32)
    return data, len(pairs)
40
41
def download_one_file(download_url,
                      local_dest,
                      expected_byte=None,
                      unzip_and_remove=False):
    """Download ``download_url`` to ``local_dest`` unless it is already there.

    The download is skipped when either ``local_dest`` or its unzipped
    counterpart (``local_dest`` without its last three characters) exists.

    Args:
        download_url: URL to fetch.
        local_dest: path the downloaded file is written to.
        expected_byte: if given, the expected size of the download in bytes.
            On a mismatch a message is printed and the file is left as-is.
        unzip_and_remove: if True, gunzip the download next to itself and
            delete the archive. Assumes ``local_dest`` ends in ``'.gz'``.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly where it is used.
    import urllib.request

    unzipped_dest = local_dest[:-3]
    if os.path.exists(local_dest) or os.path.exists(unzipped_dest):
        print('%s already exists' %local_dest)
        return

    print('Downloading %s' %download_url)
    urllib.request.urlretrieve(download_url, local_dest)

    if expected_byte:
        if os.stat(local_dest).st_size != expected_byte:
            print('The downloaded file has unexpected number of bytes')
            return
        print('Successfully downloaded %s' %local_dest)

    # Previously the unzip step only ran when expected_byte was supplied, so
    # unzip_and_remove=True was silently ignored without a size check.
    if unzip_and_remove:
        with gzip.open(local_dest, 'rb') as f_in, open(unzipped_dest, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.remove(local_dest)
67
68
def download_mnist(path):
    """Download and unzip the four MNIST archives into ``path``.

    Files are fetched from http://yann.lecun.com/exdb/mnist; any file that is
    already present (zipped or unzipped) is skipped by download_one_file.

    Args:
        path: directory that receives the dataset; created if missing.
    """
    safe_mkdir(path)
    url = 'http://yann.lecun.com/exdb/mnist'
    filenames = ['train-images-idx3-ubyte.gz',
                 'train-labels-idx1-ubyte.gz',
                 't10k-images-idx3-ubyte.gz',
                 't10k-labels-idx1-ubyte.gz']
    expected_bytes = [9912422, 28881, 1648877, 4542]

    for filename, byte in zip(filenames, expected_bytes):
        # URLs always use '/'; os.path.join would insert a backslash on
        # Windows and produce an invalid address.
        download_url = url + '/' + filename
        local_dest = os.path.join(path, filename)
        download_one_file(download_url, local_dest, byte, True)
85
86
def parse_data(path, dataset, flatten):
    """Load one split of an idx-format image/label pair from ``path``.

    Args:
        path: directory holding ``<dataset>-labels-idx1-ubyte`` and
            ``<dataset>-images-idx3-ubyte``.
        dataset: either ``'train'`` or ``'t10k'``.
        flatten: if truthy, each image is reshaped to a single row.

    Returns:
        ``(imgs, new_labels)``: float32 images scaled by 1/255, and a
        ``(num, 10)`` one-hot label matrix.

    Raises:
        NameError: if ``dataset`` is neither ``'train'`` nor ``'t10k'``.
    """
    if dataset not in ('train', 't10k'):
        raise NameError('dataset must be train or t10k')

    # Labels: 8-byte big-endian header (magic, count) followed by raw bytes.
    label_path = os.path.join(path, '%s-labels-idx1-ubyte' % dataset)
    with open(label_path, 'rb') as label_fh:
        header = label_fh.read(8)
        _, num = struct.unpack(">II", header)
        labels = np.fromfile(label_fh, dtype=np.int8)
    new_labels = np.zeros((num, 10))
    row_index = np.arange(num)
    new_labels[row_index, labels] = 1

    # Images: 16-byte big-endian header (magic, count, rows, cols) + pixels.
    image_path = os.path.join(path, '%s-images-idx3-ubyte' % dataset)
    with open(image_path, 'rb') as image_fh:
        header = image_fh.read(16)
        _, num, rows, cols = struct.unpack(">IIII", header)
        raw = np.fromfile(image_fh, dtype=np.uint8)
    imgs = raw.reshape(num, rows, cols).astype(np.float32) / 255.0
    if flatten:
        imgs = imgs.reshape([num, -1])

    return imgs, new_labels
106
107
def read_mnist(path, flatten=True, num_train=55000):
    """Load MNIST from ``path`` and split the training file into train/val.

    The 'train' split is shuffled with a random permutation; the first
    ``num_train`` samples become the training set and the remainder the
    validation set. The 't10k' split is returned untouched as the test set.

    Returns three tuples of NumPy arrays:
        (train_imgs, train_labels), (val_imgs, val_labels),
        (test_imgs, test_labels)
    """
    all_imgs, all_labels = parse_data(path, 'train', flatten)

    order = np.random.permutation(all_labels.shape[0])
    train_idx = order[:num_train]
    val_idx = order[num_train:]

    train_split = (all_imgs[train_idx, :], all_labels[train_idx, :])
    val_split = (all_imgs[val_idx, :], all_labels[val_idx, :])
    test_split = parse_data(path, 't10k', flatten)

    return train_split, val_split, test_split
120
121
def get_mnist_dataset(batch_size):
    """Build batched tf.data datasets for the MNIST train and test splits.

    Downloads MNIST into 'data/mnist' if needed and reads it unflattened.
    The validation split produced by read_mnist is not used here.

    Args:
        batch_size: number of samples per batch for both datasets.

    Returns:
        ``(train_data, test_data)``; the training dataset is shuffled with a
        buffer of 10000 before batching, the test dataset is only batched.
    """
    # Fetch the files (if missing) and load them into memory.
    mnist_folder = 'data/mnist'
    download_mnist(mnist_folder)
    train, _, test = read_mnist(mnist_folder, flatten=False)

    # Wrap the in-memory arrays as datasets: shuffle + batch for training,
    # batch only for testing.
    train_data = (tf.data.Dataset.from_tensor_slices(train)
                  .shuffle(10000)
                  .batch(batch_size))
    test_data = tf.data.Dataset.from_tensor_slices(test).batch(batch_size)

    return train_data, test_data
136
137
def show(image):
    """Display ``image`` (a 2D array of pixel data) in grayscale."""
    plt.imshow(image, cmap='gray')
    plt.show()
143