Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
| Download

📚 The CoCalc Library - books, templates and other resources

Views: 96144
License: OTHER
1
from collections import Counter
2
import random
3
import os
4
import sys
5
sys.path.append('..')
6
import zipfile
7
8
import numpy as np
9
from six.moves import urllib
10
import tensorflow as tf
11
12
import utils
13
14
def read_data(file_path):
    """Load a zipped text corpus and return it as a list of tokens.

    Only the first member of the archive at `file_path` is read. Its bytes
    are converted to a native string and split on whitespace.

    The original author's note: there should be 17,005,207 tokens.
    """
    with zipfile.ZipFile(file_path) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
        text = tf.compat.as_str(raw_bytes)
    return text.split()
21
22
def build_vocab(words, vocab_size, visual_fld):
    """Build the vocabulary of the `vocab_size` most frequent words.

    Index 0 is reserved for the placeholder token 'UNK'; the remaining
    `vocab_size - 1` entries are the most frequent items of `words`, in
    descending frequency order. The vocabulary is also written, one word per
    line in index order, to `<visual_fld>/vocab.tsv`.

    Args:
        words: iterable of hashable word tokens (counted with `Counter`).
        vocab_size: total number of vocabulary entries, including 'UNK'.
        visual_fld: directory that receives `vocab.tsv`; it is passed to
            `utils.safe_mkdir` first (presumably creating it if missing --
            confirm against `utils`).

    Returns:
        A `(dictionary, index_dictionary)` tuple, where `dictionary` maps
        word -> index and `index_dictionary` maps index -> word.
    """
    utils.safe_mkdir(visual_fld)

    vocab = ['UNK']
    vocab.extend(word for word, _ in Counter(words).most_common(vocab_size - 1))

    dictionary = {}
    # The context manager guarantees the file is closed even if a write fails.
    with open(os.path.join(visual_fld, 'vocab.tsv'), 'w') as vocab_file:
        for index, word in enumerate(vocab):
            dictionary[word] = index
            vocab_file.write(word + '\n')

    index_dictionary = {index: word for word, index in dictionary.items()}
    return dictionary, index_dictionary
42
43
def convert_words_to_index(words, dictionary):
    """Map every word in `words` to its vocabulary index.

    Words that are missing from `dictionary` are mapped to index 0.
    """
    indices = []
    for token in words:
        indices.append(dictionary.get(token, 0))
    return indices
46
47
def generate_sample(index_words, context_window_size):
    """Yield (center, target) training pairs following the skip-gram model.

    For each position a window radius is drawn uniformly from
    [1, context_window_size]. Every word within that radius before the
    center is yielded first, followed by every word within it after the
    center.
    """
    for position, center in enumerate(index_words):
        radius = random.randint(1, context_window_size)
        window_start = max(0, position - radius)
        window_stop = position + radius + 1

        # Context words that precede the center word.
        for target in index_words[window_start:position]:
            yield center, target

        # Context words that follow the center word.
        for target in index_words[position + 1:window_stop]:
            yield center, target
57
58
def most_common_words(visual_fld, num_visualize):
    """Write the first `num_visualize` vocabulary entries to their own file.

    Reads `<visual_fld>/vocab.tsv` and copies its first `num_visualize`
    lines unchanged to `<visual_fld>/vocab_<num_visualize>.tsv`. The original
    author's note says this list is for visualizing on TensorBoard.

    Args:
        visual_fld: directory containing `vocab.tsv`; the output file is
            written to the same directory.
        num_visualize: number of leading lines to keep.
    """
    source_path = os.path.join(visual_fld, 'vocab.tsv')
    target_path = os.path.join(visual_fld, 'vocab_' + str(num_visualize) + '.tsv')

    # Context managers close both files even if reading or writing fails.
    with open(source_path, 'r') as source:
        top_lines = source.readlines()[:num_visualize]

    with open(target_path, 'w') as target:
        target.writelines(top_lines)
68
69
def batch_gen(download_url, expected_byte, vocab_size, batch_size,
              skip_window, visual_fld, local_dest='data/text8.zip'):
    """Yield (center_batch, target_batch) skip-gram training batches.

    The corpus archive is fetched through `utils.download_one_file`, read
    into tokens, converted to vocabulary indices, and then streamed as
    skip-gram pairs grouped into fixed-size batches.

    Args:
        download_url: passed to `utils.download_one_file` as the source.
        expected_byte: passed to `utils.download_one_file` (presumably the
            expected archive size used for verification -- confirm against
            `utils`).
        vocab_size: number of vocabulary entries, including 'UNK'.
        batch_size: number of (center, target) pairs per batch.
        skip_window: maximum context radius handed to `generate_sample`.
        visual_fld: directory where `build_vocab` writes `vocab.tsv`.
        local_dest: path of the local zip archive. Defaults to the
            previously hard-coded 'data/text8.zip'.

    Yields:
        `(center_batch, target_batch)`: `center_batch` is an int32 array of
        shape `(batch_size,)`; `target_batch` is an array of shape
        `(batch_size, 1)` using NumPy's default dtype.

    Generation ends when the sample stream is exhausted; a final batch that
    cannot be completely filled is dropped.
    """
    utils.download_one_file(download_url, local_dest, expected_byte)
    words = read_data(local_dest)
    dictionary, _ = build_vocab(words, vocab_size, visual_fld)
    index_words = convert_words_to_index(words, dictionary)
    del words  # to save memory
    single_gen = generate_sample(index_words, skip_window)

    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1])
        try:
            for index in range(batch_size):
                center_batch[index], target_batch[index] = next(single_gen)
        except StopIteration:
            # Under PEP 479 a StopIteration escaping a generator body is
            # turned into RuntimeError, so end this generator explicitly.
            return
        yield center_batch, target_batch
85
86