CoCalc Public Filesdeep-learning / sentiment-network / Sentiment_Classification_Projects.ipynb
Authors: William A. Stein, g f
Views : 247
Description: course sentiment classification
Compute Environment: Ubuntu 18.04 (Deprecated)
In [ ]:
In [2]:
import numpy as np import math rv = ['this movie is good', 'this movie is bad'] print('rv', rv) lb = ['pos','neg'] print ('lb', lb) seed = np.random.seed(1) print('seed', seed) voc = [] for item in rv: for word in item.split(' '): voc.append(word) voc_set = list(set(voc)) print('vocset', voc_set) windex = {} for idx, item in enumerate(voc_set): windex[item] = idx lab = [] for item in lb: for word in item.split(' '): lab.append(word) lab_set = list(set(lab)) print('labset', lab_set) labledex = {} for idx, item in enumerate(lab_set): labledex[item] = idx layer = np.zeros((1,len(voc_set))) def update_layer(rv): global layer layer *= 0 #print('layer', layer) for word in rv.split(' '): print("word", word) print('index', layer[0][windex[word]]) layer[0][windex[word]] =+ 1 #print(layer) hid_nod = [] update_layer(rv[1]) input_hid_w = np.zeros((len(voc_set), len(hid_nod))) #input_hid_weights = np.zeros([0,0,0,0,0]) hidden_output_weights = np.array([1,1,2,2,2]) output_node = np.array([hid_nod, hidden_output_weights]) output = 1 / (1 + math.exp(-output_node)) print (output) #forward pass """self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes)) # These are the weights between the hidden layer and the output layer. self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, (self.hidden_nodes, self.output_nodes)) # The input layer, a two-dimensional matrix with shape 1 x input_nodes self.layer_0 = np.zeros((1,input_nodes))"""
('rv', ['this movie is good', 'this movie is bad']) ('lb', ['pos', 'neg']) ('seed', None) ('vocset', ['this', 'movie', 'is', 'good', 'bad']) ('labset', ['neg', 'pos']) ('word', 'this') ('index', 0.0) ('word', 'movie') ('index', 0.0) ('word', 'is') ('index', 0.0) ('word', 'bad') ('index', 0.0)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-2-4c13531e2ad1> in <module>() 59 output_node = np.array([hid_nod, hidden_output_weights]) 60 ---> 61 output = Integer(1) / (Integer(1) + math.exp(-output_node)) 62 63 print (output) TypeError: bad operand type for unary -: 'list'

# Sentiment Classification & How To "Frame Problems" for a Neural Network

### What You Should Already Know

• neural networks, forward and back-propagation
• mean squared error
• and train/test splits

### Where to Get Help if You Need it

• Re-watch previous Udacity Lectures
• Leverage the recommended Course Reading Material - Grokking Deep Learning (Check inside your classroom for a discount code)
• Shoot me a tweet @iamtrask

# Lesson: Curate a Dataset

The cells from here until Project 1 include code Andrew shows in the videos leading up to mini project 1. We've included them so you can run the code along with the videos without having to type in everything.

In [ ]:
In [1]:
def pretty_print_review_and_label(i): print(labels[i] + "\t:\t" + reviews[i][:80] + "...") g = open('reviews.txt','r') # What we know! reviews = list(map(lambda x:x[:-1],g.readlines())) g.close() g = open('labels.txt','r') # What we WANT to know! labels = list(map(lambda x:x[:-1].upper(),g.readlines())) g.close()
In [2]:
25000 []

Note: The data in reviews.txt we're using has already been preprocessed a bit and contains only lower case characters. If we were working from raw data, where we didn't know it was all lower case, we would want to add a step here to convert it. That's so we treat different variations of the same word, like The, the, and THE, all the same way.

In [3]:
items = [1, 2, 3, 4, 5] y = lambda a: a + 10, y(5) #print(y) squared = list(map(lambda x: x**2, items)) print(squared)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-3-5ba910772d99> in <module>() 1 items = [Integer(1), Integer(2), Integer(3), Integer(4), Integer(5)] ----> 2 y = lambda a: a + Integer(10), y(Integer(5)) 3 4 #print(y) 5 NameError: name 'y' is not defined
In [ ]:
In [ ]:
len(reviews)
In [ ]:
reviews[0]
In [4]:
labels[0]
'POSITIVE'

# Lesson: Develop a Predictive Theory

In [ ]:
print("labels.txt \t : \t reviews.txt\n") pretty_print_review_and_label(2137) pretty_print_review_and_label(12816) pretty_print_review_and_label(6267) pretty_print_review_and_label(21934) pretty_print_review_and_label(5297) pretty_print_review_and_label(4998)

# Project 1: Quick Theory Validation

There are multiple ways to implement these projects, but in order to get your code closer to what Andrew shows in his solutions, we've provided some hints and starter code throughout this notebook.

You'll find the Counter class to be useful in this exercise, as well as the numpy library.

In [16]:
from collections import Counter import numpy as np

We'll create three Counter objects, one for words from postive reviews, one for words from negative reviews, and one for all the words.

In [17]:
# Create three Counter objects to store positive, negative and total counts positive_counts = Counter() negative_counts = Counter() total_counts = Counter()
In [ ]:
x = [1,2,2,3,3,3] Counter(x)

TODO: Examine all the reviews. For each word in a positive review, increase the count for that word in both your positive counter and the total words counter; likewise, for each word in a negative review, increase the count for that word in both your negative counter and the total words counter.

Note: Throughout these projects, you should use split(' ') to divide a piece of text (such as a review) into individual words. If you use split() instead, you'll get slightly different results than what the videos and solutions show.

In [ ]:
# TODO: Loop over all the words in all the reviews and increment the counts in the appropriate counter objects my_string = 'this is a string' #print(my_string.split()) #for i in my_string: # print (i) #for i in reviews[0:1], labels[0:1]: # print(i) #print(Counter(labels)) #print(labels[0:5]) #for word in labels[0:5]: # print (word) #for word in labels[0:2]: # print(Counter(word)) #print(total_counts) x = [1,2,3,4] y = ['a','b','c','d'] z = ['p', 'p', 'n'] for i in z: print (i == 'p') for i in x: print (y[i-1])
In [ ]:
# TODO: Loop over all the words in all the reviews and increment the counts in the appropriate counter objects bag = [] #print(Counter(reviews[1].split())) #for review in reviews: # for i in review.split(): # bag.append(i) print(labels[2] == 'POSITIVE') print(abels)) #for i in labels[0:1]: #print (labels) #print(Counter(bag))
In [ ]:
c=dict(zip(reviews, labels)) count = Counter() for key, values in c.items(): if values == 'POSITIVE': positive_counts += Counter(key.split(' ')) if values == 'NEGATIVE': negative_counts += Counter(key.split(' ')) total_counts = positive_counts + negative_counts print(total_counts)

Run the following two cells to list the words used in positive reviews and negative reviews, respectively, ordered from most to least commonly used.

In [ ]:
# Examine the counts of the most common words in positive reviews positive_counts.ratios.most_common()
In [ ]:
# Examine the counts of the most common words in negative reviews negative_counts.most_common()

As you can see, common words like "the" appear very often in both positive and negative reviews. Instead of finding the most common words in positive or negative reviews, what you really want are the words found in positive reviews more often than in negative reviews, and vice versa. To accomplish this, you'll need to calculate the ratios of word usage between positive and negative reviews.

TODO: Check all the words you've seen and calculate the ratio of postive to negative uses and store that ratio in pos_neg_ratios.

Hint: the positive-to-negative ratio for a given word can be calculated with positive_counts[word] / float(negative_counts[word]+1). Notice the +1 in the denominator – that ensures we don't divide by zero for words that are only seen in positive reviews.

In [15]:
# Create Counter object to store positive/negative ratios pos_neg_ratios = Counter() # TODO: Calculate the ratios of positive and negative uses of the most common words # Consider words to be "common" if they've been used at least 100 times
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-15-c13763d7f9e3> in <module>() 1 # Create Counter object to store positive/negative ratios ----> 2 pos_neg_ratios = Counter() 3 4 # TODO: Calculate the ratios of positive and negative uses of the most common words 5 # Consider words to be "common" if they've been used at least 100 times NameError: name 'Counter' is not defined

Examine the ratios you've calculated for a few words:

In [ ]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"])) print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"])) print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

Looking closely at the values you just calculated, we see the following:

• Words that you would expect to see more often in positive reviews – like "amazing" – have a ratio greater than 1. The more skewed a word is toward postive, the farther from 1 its positive-to-negative ratio will be.
• Words that you would expect to see more often in negative reviews – like "terrible" – have positive values that are less than 1. The more skewed a word is toward negative, the closer to zero its positive-to-negative ratio will be.
• Neutral words, which don't really convey any sentiment because you would expect to see them in all sorts of reviews – like "the" – have values very close to 1. A perfectly neutral word – one that was used in exactly the same number of positive reviews as negative reviews – would be almost exactly 1. The +1 we suggested you add to the denominator slightly biases words toward negative, but it won't matter because it will be a tiny bias and later we'll be ignoring words that are too close to neutral anyway.

Ok, the ratios tell us which words are used more often in postive or negative reviews, but the specific values we've calculated are a bit difficult to work with. A very positive word like "amazing" has a value above 4, whereas a very negative word like "terrible" has a value around 0.18. Those values aren't easy to compare for a couple of reasons:

• Right now, 1 is considered neutral, but the absolute value of the postive-to-negative rations of very postive words is larger than the absolute value of the ratios for the very negative words. So there is no way to directly compare two numbers and see if one word conveys the same magnitude of positive sentiment as another word conveys negative sentiment. So we should center all the values around netural so the absolute value fro neutral of the postive-to-negative ratio for a word would indicate how much sentiment (positive or negative) that word conveys.
• When comparing absolute values it's easier to do that around zero than one.

To fix these issues, we'll convert all of our ratios to new values using logarithms.

TODO: Go through all the ratios you calculated and convert them to logarithms. (i.e. use np.log(ratio))

In the end, extremely positive and extremely negative words will have positive-to-negative ratios with similar magnitudes but opposite signs.

In [ ]:
# TODO: Convert ratios to logs

Examine the new ratios you've calculated for the same words from before:

In [ ]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"])) print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"])) print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

If everything worked, now you should see neutral words with values close to zero. In this case, "the" is near zero but slightly positive, so it was probably used in more positive reviews than negative reviews. But look at "amazing"'s ratio - it's above 1, showing it is clearly a word with positive sentiment. And "terrible" has a similar score, but in the opposite direction, so it's below -1. It's now clear that both of these words are associated with specific, opposing sentiments.

Now run the following cells to see more ratios.

The first cell displays all the words, ordered by how associated they are with postive reviews. (Your notebook will most likely truncate the output so you won't actually see all the words in the list.)

The second cell displays the 30 words most associated with negative reviews by reversing the order of the first list and then looking at the first 30 words. (If you want the second cell to display all the words, ordered by how associated they are with negative reviews, you could just write reversed(pos_neg_ratios.most_common()).)

You should continue to see values similar to the earlier ones we checked – neutral words will be close to 0, words will get more positive as their ratios approach and go above 1, and words will get more negative as their ratios approach and go below -1. That's why we decided to use the logs instead of the raw ratios.

In [ ]:
# words most frequently seen in a review with a "POSITIVE" label pos_neg_ratios.most_common()
In [ ]:
# words most frequently seen in a review with a "NEGATIVE" label list(reversed(pos_neg_ratios.most_common()))[0:30] # Note: Above is the code Andrew uses in his solution video, # so we've included it here to avoid confusion. # If you explore the documentation for the Counter class, # you will see you could also find the 30 least common # words like this: pos_neg_ratios.most_common()[:-31:-1]

# Transforming Text into Numbers

The cells here include code Andrew shows in the next video. We've included it so you can run the code along with the video without having to type in everything.

In [ ]:
from IPython.display import Image review = "This was a horrible, terrible movie." Image(filename='sentiment_network.png')
In [ ]:
review = "The movie was excellent" Image(filename='sentiment_network_pos.png')

# Project 2: Creating the Input/Output Data

TODO: Create a set named vocab that contains every word in the vocabulary.

In [ ]:
# TODO: Create set named "vocab" containing all of the words from all of the reviews #unique_words = set(reviews.split(" ")) vocab = [] for i in reviews: vocab += i.split(" ") vocab = set(vocab) #print(vocab) #for word in unique_words: # print (word) #file.write(str(word) + "\n")

Run the following cell to check your vocabulary size. If everything worked correctly, it should print 74074

In [ ]:
vocab_size = len(vocab) print(vocab_size)

Take a look at the following image. It represents the layers of the neural network you'll be building throughout this notebook. layer_0 is the input layer, layer_1 is a hidden layer, and layer_2 is the output layer.

In [11]:
from IPython.display import Image Image(filename='sentiment_network_2.png')

TODO: Create a numpy array called layer_0 and initialize it to all zeros. You will find the zeros function particularly helpful here. Be sure you create layer_0 as a 2-dimensional matrix with 1 row and vocab_size columns.

In [ ]:
# TODO: Create layer_0 matrix with dimensions 1 by vocab_size, initially filled with zeros layer_0 = np.zeros((1, len(vocab)))

Run the following cell. It should display (1, 74074)

In [ ]:
layer_0.shape
In [ ]:
from IPython.display import Image Image(filename='sentiment_network.png')

layer_0 contains one entry for every word in the vocabulary, as shown in the above image. We need to make sure we know the index of each word, so run the following cell to create a lookup table that stores the index of every word.

In [ ]:
# Create a dictionary of words in the vocabulary mapped to index positions # (to be used in layer_0) word2index = {} for i,word in enumerate(vocab): word2index[word] = i # display the map of words to indices
In [ ]:
bag = [] for vocab in reviews[2]: print(vocab) #bag += vocab.split(" ") print(bag)
In [ ]:
mylist = [1, 1, 2, 3, 3, 3, 4, 4, 4, 4] mylist = [2] mydict = {} for i in mylist: print(i) if i in mydict: print(mydict) mydict[i] += 1 else: mydict[i] = 1 mytups = [(i, mydict[i]) for i in mydict] print(mytups)
In [ ]:
sentences = ["The book was awesome and envious", "movie was fantastic"] dictionary = ["awesome","amazing", "fantastic","envious"] bag = [[0,0],[]] bag[0][1] = 1 print(bag) dicindex = {} for i,word in enumerate(dictionary): bag[0][dicindex[word]] =+ 1 for word in dicindex: if word in sentence: bag[idx].append(word) for idx,sentence in enumerate(sentences): print (sentence) for word in dictionary: if word in sentence: print (word) bag[idx].append(word)""" #print(bag) bag = [[0,0],[]] bag[0][1] = 1
In [ ]:
bag *= 0 bag[1][1] = 1 print(bag)
In [ ]:
for word in reviews[0:3]: print (word)

TODO: Complete the implementation of update_input_layer. It should count how many times each word is used in the given review, and then store those counts at the appropriate indices inside layer_0.

In [ ]:
In [12]:
def update_input_layer(review): """ Modify the global layer_0 to represent the vector form of review. The element at a given index of layer_0 should represent how many times the given word occurs in the review. """ global layer_0 # clear out previous state by resetting the layer to be all 0s layer_0 *= 0 for word in review.split(" "): layer_0[0][word2index[word]] += 1 update_input_layer(reviews[0])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-12-7a10af6876aa> in <module>() 17 18 ---> 19 update_input_layer(reviews[Integer(0)]) <ipython-input-12-7a10af6876aa> in update_input_layer(review) 6 global layer_0 7 # clear out previous state by resetting the layer to be all 0s ----> 8 layer_0 *= Integer(0) 9 10 for word in review.split(" "): NameError: global name 'layer_0' is not defined
In [ ]:
vocabl = list(vocab) print(vocabl[0:4]) print(reviews[0:1]) findthis = 'ran' for word in reviews[0:1]: if findthis in word: print ("s")
In [ ]:
x = [[] for i in range(3)] print(x) print(reviews[1]) for i in range(2): for item in reviews[i].split(' '): #print (item) #for word in vocabl: #print (word) if item in word2index.item(): #print(item) bag[i].append(item) #for word in review.split(" "): #layer_0[idx].append(word)
In [ ]:
bag *= 0 tot = ['a', 'b'] dic = {'a':1, 'b': 2, 'c':3} #for i in tot: if tot in dic: bag.append(dic[i]) print (bag)
In [ ]:
bag *= 0 for i in range(2): for item in reviews[i].split(' '): bag[word2index[item]] #print (item) #for word in vocabl: #print (word) #if item in word2index: #print(item) #bag[i].append(item) #for word in review.split(" "): #layer_0[idx].append(word) print (bag[1]) #print(layer_0) #print(1+2)
In [ ]:
In [ ]:
In [ ]:
print(len(bag)) help(Counter)
In [ ]:
x= [['a', 'b'], ['c', 'a', 'b', 'b']] count = {} for i in x: count[] #count = Counter(bag)

Run the following cell to test updating the input layer with the first review. The indices assigned may not be the same as in the solution, but hopefully you'll see some non-zero values in layer_0.

In [ ]:
update_input_layer(reviews) print(layer_0)

TODO: Complete the implementation of get_target_for_labels. It should return 0 or 1, depending on whether the given label is NEGATIVE or POSITIVE, respectively.

In [17]:
def get_target_for_label(label): """Convert a label to 0 or 1. Args: label(string) - Either "POSITIVE" or "NEGATIVE". Returns: 0 or 1. """ print (label) for word in label.split(' '): print (word) if word == 'POSITIVE': return 1 elif word == 'NEGATIVE': return 0

Run the following two cells. They should print out'POSITIVE' and 1, respectively.

In [12]:
labels[0]
'POSITIVE'
In [18]:
get_target_for_label(labels[0])
POSITIVE POSITIVE
1

Run the following two cells. They should print out 'NEGATIVE' and 0, respectively.

In [ ]:
labels[1]
In [ ]:
get_target_for_label(labels[1])

# Project 3: Building a Neural Network

TODO: We've included the framework of a class called SentimentNetork. Implement all of the items marked TODO in the code. These include doing the following:

• Create a basic neural network much like the networks you've seen in earlier lessons and in Project 1, with an input layer, a hidden layer, and an output layer.
• Do not add a non-linearity in the hidden layer. That is, do not use an activation function when calculating the hidden layer outputs.
• Re-use the code from earlier in this notebook to create the training data (see TODOs in the code)
• Implement the pre_process_data function to create the vocabulary for our training data generating functions
• Ensure train trains over the entire corpus

### Where to Get Help if You Need it

• Re-watch earlier Udacity lectures
• Chapters 3-5 - Grokking Deep Learning - (Check inside your classroom for a discount code)
In [ ]:
In [ ]:
inp = [2,3] w1 = [1,1] w2 = [-1,1] hid = [4,5]
In [6]:
import numpy as np class sn: def __init__(self, reviews, labels, hidden_nodes = 10, learning_rate = 0.1): # Assign a seed to our random number generator to ensure we get # reproducable results during development np.random.seed(1) # process the reviews and their associated labels so that everything # is ready for training self.pre_process_data(reviews, labels) # Build the network to have the number of hidden nodes and the learning rate that # were passed into this initializer. Make the same number of input nodes as # there are vocabulary words and create a single output node. ######self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate) def pre_process_data(self, reviews, labels): review_vocab = set() # TODO: populate review_vocab with all of the words in the given reviews # Remember to split reviews into individual words # using "split(' ')" instead of "split()". # Convert the vocabulary set to a list so we can access words via indices review_vocab = [] for item in reviews: for word in item.split(' '): review_vocab.append(word) review_vocab = set(review_vocab) self.review_vocab = list(review_vocab) label_vocab = set() # TODO: populate label_vocab with all of the words in the given labels. # There is no need to split the labels because each one is a single word. for word in labels: label_vocab.add(word) # Convert the label vocabulary set to a list so we can access labels via indices self.label_vocab = list(label_vocab) # Store the sizes of the review and label vocabularies. self.review_vocab_size = len(self.review_vocab) self.label_vocab_size = len(self.label_vocab) # Create a dictionary of words in the vocabulary mapped to index positions self.word2index = {} # TODO: populate self.word2index with indices for all the words in self.review_vocab # like you saw earlier in the notebook for idx, item in enumerate(self.review_vocab): self.word2index[item] = idx print('word {}'.format(self.word2index)) # Create a dictionary of labels mapped to index positions self.label2index = {} # TODO: do same thing you did for self.word2index and self.review_vocab, # but for self.label2index and self.label_vocab instead for idx, item in enumerate(self.label_vocab): self.label2index[item] = idx print(self.label2index) sn(reviews[0:2], labels[0:2], learning_rate=0.1)
word {'': 0, 'all': 1, 'just': 2, 'years': 3, 'violent': 4, 'financially': 5, 'through': 6, 'mob': 7, 'forrest': 8, 'knew': 148, 'should': 9, 'better': 10, 'to': 11, 'easy': 12, 'teaching': 13, 'has': 14, 'might': 15, 'eventually': 16, 'good': 17, 'far': 18, 'frederic': 19, 'putting': 20, 'stays': 21, 'schools': 22, 'immediately': 23, 'feelings': 24, 'formal': 25, 'school': 26, 'comedy': 143, 'level': 27, 'remind': 28, 'audience': 29, 'grader': 30, 't': 31, 'insane': 33, 'right': 34, 'bromwell': 35, 'some': 36, 'see': 38, 'expect': 39, 'out': 40, 'closer': 41, 'what': 42, 'lead': 43, 'unnatural': 44, 'orchestra': 45, 'shakespeare': 46, 'kirkland': 48, 'era': 65, 'be': 50, 'welcome': 37, 'pathetic': 52, 'opening': 53, 'burn': 54, 'here': 55, 'situation': 112, 'singers': 57, 'by': 58, 'pity': 59, 'on': 60, 'about': 61, 'cryptic': 62, 'technical': 63, 'of': 64, 'scramble': 114, 's': 67, 'isn': 68, 'whole': 69, 'think': 70, 'fetched': 71, 'can': 110, 'into': 72, 'profession': 73, 'scene': 74, 'sack': 75, 'one': 76, 'down': 77, 'your': 78, 'story': 79, 'from': 80, 'would': 81, 'zsigmond': 82, 'satire': 83, '.': 84, 'their': 85, 'much': 86, 'too': 87, 'stars': 88, 'time': 164, 'survive': 89, 'insightful': 90, 'life': 91, 'pomp': 92, 'that': 93, 'great': 94, 'it': 122, 'recalled': 95, 'repeatedly': 96, 'episode': 155, 'line': 98, 'believe': 99, 'with': 100, 'than': 101, 'those': 102, 'me': 103, 'unfortunately': 104, 'vilmos': 105, 'pettiness': 106, 'future': 109, 'teachers': 49, 'starts': 107, 'making': 56, 'my': 113, 'example': 66, 'and': 115, 'crazy': 116, 'adults': 117, 'classic': 118, 'ran': 119, 'is': 120, 'turned': 121, 'dialogue': 108, 'an': 32, 'high': 124, 'as': 125, 'at': 126, 'in': 127, 'seen': 128, 'inspector': 129, 'saw': 130, 'seem': 131, 'no': 132, 'general': 133, 'make': 134, 'when': 135, 'same': 136, 'reality': 137, 'terrific': 138, 'cinematography': 139, 'other': 140, 'which': 141, 'you': 142, 'many': 111, 'even': 144, 'briefly': 145, 'pig': 123, 'tried': 146, 'students': 147, 'who': 51, 'student': 149, 'such': 150, 'sally': 151, 'cartoon': 152, 'man': 153, 'a': 154, 'off': 97, 'for': 156, 'third': 157, 'programs': 158, 'i': 159, 'age': 160, 'm': 161, 'chantings': 162, 'absurd': 163, 'narrative': 47, 'the': 165} {'POSITIVE': 0, 'NEGATIVE': 1}
<__main__.sn instance at 0x7f500d6bba28>
In [4]:
import numpy as np rv = ['this movie is good', 'this movie is bad'] print['rv', rv] lb = ['pos','neg'] print ('lb', lb) seed = np.random.seed(1) print('seed', seed) voc = [] for item in rv: for word in item.split(' '): voc.append(word) voc_set = list(set(voc)) print('vocset', voc_set) windex = {} for idx, item in enumerate(voc_set): windex[item] = idx lab = [] for item in lb: for word in item.split(' '): lab.append(word) lab_set = list(set(lab)) print('labset', lab_set) labledex = {} for idx, item in enumerate(lab_set): labledex[item] = idx layer = np.zeros((1,len(voc_set))) def update_layer(rv): global layer layer *= 0 print('layer', layer) for word in rv.split(' '): print("word", word) print('index', layer[0][windex[word]]) layer[0][windex[word]] =+ 1 print(layer) hid_nodes = [] print(update_layer(rv[1])) input_hid_w = np.zeros(len(voc_set), (hid_nodes)) #input_hid_weights = np.zeros([0,0,0,0,0]) hidden_output_weights = np.array([1,1,2,2,2]) output_node = np.array(hid_nod, hidden_output_weights) output = 1 / (1 + e^(-output_node)) print (output) #forward pass """self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes)) # These are the weights between the hidden layer and the output layer. self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, (self.hidden_nodes, self.output_nodes)) # The input layer, a two-dimensional matrix with shape 1 x input_nodes self.layer_0 = np.zeros((1,input_nodes))"""
['rv', ['this movie is good', 'this movie is bad']] ('lb', ['pos', 'neg']) ('seed', None) ('vocset', ['this', 'movie', 'is', 'good', 'bad']) ('labset', ['neg', 'pos']) ('layer', array([[ 0., 0., 0., 0., 0.]])) ('word', 'this') ('index', 0.0) ('word', 'movie') ('index', 0.0) ('word', 'is') ('index', 0.0) ('word', 'bad') ('index', 0.0) [[ 1. 1. 1. 0. 1.]] None
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-4-ae2dac294736> in <module>() 56 hidden_output_weights = np.array([Integer(1),Integer(1),Integer(2),Integer(2),Integer(2)]) 57 ---> 58 output_node = np.array(hid_nod, hidden_output_weights) 59 60 output = Integer(1) / (Integer(1) + e**(-output_node)) NameError: name 'hid_nod' is not defined
In [ ]:
In [19]:
import time import sys import numpy as np # Encapsulate our neural network in a class class sn: def __init__(self, reviews, labels, hidden_nodes = 10, learning_rate = 0.1): """Create a SentimenNetwork with the given settings Args: reviews(list) - List of reviews used for training labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews hidden_nodes(int) - Number of nodes to create in the hidden layer learning_rate(float) - Learning rate to use while training """ # Assign a seed to our random number generator to ensure we get # reproducable results during development np.random.seed(1) # process the reviews and their associated labels so that everything # is ready for training self.pre_process_data(reviews, labels) # Build the network to have the number of hidden nodes and the learning rate that # were passed into this initializer. Make the same number of input nodes as # there are vocabulary words and create a single output node. self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate) def pre_process_data(self, reviews, labels): review_vocab = set() # TODO: populate review_vocab with all of the words in the given reviews # Remember to split reviews into individual words # using "split(' ')" instead of "split()". # Convert the vocabulary set to a list so we can access words via indices review_vocab = [] for item in reviews: for word in item.split(' '): review_vocab.append(word) review_vocab = set(review_vocab) self.review_vocab = list(review_vocab) label_vocab = set() # TODO: populate label_vocab with all of the words in the given labels. # There is no need to split the labels because each one is a single word. for word in labels: label_vocab.add(word) # Convert the label vocabulary set to a list so we can access labels via indices self.label_vocab = list(label_vocab) # Store the sizes of the review and label vocabularies. self.review_vocab_size = len(self.review_vocab) self.label_vocab_size = len(self.label_vocab) # Create a dictionary of words in the vocabulary mapped to index positions self.word2index = {} # TODO: populate self.word2index with indices for all the words in self.review_vocab # like you saw earlier in the notebook for idx, item in enumerate(self.label_vocab): self.word2index[item] = idx # Create a dictionary of labels mapped to index positions self.label2index = {} # TODO: do same thing you did for self.word2index and self.review_vocab, # but for self.label2index and self.label_vocab instead for idx, item in enumerate(self.label_vocab): self.label2index[item] = idx print('label{}, words{}'.format(self.label2index, self.label2index)) def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate): # Store the number of nodes in input, hidden, and output layers. self.input_nodes = input_nodes self.hidden_nodes = hidden_nodes self.output_nodes = output_nodes # Store the learning rate self.learning_rate = learning_rate # Initialize weights # TODO: initialize self.weights_0_1 as a matrix of zeros. These are the weights between # the input layer and the hidden layer. self.weights_0_1 = np.zeros(len(self.input_nodes), len(self.hidden_nodes)) # TODO: initialize self.weights_1_2 as a matrix of random values. # These are the weights between the hidden layer and the output layer. self.weights_1_2 = np.random.rand(self.hidden_nodes, self.output_nodes) # TODO: Create the input layer, a two-dimensional matrix with shape # 1 x input_nodes, with all values initialized to zero self.layer_0 = np.zeros((1,input_nodes)) def update_input_layer(self,review): # TODO: You can copy most of the code you wrote for update_input_layer # earlier in this notebook. # # However, MAKE SURE YOU CHANGE ALL VARIABLES TO REFERENCE # THE VERSIONS STORED IN THIS OBJECT, NOT THE GLOBAL OBJECTS. # For example, replace "layer_0 *= 0" with "self.layer_0 *= 0" global = self.layer_0 self.layer_0 *= 0 for i in review.split(' '): self.layer_0[0][self.word2index[word]] + 1 def get_target_for_label(self,label): # TODO: Copy the code you wrote for get_target_for_label # earlier in this notebook. for word in label.split(' '): print (word) if word == 'POSITIVE': return 1 elif word == 'NEGATIVE': return 0 def sigmoid(self,x): # TODO: Return the result of calculating the sigmoid activation function # shown in the lectures return (1 / (1 + np.exp(-x))) def sigmoid_output_2_derivative(self,output): # TODO: Return the derivative of the sigmoid activation function, # where "output" is the original output from the sigmoid fucntion output = sigmoid(x) return output*(1-ouput) def train(self, training_reviews, training_labels): # make sure out we have a matching number of reviews and labels assert(len(training_reviews) == len(training_labels)) # Keep track of correct predictions to display accuracy during training correct_so_far = 0 # Remember when we started for printing time statistics start = time.time() # loop through all the given reviews and run a forward and backward pass, # updating weights for every item for i in range(len(training_reviews)): # TODO: Get the next review and its correct label # TODO: Implement the forward pass through the network. # That means use the given review to update the input layer, # then calculate values for the hidden layer, # and finally calculate the output layer. # # Do not use an activation function for the hidden layer, # but use the sigmoid activation function for the output layer. self.update_input_layer(review) # TODO: Implement the back propagation pass here. # That means calculate the error for the forward pass's prediction # and update the weights in the network according to their # contributions toward the error, as calculated via the # gradient descent and back propagation algorithms you # learned in class. # TODO: Keep track of correct predictions. To determine if the prediction was # correct, check that the absolute value of the output error # is less than 0.5. If so, add one to the correct_so_far count. # For debug purposes, print out our prediction accuracy and speed # throughout the training process. elapsed_time = float(time.time() - start) reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0 sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \ + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \ + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \ + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%") if(i % 2500 == 0): print("") def test(self, testing_reviews, testing_labels): """ Attempts to predict the labels for the given testing_reviews, and uses the test_labels to calculate the accuracy of those predictions. """ # keep track of how many correct predictions we make correct = 0 # we'll time how many predictions per second we make start = time.time() # Loop through each of the given reviews and call run to predict # its label. for i in range(len(testing_reviews)): pred = self.run(testing_reviews[i]) if(pred == testing_labels[i]): correct += 1 # For debug purposes, print out our prediction accuracy and speed # throughout the prediction process. elapsed_time = float(time.time() - start) reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0 sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \ + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \ + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \ + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%") def run(self, review): """ Returns a POSITIVE or NEGATIVE prediction for the given review. """ # TODO: Run a forward pass through the network, like you did in the # "train" function. That means use the given review to # update the input layer, then calculate values for the hidden layer, # and finally calculate the output layer. # # Note: The review passed into this function for prediction # might come from anywhere, so you should convert it # to lower case prior to using it. # TODO: The output layer should now contain a prediction. # Return POSITIVE for predictions greater-than-or-equal-to 0.5, # and NEGATIVE otherwise. pass sn(reviews[0:2], labels[0:2], learning_rate = 0.1)
File "<ipython-input-19-027eaa6a79d8>", line 116 global = self.layer_0 ^ SyntaxError: invalid syntax
In [20]:
class SentimentNetwork: def __init__(self, reviews, labels, hidden_nodes = 10, learning_rate = 0.1)
File "<ipython-input-20-ffb3870283c4>", line 2 def __init__(self, reviews, labels, hidden_nodes = Integer(10), learning_rate = RealNumber('0.1')) ^ SyntaxError: invalid syntax

Run the following cell to create a SentimentNetwork that will train on all but the last 1000 reviews (we're saving those for testing). Here we use a learning rate of 0.1.

In [21]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.1)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-21-dce77304de86> in <module>() ----> 1 mlp = SentimentNetwork(reviews[:-Integer(1000)],labels[:-Integer(1000)], learning_rate=RealNumber('0.1')) NameError: name 'SentimentNetwork' is not defined

Run the following cell to test the network's performance against the last 1000 reviews (the ones we held out from our training set).

We have not trained the model yet, so the results should be about 50% as it will just be guessing and there are only two possible values to choose from.

In [ ]:
mlp.test(reviews[-1000:],labels[-1000:])

Run the following cell to actually train the network. During training, it will display the model's accuracy repeatedly as it trains so you can see how well it's doing.

In [ ]:
mlp.train(reviews[:-1000],labels[:-1000])

That most likely didn't train very well. Part of the reason may be because the learning rate is too high. Run the following cell to recreate the network with a smaller learning rate, 0.01, and then train the new network.

In [ ]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.01) mlp.train(reviews[:-1000],labels[:-1000])

That probably wasn't much different. Run the following cell to recreate the network one more time with an even smaller learning rate, 0.001, and then train the new network.

In [ ]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.001) mlp.train(reviews[:-1000],labels[:-1000])

With a learning rate of 0.001, the network should finall have started to improve during training. It's still not very good, but it shows that this solution has potential. We will improve it in the next lesson.

# Understanding Neural Noise

The following cells include includes the code Andrew shows in the next video. We've included it here so you can run the cells along with the video without having to type in everything.

In [ ]:
from IPython.display import Image Image(filename='sentiment_network.png')
In [ ]:
def update_input_layer(review): global layer_0 # clear out previous state, reset the layer to be all 0s layer_0 *= 0 for word in review.split(" "): layer_0[0][word2index[word]] += 1 update_input_layer(reviews[0])
In [ ]:
layer_0
In [ ]:
review_counter = Counter()
In [ ]:
for word in reviews[0].split(" "): review_counter[word] += 1
In [ ]:
review_counter.most_common()

# Project 4: Reducing Noise in Our Input Data

TODO: Attempt to reduce the noise in the input data like Andrew did in the previous video. Specifically, do the following:

• Copy the SentimentNetwork class you created earlier into the following cell.
• Modify update_input_layer so it does not count how many times each word is used, but rather just stores whether or not a word was used.
In [ ]:
# TODO: -Copy the SentimentNetwork class from Projet 3 lesson # -Modify it to reduce noise, like in the video

Run the following cell to recreate the network and train it. Notice we've gone back to the higher learning rate of 0.1.

In [ ]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.1) mlp.train(reviews[:-1000],labels[:-1000])

That should have trained much better than the earlier attempts. It's still not wonderful, but it should have improved dramatically. Run the following cell to test your model with 1000 predictions.

In [ ]:
mlp.test(reviews[-1000:],labels[-1000:])

# Analyzing Inefficiencies in our Network

The following cells include the code Andrew shows in the next video. We've included it here so you can run the cells along with the video without having to type in everything.

In [ ]:
Image(filename='sentiment_network_sparse.png')
In [ ]:
layer_0 = np.zeros(10)
In [ ]:
layer_0
In [ ]:
layer_0[4] = 1 layer_0[9] = 1
In [ ]:
layer_0
In [ ]:
weights_0_1 = np.random.randn(10,5)
In [ ]:
layer_0.dot(weights_0_1)
In [ ]:
indices = [4,9]
In [ ]:
layer_1 = np.zeros(5)
In [ ]:
for index in indices: layer_1 += (1 * weights_0_1[index])
In [ ]:
layer_1
In [ ]:
Image(filename='sentiment_network_sparse_2.png')
In [ ]:
layer_1 = np.zeros(5)
In [ ]:
for index in indices: layer_1 += (weights_0_1[index])
In [ ]:
layer_1

# Project 5: Making our Network More Efficient

TODO: Make the SentimentNetwork class more efficient by eliminating unnecessary multiplications and additions that occur during forward and backward propagation. To do that, you can do the following:

• Copy the SentimentNetwork class from the previous project into the following cell.
• Remove the update_input_layer function - you will not need it in this version.
• Modify init_network:
• You no longer need a separate input layer, so remove any mention of self.layer_0
• You will be dealing with the old hidden layer more directly, so create self.layer_1, a two-dimensional matrix with shape 1 x hidden_nodes, with all values initialized to zero
• Modify train:
• Change the name of the input parameter training_reviews to training_reviews_raw. This will help with the next step.
• At the beginning of the function, you'll want to preprocess your reviews to convert them to a list of indices (from word2index) that are actually used in the review. This is equivalent to what you saw in the video when Andrew set specific indices to 1. Your code should create a local list variable named training_reviews that should contain a list for each review in training_reviews_raw. Those lists should contain the indices for words found in the review.
• Remove call to update_input_layer
• Use self's layer_1 instead of a local layer_1 object.
• In the forward pass, replace the code that updates layer_1 with new logic that only adds the weights for the indices used in the review.
• When updating weights_0_1, only update the individual weights that were used in the forward pass.
• Modify run:
• Remove call to update_input_layer
• Use self's layer_1 instead of a local layer_1 object.
• Much like you did in train, you will need to pre-process the review so you can work with word indices, then update layer_1 by adding weights for the indices used in the review.
In [ ]:
# TODO: -Copy the SentimentNetwork class from Project 4 lesson # -Modify it according to the above instructions

Run the following cell to recreate the network and train it once again.

In [ ]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.1) mlp.train(reviews[:-1000],labels[:-1000])

That should have trained much better than the earlier attempts. Run the following cell to test your model with 1000 predictions.

In [ ]:
mlp.test(reviews[-1000:],labels[-1000:])

# Further Noise Reduction

In [ ]:
Image(filename='sentiment_network_sparse_2.png')
In [ ]:
# words most frequently seen in a review with a "POSITIVE" label pos_neg_ratios.most_common()
In [ ]:
# words most frequently seen in a review with a "NEGATIVE" label list(reversed(pos_neg_ratios.most_common()))[0:30]
In [ ]:
from bokeh.models import ColumnDataSource, LabelSet from bokeh.plotting import figure, show, output_file from bokeh.io import output_notebook output_notebook()
In [ ]:
hist, edges = np.histogram(list(map(lambda x:x[1],pos_neg_ratios.most_common())), density=True, bins=100, normed=True) p = figure(tools="pan,wheel_zoom,reset,save", toolbar_location="above", title="Word Positive/Negative Affinity Distribution") p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555") show(p)
In [ ]:
frequency_frequency = Counter() for word, cnt in total_counts.most_common(): frequency_frequency[cnt] += 1
In [ ]:
hist, edges = np.histogram(list(map(lambda x:x[1],frequency_frequency.most_common())), density=True, bins=100, normed=True) p = figure(tools="pan,wheel_zoom,reset,save", toolbar_location="above", title="The frequency distribution of the words in our corpus") p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555") show(p)

# Project 6: Reducing Noise by Strategically Reducing the Vocabulary

TODO: Improve SentimentNetwork's performance by reducing more noise in the vocabulary. Specifically, do the following:

• Copy the SentimentNetwork class from the previous project into the following cell.
• Modify pre_process_data:
• Calculate the positive-to-negative ratios of words used in the reviews. (You can use code you've written elsewhere in the notebook, but we are moving it into the class like we did with other helper code earlier.)
• Andrew's solution only calculates a postive-to-negative ratio for words that occur at least 50 times. This keeps the network from attributing too much sentiment to rarer words. You can choose to add this to your solution if you would like.
• Change so words are only added to the vocabulary if they occur in the vocabulary more than min_count times.
• Change so words are only added to the vocabulary if the absolute value of their postive-to-negative ratio is at least polarity_cutoff
• Modify __init__:
• Add the same two parameters (min_count and polarity_cutoff) and use them when you call pre_process_data
In [ ]:
# TODO: -Copy the SentimentNetwork class from Project 5 lesson # -Modify it according to the above instructions

Run the following cell to train your network with a small polarity cutoff.

In [ ]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.05,learning_rate=0.01) mlp.train(reviews[:-1000],labels[:-1000])

And run the following cell to test it's performance. It should be

In [ ]:
mlp.test(reviews[-1000:],labels[-1000:])

Run the following cell to train your network with a much larger polarity cutoff.

In [ ]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.8,learning_rate=0.01) mlp.train(reviews[:-1000],labels[:-1000])

And run the following cell to test it's performance.

In [ ]:
mlp.test(reviews[-1000:],labels[-1000:])

# Analysis: What's Going on in the Weights?

In [ ]:
mlp_full = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=0,polarity_cutoff=0,learning_rate=0.01)
In [ ]:
mlp_full.train(reviews[:-1000],labels[:-1000])
In [ ]:
Image(filename='sentiment_network_sparse.png')
In [ ]:
def get_most_similar_words(focus = "horrible"): most_similar = Counter() for word in mlp_full.word2index.keys(): most_similar[word] = np.dot(mlp_full.weights_0_1[mlp_full.word2index[word]],mlp_full.weights_0_1[mlp_full.word2index[focus]]) return most_similar.most_common()
In [ ]:
get_most_similar_words("excellent")
In [ ]:
get_most_similar_words("terrible")
In [ ]:
import matplotlib.colors as colors words_to_visualize = list() for word, ratio in pos_neg_ratios.most_common(500): if(word in mlp_full.word2index.keys()): words_to_visualize.append(word) for word, ratio in list(reversed(pos_neg_ratios.most_common()))[0:500]: if(word in mlp_full.word2index.keys()): words_to_visualize.append(word)
In [ ]:
pos = 0 neg = 0 colors_list = list() vectors_list = list() for word in words_to_visualize: if word in pos_neg_ratios.keys(): vectors_list.append(mlp_full.weights_0_1[mlp_full.word2index[word]]) if(pos_neg_ratios[word] > 0): pos+=1 colors_list.append("#00ff00") else: neg+=1 colors_list.append("#000000")
In [ ]:
from sklearn.manifold import TSNE tsne = TSNE(n_components=2, random_state=0) words_top_ted_tsne = tsne.fit_transform(vectors_list)
In [ ]:
p = figure(tools="pan,wheel_zoom,reset,save", toolbar_location="above", title="vector T-SNE for most polarized words") source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0], x2=words_top_ted_tsne[:,1], names=words_to_visualize, color=colors_list)) p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color") word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6, text_font_size="8pt", text_color="#555555", source=source, text_align='center') p.add_layout(word_labels) show(p) # green indicates positive words, black indicates negative words