SharedMITx15.071x-AnalyticsEdge / w6-tweets.sagewsOpen in CoCalc
Author: Hal Snyder
Views : 33
Description: Week 6 text analytics — classifying tweet sentiment about Apple using the tm package, CART, random forests, and logistic regression.
%auto %default_mode r typeset_mode(True,display = True)

W6 - Text Analytics

# Set the working directory to the course folder and confirm the location.
setwd(file.path(Sys.getenv("HOME"), "MITx15.071x-AnalyticsEdge"))
getwd()
[1] "/projects/db982efa-e439-4e2d-933b-7c7011c6b21a/MITx15.071x-AnalyticsEdge"
# Load the tm text-mining package (pulls in NLP as a dependency, per the output below).
library(tm)
Loading required package: NLP
# Read the tweets, keeping the text as character (not factor), then inspect.
# Columns (from str output): Tweet (chr) and Avg (numeric sentiment score).
tweets <- read.csv("tweets.csv", stringsAsFactors = FALSE)
str(tweets)
'data.frame': 1181 obs. of 2 variables: $ Tweet: chr "I have to say, Apple has by far the best customer care service I have ever received! @Apple @AppStore" "iOS 7 is so fricking smooth & beautiful!! #ThanxApple @Apple" "LOVE U @APPLE" "Thank you @apple, loving my new iPhone 5S!!!!! #apple #iphone5S pic.twitter.com/XmHJCU4pcb" ... $ Avg : num 2 2 1.8 1.8 1.8 1.8 1.8 1.6 1.6 1.6 ...
# Label a tweet as Negative when its average sentiment score is <= -1,
# then count the two classes.
tweets$Negative <- as.factor(tweets$Avg <= -1)
table(tweets$Negative)
FALSE TRUE 999 182
# SnowballC provides the word stemmer used by tm's stemDocument below.
library(SnowballC)
# Build a corpus with one document per tweet.
corpus <- Corpus(VectorSource(tweets$Tweet))
# Look at corpus
corpus
# Convert to lower-case
corpus <- tm_map(corpus, tolower)
corpus[[1]]
# tolower returns bare character vectors; wrap them back into tm documents.
corpus <- tm_map(corpus, PlainTextDocument)
<<VCorpus>> Metadata: corpus specific: 0, document level (indexed): 0 Content: documents: 1181 [1] "i have to say, apple has by far the best customer care service i have ever received! @apple @appstore"
# Show the text of the first document after the transformations so far.
corpus[[1]]$content
[1] "i have to say, apple has by far the best customer care service i have ever received! @apple @appstore"
# Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
corpus[[1]]$content
[1] "i have to say apple has by far the best customer care service i have ever received apple appstore"
# Look at the first ten English stop words shipped with tm.
stopwords("english")[1:10]
[1] "i" "me" "my" "myself" "we" "our" "ours" "ourselves" [9] "you" "your"
# How many English stop words are there in total? (174, per the output below.)
length(stopwords("english"))
[1] 174
# Remove "apple" (present in almost every tweet, so uninformative)
# along with the standard English stop words.
corpus <- tm_map(corpus, removeWords, c("apple", stopwords("english")))
corpus[[1]]$content
[1] " say far best customer care service ever received appstore"
# Stem words to their roots (e.g. "received" -> "receiv") so variants collapse
# into one term.
corpus <- tm_map(corpus, stemDocument)
corpus[[1]]$content
[1] " say far best custom care servic ever receiv appstor"
# Build the document-term matrix: one row per tweet, one column per term,
# entries counting term frequency.
frequencies <- DocumentTermMatrix(corpus)
frequencies
<<DocumentTermMatrix (documents: 1181, terms: 3289)>> Non-/sparse entries: 8980/3875329 Sparsity : 100% Maximal term length: 115 Weighting : term frequency (tf)
# Peek at a small window of the matrix (documents 1000-1005, terms 505-515).
inspect(frequencies[1000:1005,505:515])
<<DocumentTermMatrix (documents: 6, terms: 11)>> Non-/sparse entries: 1/65 Sparsity : 98% Maximal term length: 9 Weighting : term frequency (tf) Terms Docs cheapen cheaper check cheep cheer cheerio cherylcol chief chiiiiqu child children character(0) 0 0 0 0 0 0 0 0 0 0 0 character(0) 0 0 0 0 0 0 0 0 0 0 0 character(0) 0 0 0 0 0 0 0 0 0 0 0 character(0) 0 0 0 0 0 0 0 0 0 0 0 character(0) 0 0 0 0 0 0 0 0 0 0 0 character(0) 0 0 0 0 1 0 0 0 0 0 0
# Check for sparsity: list terms that appear in at least 20 tweets.
findFreqTerms(frequencies, lowfreq = 20)
[1] "android" "anyon" "app" "appl" [5] "back" "batteri" "better" "buy" [9] "can" "cant" "come" "dont" [13] "fingerprint" "freak" "get" "googl" [17] "ios7" "ipad" "iphon" "iphone5" [21] "iphone5c" "ipod" "ipodplayerpromo" "itun" [25] "just" "like" "lol" "look" [29] "love" "make" "market" "microsoft" [33] "need" "new" "now" "one" [37] "phone" "pleas" "promo" "promoipodplayerpromo" [41] "realli" "releas" "samsung" "say" [45] "store" "thank" "think" "time" [49] "twitter" "updat" "use" "via" [53] "want" "well" "will" "work"
# Keep only terms appearing in at least 0.5% of tweets (sparsity <= 99.5%).
sparse <- removeSparseTerms(frequencies, 0.995)
sparse
<<DocumentTermMatrix (documents: 1181, terms: 309)>> Non-/sparse entries: 4669/360260 Sparsity : 99% Maximal term length: 20 Weighting : term frequency (tf)
# Convert the sparse matrix to a data frame for modeling.
tweetsSparse <- as.data.frame(as.matrix(sparse))
# Make all variable names R-friendly
colnames(tweetsSparse) <- make.names(colnames(tweetsSparse))
# Attach the outcome variable to the term-frequency data frame.
tweetsSparse$Negative <- tweets$Negative
# Split into training (70%) and test (30%) sets, stratified on Negative
# so both sets keep the same class balance. Seed fixed for reproducibility.
library(caTools)
set.seed(123)
split <- sample.split(tweetsSparse$Negative, SplitRatio = 0.7)
trainSparse <- subset(tweetsSparse, split == TRUE)
testSparse <- subset(tweetsSparse, split == FALSE)
# Which terms appear in at least 100 tweets? (Quick-question check.)
findFreqTerms(frequencies, lowfreq=100)
[1] "iphon" "itun" "new"
# Build a CART model
library(rpart)
library(rpart.plot)
tweetCART <- rpart(Negative ~ ., data = trainSparse, method = "class")
# Plot the fitted classification tree.
prp(tweetCART)
# Predict class labels on the test set and build the confusion matrix
# (rows = actual, columns = predicted).
predictCART <- predict(tweetCART, newdata = testSparse, type = "class")
table(testSparse$Negative, predictCART)
predictCART FALSE TRUE FALSE 294 6 TRUE 37 18
# CART accuracy on the test set: correct predictions over all predictions.
sum(294, 18) / sum(294, 6, 37, 18)
[1] 0.8788732
# Baseline: always predict the majority class (non-negative).
# Its accuracy is the share of FALSE labels in the test set.
table(testSparse$Negative)
300/(300+55)
FALSE TRUE 300 55 [1] 0.8450704
# Random forest model with default parameters; seed fixed for reproducibility.
library(randomForest)
set.seed(123)
tweetRF <- randomForest(Negative ~ ., data = trainSparse)
# Make predictions:
predictRF <- predict(tweetRF, newdata = testSparse)
table(testSparse$Negative, predictRF)
randomForest 4.6-10 Type rfNews() to see new features/changes/bug fixes. predictRF FALSE TRUE FALSE 293 7 TRUE 34 21
# Accuracy:
(293+21)/(293+7+34+21)
[1] 0.884507
# Logistic regression on all term-frequency predictors. With 309 terms and
# ~826 training rows the fit is rank-deficient (see warning at prediction).
tweetLog <- glm(Negative ~ ., data = trainSparse, family=binomial)
# Predicted probabilities of Negative on the test set.
predictions <- predict(tweetLog, newdata = testSparse, type = "response")
Warning message: In predict.lm(object, newdata, se.fit, scale = 1, type = ifelse(type == : prediction from a rank-deficient fit may be misleading
# Confusion matrix with threshold of 0.5
#table(testSparse$over50k, predictTest > 0.5)
table(testSparse$Negative, predictions > 0.5)
FALSE TRUE FALSE 253 47 TRUE 27 28
# Logistic regression accuracy on the test set.
sum(253, 28) / sum(253, 28, 47, 27)
[1] 0.7915493