SharedMITx15.071x-AnalyticsEdge / w6-tweets.sagewsOpen in CoCalc
Author: Hal Snyder
Views : 33
Description: Week 6 text analytics — classifying tweet sentiment about Apple using the tm package, CART, random forests, and logistic regression.
%auto %default_mode r typeset_mode(True,display = True)

W6 - Text Analytics

# Set the working directory to the course folder and confirm the location.
setwd(file.path(Sys.getenv("HOME"), "MITx15.071x-AnalyticsEdge"))
getwd()
[1] "/projects/db982efa-e439-4e2d-933b-7c7011c6b21a/MITx15.071x-AnalyticsEdge"
# Load the tm text-mining package (pulls in NLP as a dependency, per the output below).
library(tm)
Loading required package: NLP
# Read the tweets, keeping the text as character (not factor), then inspect.
# Columns (from str output): Tweet (chr) and Avg (numeric sentiment score).
tweets <- read.csv("tweets.csv", stringsAsFactors = FALSE)
str(tweets)
'data.frame': 1181 obs. of 2 variables: $ Tweet: chr "I have to say, Apple has by far the best customer care service I have ever received! @Apple @AppStore" "iOS 7 is so fricking smooth & beautiful!! #ThanxApple @Apple" "LOVE U @APPLE" "Thank you @apple, loving my new iPhone 5S!!!!! #apple #iphone5S pic.twitter.com/XmHJCU4pcb" ... $ Avg : num 2 2 1.8 1.8 1.8 1.8 1.8 1.6 1.6 1.6 ...
# Label a tweet as Negative when its average sentiment score is <= -1,
# then count the two classes.
tweets$Negative <- as.factor(tweets$Avg <= -1)
table(tweets$Negative)
FALSE TRUE 999 182
# SnowballC provides the word stemmer used by tm's stemDocument below.
library(SnowballC)
# Build a corpus with one document per tweet.
corpus <- Corpus(VectorSource(tweets$Tweet))
# Look at corpus
corpus
# Convert to lower-case
corpus <- tm_map(corpus, tolower)
corpus[[1]]
# tolower returns bare character vectors; wrap them back into tm documents.
corpus <- tm_map(corpus, PlainTextDocument)
<<VCorpus>> Metadata: corpus specific: 0, document level (indexed): 0 Content: documents: 1181 [1] "i have to say, apple has by far the best customer care service i have ever received! @apple @appstore"
# Show the text of the first document after the transformations so far.
corpus[[1]]$content
[1] "i have to say, apple has by far the best customer care service i have ever received! @apple @appstore"
# Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
corpus[[1]]$content
[1] "i have to say apple has by far the best customer care service i have ever received apple appstore"
# Look at the first ten English stop words shipped with tm.
stopwords("english")[1:10]
[1] "i" "me" "my" "myself" "we" "our" "ours" "ourselves" [9] "you" "your"
# How many English stop words are there in total? (174, per the output below.)
length(stopwords("english"))
[1] 174
# Remove "apple" (present in almost every tweet, so uninformative)
# along with the standard English stop words.
corpus <- tm_map(corpus, removeWords, c("apple", stopwords("english")))
corpus[[1]]$content
[1] " say far best customer care service ever received appstore"
# Stem words to their roots (e.g. "received" -> "receiv") so variants collapse
# into one term.
corpus <- tm_map(corpus, stemDocument)
corpus[[1]]$content
[1] " say far best custom care servic ever receiv appstor"
# Build the document-term matrix: one row per tweet, one column per term,
# entries counting term frequency.
frequencies <- DocumentTermMatrix(corpus)
frequencies
<<DocumentTermMatrix (documents: 1181, terms: 3289)>> Non-/sparse entries: 8980/3875329 Sparsity : 100% Maximal term length: 115 Weighting : term frequency (tf)
# Peek at a small window of the matrix (documents 1000-1005, terms 505-515).
inspect(frequencies[1000:1005,505:515])
<<DocumentTermMatrix (documents: 6, terms: 11)>> Non-/sparse entries: 1/65 Sparsity : 98% Maximal term length: 9 Weighting : term frequency (tf) Terms Docs cheapen cheaper check cheep cheer cheerio cherylcol chief chiiiiqu child children character(0) 0 0 0 0 0 0 0 0 0 0 0 character(0) 0 0 0 0 0 0 0 0 0 0 0 character(0) 0 0 0 0 0 0 0 0 0 0 0 character(0) 0 0 0 0 0 0 0 0 0 0 0 character(0) 0 0 0 0 0 0 0 0 0 0 0 character(0) 0 0 0 0 1 0 0 0 0 0 0
# Check for sparsity: list terms that appear in at least 20 tweets.
findFreqTerms(frequencies, lowfreq = 20)
[1] "android" "anyon" "app" "appl" [5] "back" "batteri" "better" "buy" [9] "can" "cant" "come" "dont" [13] "fingerprint" "freak" "get" "googl" [17] "ios7" "ipad" "iphon" "iphone5" [21] "iphone5c" "ipod" "ipodplayerpromo" "itun" [25] "just" "like" "lol" "look" [29] "love" "make" "market" "microsoft" [33] "need" "new" "now" "one" [37] "phone" "pleas" "promo" "promoipodplayerpromo" [41] "realli" "releas" "samsung" "say" [45] "store" "thank" "think" "time" [49] "twitter" "updat" "use" "via" [53] "want" "well" "will" "work"
# Keep only terms appearing in at least 0.5% of tweets (sparsity <= 99.5%).
sparse <- removeSparseTerms(frequencies, 0.995)
sparse
<<DocumentTermMatrix (documents: 1181, terms: 309)>> Non-/sparse entries: 4669/360260 Sparsity : 99% Maximal term length: 20 Weighting : term frequency (tf)
# Convert the sparse matrix to a data frame for modeling.
tweetsSparse <- as.data.frame(as.matrix(sparse))
# Make all variable names R-friendly
colnames(tweetsSparse) <- make.names(colnames(tweetsSparse))
# Attach the outcome variable to the term-frequency data frame.
tweetsSparse$Negative <- tweets$Negative
# Split into training (70%) and test (30%) sets, stratified on Negative
# so both sets keep the same class balance. Seed fixed for reproducibility.
library(caTools)
set.seed(123)
split <- sample.split(tweetsSparse$Negative, SplitRatio = 0.7)
trainSparse <- subset(tweetsSparse, split == TRUE)
testSparse <- subset(tweetsSparse, split == FALSE)
# Which terms appear in at least 100 tweets? (Quick-question check.)
findFreqTerms(frequencies, lowfreq=100)
[1] "iphon" "itun" "new"
# Build a CART model
library(rpart)
library(rpart.plot)
tweetCART <- rpart(Negative ~ ., data = trainSparse, method = "class")
# Plot the fitted classification tree.
prp(tweetCART)
# Predict class labels on the test set and build the confusion matrix
# (rows = actual, columns = predicted).
predictCART <- predict(tweetCART, newdata = testSparse, type = "class")
table(testSparse$Negative, predictCART)
predictCART FALSE TRUE FALSE 294 6 TRUE 37 18
# CART accuracy on the test set: correct predictions over all predictions.
sum(294, 18) / sum(294, 6, 37, 18)
[1] 0.8788732
# Baseline: always predict the majority class (non-negative).
# Its accuracy is the share of FALSE labels in the test set.
table(testSparse$Negative)
300/(300+55)
FALSE TRUE 300 55 [1] 0.8450704
# Random forest model with default parameters; seed fixed for reproducibility.
library(randomForest)
set.seed(123)
tweetRF <- randomForest(Negative ~ ., data = trainSparse)
# Make predictions:
predictRF <- predict(tweetRF, newdata = testSparse)
table(testSparse$Negative, predictRF)
randomForest 4.6-10 Type rfNews() to see new features/changes/bug fixes. predictRF FALSE TRUE FALSE 293 7 TRUE 34 21
# Accuracy:
(293+21)/(293+7+34+21)
[1] 0.884507
# Logistic regression on all term-frequency predictors. With 309 terms and
# ~826 training rows the fit is rank-deficient (see warning at prediction).
tweetLog <- glm(Negative ~ ., data = trainSparse, family=binomial)
# Predicted probabilities of Negative on the test set.
predictions <- predict(tweetLog, newdata = testSparse, type = "response")
Warning message: In predict.lm(object, newdata, se.fit, scale = 1, type = ifelse(type == : prediction from a rank-deficient fit may be misleading
# Confusion matrix with threshold of 0.5
#table(testSparse$over50k, predictTest > 0.5)
table(testSparse$Negative, predictions > 0.5)
FALSE TRUE FALSE 253 47 TRUE 27 28
# Logistic regression accuracy on the test set.
sum(253, 28) / sum(253, 28, 47, 27)
[1] 0.7915493