Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download

Data Manipulation, Creation of Databases + Netflix Problem

Views: 903
## AUTOMATION PROJECT FOR HOOCH Inc. (BOT) ##
# Automation algorithm used to improve HOOCH Inc's marketing campaign on the
# optimal social network for the company: Instagram.
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 24 22:18:00 2018

@author: Florent

Instagram bot: improves the influence of the company on Instagram.
It runs in an endless loop, so it needs no breaks and can work indefinitely.
"""
# Importing the libraries which will be used
# BUG FIX: sys, time and random were used below but never imported.
import sys
import time
import random

from selenium import webdriver
from selenium.webdriver.common.keys import Keys


def print_same_line(text):
    """Print *text* and return the cursor to the line start so the next call overwrites it."""
    sys.stdout.write(text)
    sys.stdout.flush()
    sys.stdout.write('\r')
    sys.stdout.flush()


class InstagramBot:
    """Automates logging into Instagram and liking photos for a given hashtag."""

    def __init__(self, username, password):
        # Store the credentials and start the browser.
        self.username = username
        self.password = password
        self.driver = webdriver.Chrome()  # Defining the browser. It is possible to use Firefox as well

    def closeBrowser(self):
        """Close the browser if the program comes across any irregularity."""
        self.driver.close()

    def login(self):
        """Open Instagram and log in with the stored credentials."""
        driver = self.driver
        driver.get("https://www.instagram.com/")  # Go to Instagram's website
        time.sleep(2)  # Keep emulating a human's behavior by not processing too fast
        login_button = driver.find_element_by_xpath("//a[@href='/accounts/login/?source=auth_switcher']")
        login_button.click()
        time.sleep(2)
        user_name_elem = driver.find_element_by_xpath("//input[@name='username']")
        user_name_elem.clear()  # Clear anything previously written in the username box
        user_name_elem.send_keys(self.username)
        password_elem = driver.find_element_by_xpath("//input[@name='password']")
        password_elem.clear()  # Clear anything previously written in the password box
        # BUG FIX: the original never typed the password nor submitted the form,
        # so the login could never complete.
        password_elem.send_keys(self.password)
        password_elem.send_keys(Keys.RETURN)
        time.sleep(2)

    def like_photo(self, hashtag):
        """Scroll the hashtag feed, collect unique photo links and like each one."""
        # BUG FIX: the original referenced an undefined global `driver` (NameError)
        # and never navigated to the hashtag page before scraping.
        driver = self.driver
        driver.get("https://www.instagram.com/explore/tags/" + hashtag + "/")
        time.sleep(2)  # Sleep between moves so the bot mimics a human

        # Gathering photos
        pic_hrefs = []
        for _ in range(1, 7):
            try:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)
                # Getting tags based on the href HTML attribute
                hrefs_in_view = driver.find_elements_by_tag_name('a')
                hrefs_in_view = [elem.get_attribute('href') for elem in hrefs_in_view
                                 if '.com/p/' in elem.get_attribute('href')]
                # Building a list of unique photos
                for href in hrefs_in_view:
                    if href not in pic_hrefs:
                        pic_hrefs.append(href)
            except Exception:
                continue

        # Liking photos
        unique_photos = len(pic_hrefs)
        for pic_href in pic_hrefs:
            driver.get(pic_href)
            time.sleep(2)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            try:
                time.sleep(random.randint(2, 6))
                # BUG FIX: the original wrapped the click in a lambda and then
                # called .click() on its None return value, which always raised.
                like_button = driver.find_element_by_xpath('//span[@aria-label="Like"]')
                like_button.click()
                for second in reversed(range(0, random.randint(18, 28))):
                    print_same_line("#" + hashtag + ': unique photos left: '
                                    + str(unique_photos) + " | Sleeping " + str(second))
                    time.sleep(1)
            except Exception:
                time.sleep(2)
            unique_photos -= 1


username = ""  # Enter your instagram username here
password = ""  # Enter your instagram password here
ig = InstagramBot(username, password)
# BUG FIX: the original only called login() after a failure, never initially.
ig.login()
hashtags = ['nyc']  # Enter an example of hashtag(s) here. A default example of hashtag is nyc
while True:
    try:
        # Choose a random tag from the list of tags
        tag = random.choice(hashtags)
        ig.like_photo(tag)
    except Exception:
        # On any irregularity: restart the browser, wait, and log back in.
        ig.closeBrowser()
        time.sleep(60)
        ig = InstagramBot(username, password)
        ig.login()
## END OF THE CODES


## AWS DATABASES TESTING ##
## WEBSITE PERFORMANCE AND REACTION TESTER ##
## The program I use to test the reaction of my website (embedded on AWS) to a
## high amount of requests in a short amount of time. ##
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 1 18:11:45 2019

@author: Florent
"""
from selenium import webdriver
import time
import random
import sys


class PipeLineSpeedTest:
    """Repeatedly logs into the website to measure how the server reacts under load."""

    def __init__(self, username, password):
        # Store the credentials and start the browser.
        self.username = username
        self.password = password
        self.driver = webdriver.Chrome()  # Defining the browser. It is possible to use Firefox instead

    def closeBrowser(self):
        """Close the browser."""
        self.driver.close()

    def login(self):
        """Open the website, fill in the credentials and submit the login form."""
        driver = self.driver
        driver.get(" Website still is in developpement, it doesn't so look good at this time.. ")  # TODO: enter the website http address here
        time.sleep(2)
        # Find and click the login button on the website page
        login_button = driver.find_element_by_xpath('//*[@id="rightTopnav"]/li[2]/div/a')
        login_button.click()
        time.sleep(2)  # give the browser some time so it can move onto the next page
        # Fill the username box
        user_name_elem = driver.find_element_by_xpath('//*[@id="loginForm"]/input[1]')
        user_name_elem.clear()  # clear anything that has been written in the box
        user_name_elem.send_keys(self.username)
        # Fill the password box
        password_elem = driver.find_element_by_xpath('//*[@id="loginForm"]/input[2]')
        password_elem.clear()
        password_elem.send_keys(self.password)
        # Submit the form
        login_button1 = driver.find_element_by_xpath('//*[@id="loginForm"]/button')
        login_button1.click()
        time.sleep(2)
        # //*[@id="loginForm"]/input[1] # Xpath
        # //*[@id="loginForm"]/input[2] # Xpath


username = "TEST"
password = "BOT1"
# BUG FIX: the original called test.login() but never created the `test` object.
test = PipeLineSpeedTest(username, password)
for i in range(1, 7):  # getting 6 requests
    # Repeatedly sending the same request to test the bandwidth and to see how
    # the server reacts while processing a higher volume of requests quickly.
    test.login()
## END OF THE PROJECT ##

## DATA MINING PROJECT FOR HOOCH Inc. %r --- title: "Data Manipulation, and creation of DataBases for HOOCH Inc. using data scraping" author: "Florent Rudel" date: "Sep 05, 2018" output: pdf_document --- Programing language used: R (Studio) Objective: The objective of the project is to use Twitter data to study which high followers/high engagers accounts are likely to retweet a tweet by any nonprofit on our list. Summary: I have created an account on twitter in order to extract data for the study.
After getting Twitters approval, I used ASPCA as username, to extract a sample of 10000 tweets (a larger amount of data could be extracted but I limited the number of tweets to 10000), with their profiles names, the number of retweets, the account ID, the number of time the accounts have been favorited, from Twitter. I stored the data under a ".csv" file in order to be able work with Microsoft Excel as well for a better visualization. The internship supervisor proposed me to pull a large volume of tweets from the most heavily followed celebrities and then look for all the handles of the top nonprofits. Choosing the celebrity Cristiano Ronaldo due to the larger amount of followers his account has, I pulled 10000 tweets. Reading couple of articles on R library, combining with your approach I found out we can perform an analysis of emojis in order to understand the level of engagement of celeb and Twitter VIPs. For example, we can create a database emojis and classify them in two categories: positive emojis (such as smileys, hearts etc), negative emojis (the broken heart emoji, the angry face emoji etc), and neutral emojis (let say the emojis which are neither positive nor negative). After that we can count and compare the number of positive and negative emojis present in the tweets database. Finally we can draw conclusions about the level of engagement (high or low) based on a standardized ratio we can create (amount of positive emojis/amount of negative emojis for example). Also, regarding the fact that we can combine R with Microsoft excel, after extracting and formatting the tweets from Twitter, it will be faster and simpler to identify, count, and classify the emojis by using Microsoft Excel. Another technique which can be used is: the study and the classification of type of words inside the tweets database. 
We could have three categories of words:the positive words (example: Great, Awesome...), the negative words (Bad, But, Terrible...), and the neutral words (let say the words which are neither positive nor negative). By combining Microsoft excel to R, the words can be identified and categorized in a faster and simpler way. In the meantime I have been thinking about alternatives techniques that can be used to understand the level of engagement. After many observations of several posts, I realized that: the more "Likes" a tweet has, the more likely people would comment or retweet this tweet. As well as these people who will eventually comment and retweet, will themselves draw the attention of other people, therefore they will drive the other people to like and retweet. These people might themselves want to draw attention on their own accounts by retweeting, or liking tweets which have a lot of likes and retweets (which are more likely celeb and Twitter VIPs tweets). The study is to be continued... Tools: R , Microsoft Excel. Programming language R. 
Microsoft Excel (combined) First step: install "TwitteR" package # Twitter Identification ```{r} #Call function library(twitteR) ``` Load Library ```{r} # Access parameters granted by Twitter to my Twitter account: these are account-access parameters automatically generated by Twitter for each account interested in extracting data from Twitter. NOTE(review): real credentials were hard-coded here and have been redacted — never commit live API secrets; the originals should be revoked and regenerated. api_key <- "<REDACTED_API_KEY>" api_secret <- "<REDACTED_API_SECRET>" access_token <- "<REDACTED_ACCESS_TOKEN>" access_token_secret <- "<REDACTED_ACCESS_TOKEN_SECRET>" setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret) 1 # Enter 1 on the console to get the authorization access ``` # Getting tweets from the 1000 first followers of the non-profit organisation called ASPCA ```{r} ASPCA <- getUser('ASPCA') ASPCA$getDescription() ASPCA$followersCount ASPCA$friendsCount ASPCA$getFriends(n=1000) ``` # Getting the 10000 most recent tweets, including retweets, of the non-profit company named ASPCA and pasting them into a dataframe in csv format which can be read by using Microsoft Excel ```{r} account <- "ASPCA" account.timeline <- userTimeline(account, n = 100, includeRts = TRUE) #includeRts = TRUE to extract retweets as well TrialDF <- twListToDF(account.timeline) file.timeline <- paste(account, "ASPCA.csv", sep = " ") write.csv(TrialDF, file.timeline) # creating the csv file ``` Read file # necessary step if we want to process the data using R instead of Microsoft Excel ```{r} ASPCATw <- read.csv(file.choose(), header = T) ``` # Creating a string with the extracted data ```{r} str(ASPCATw) ``` # Build corpus to process data in R ```{r} library(tm) corpus <- (ASPCATw$text) corpus <- Corpus(VectorSource(corpus)) inspect(corpus[1:5]) # to inspect first five tweets ``` # Personalizing the data cleaning ```{r} corpus <- tm_map(corpus, tolower) #corpus <- tm_map(corpus, removePunctuation) corpus <- tm_map(corpus,
removeNumbers) inspect(corpus[1:100]) ``` # Getting tweets by using another approach, This one is more specific because it searches the Keyword 'ASPCA' in the entire database of Twitter ```{r} tweets <- searchTwitter('ASPCA', n=10, lang = 'en') TrialDF <- twListToDF(tweets) file.timeline <- paste(account, "tweet$s.csv", sep = "") write.csv(TrialDF, file.timeline) # creating the Excel file ``` # Reading the file R instead of Microsoft excel ```{r} ASPCAtweets <- read.csv(file.choose(), header = T) ``` Build corpus ```{r} corpus <- (ASPCAtweets$text) corpus <- Corpus(VectorSource(corpus)) inspect(corpus[1:5]) ``` # Focusing on a celebrity first then looking at his engagement level concerning the non-profit organisations ```{r} tweets <- searchTwitter('lounge', n=10, lang = 'en') # I have restricted the language of tweets to English because I was asked to see the interest for American non profit organisations TrialDF <- twListToDF(tweets) file.timeline <- paste(tweets, "Lounges.csv", sep = "") write.csv(TrialDF, file.timeline) # creating the Excel file ``` # Analysis Based on the different results of different approaches which are: first getting the company followers' name and study these followers' behavior for each case, and second focusing on a celebrity first then looking at his engagement level concerning the non-profit organisations the easiest approach appeared to be: focusing on one heavily followed celebrity then look for his engagement toward non-profit organisations. After extracting 10000 Cristiano Ronaldo account's tweets and parse them into a csv file, I Proceeded to an analysis of emojis. Creating a data base characters (and its corresponding to emojis, for instance the smiley emoji character is ":)"), it is easy to collect, count and classify the total amount of each emoji using Microsoft Excel. 
Knowing that for example a smiley correspond to the this character ":)" all we have to do is to initiate the research engine by pressing "Ctrl + F" then enter the emoji's character, for each positive emoji, as well as each negative emoji, finally extract the total number of each type of emoji. Based on the ratio (positive/negative emoji), we can know whether the celeb is (positively) engaged or not. I also pulled a list non the top non-profit organisations using Google. After getting the non profit organisation's name, I went back to my csv file, and initiated the research engine by pressing "Ctrl + F" then I entered each non-profit company's names to see if Cristiano Ronaldo has ever tweeted any keywords (such as the company's name, or the company's tweeter profil name) concerning these non-profits company over his timeline. # Creation of venues Algorithm Data_scraping for HOOCH Inc. # # Objective The objective is to create about 10000 venues ```{r} tweets <- searchTwitter('lounge', n=13000, lang = 'en') # I entered 13000 instead of 10000 because I need to overcome the issue of data duplication. I came across the issue of duplication of data. An extra 3000 tweets extraction is to provide an additional amount of tweets in the case of duplication. TrialDF <- twListToDF(tweets) file.timeline <- paste(account, "tweet$s.csv", sep = "") write.csv(TrialDF, file.timeline) ``` ## NETFLIX PROJECT ## # Programming language used: SageMath # Analysis # $1 Increase streaming service # As result of the project I found that I would advise Netflix not to consider the implementation of the $1 decrease because it would make Netflix lose money #S(t) = 1300000 *(1-.23*t)# Total amount of subscribers #p(t) = 8.99 + t # Monthly price of streaming subscription #P(t) = S(t) * p(t) # Revenue
#plot(P(t))
solve(diff(P(t),t)==0,t)
[t == (-10677/4600)]
#Maximal revenue n(P(-10677/4600))
1.32978459532609e7
#Sensitivity Analysis S(t,s) = 1300000 *(1-s*t) p(t) = 8.99 + t P(t,s) = S(t,s) * p(t)
solve(diff(P(t,s),t)==0,t)
[t == -1/200*(899*s - 100)/s]
tmax(s) = -1/200*(899*s - 100)/s
Stmax(s) = tmax.diff(s)*(s/tmax(s))
show(Stmax(s).full_simplify())
$\displaystyle \frac{100}{899 \, s - 100}$
n(Stmax(1300000))
8.55651651891131e-8
#Sensitivity analysis of the maximal revenue #SmaxRev = diff(P(-1/200*(899*s - 100)/s,s),s)*s/P(-1/200*(899*s - 100)/s,s)
#show(SmaxRev)
$\displaystyle -\frac{0.0307692307692308 \, \left(-32.5000000000000 \, \left(899 \, s + 100\right) \left(-\frac{1.00000000000000 \, \left(899 \, s - 100\right)}{s^{2}} + \frac{899.000000000000}{s}\right) - \frac{29217.5000000000 \, \left(899 \, s - 100\right)}{s} + 5.25330650000000 \times 10^{7}\right) s}{\left(899 \, s + 100\right) \left(\frac{1.00000000000000 \, \left(899 \, s - 100\right)}{s} - 1798.00000000000\right)}$
#n(SmaxRev(1300000))
0.999999828869699
#SmaxRev(1) - SmaxRev(10677/4600) # I would advise Netflix not to consider the implementation of the $1 decrease because it would make Netflix lose money.
-0.108736388503510