CoCalc Public FilesHW3.ipynbOpen in with one click!
Authors: Laurens Castelijns, Yuri Maas, Freek Rooks
Views : 58
In [ ]:
In [1]:
for x in range(1, 5): print(x)
1 2 3 4
In [2]:
import pandas as pd

Import data

In [3]:
data = pd.read_csv('Web_Anlaytics_Nov_2018_final.csv', sep=None)
/usr/local/lib/python3.6/dist-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support sep=None with delim_whitespace=False; you can avoid this warning by specifying engine='python'. if __name__ == '__main__':
In [4]:
data.head()
Screen Resolution Browser Browser Version Device Type Device OS OS Version User Agent Traffic Source Combination Id ... Returning Visitor Hit Time User Language URL Referring URL City Region Country Goal 1 Converted Goal 1 Converted Time
0 360x640 Chrome Mobile 64.0.3282 Mobile Samsung SM-G935F Android 7 Mozilla/5.0 (Linux Android 7.0 SM-G935F Build/... search_traffic 1 ... False 2018-03-22T08:47:39 en-gb https://www.mastersportal.eu/studies/56292/pha... http://www.google.com/ Bangalore Karnataka India NaN NaN
1 1366x768 Chrome 64.0.3282 Desktop Other Windows 10 NaN Mozilla/5.0 (Windows NT 10.0 Win64 x64) AppleW... unknown 1 ... True 2018-03-22T08:47:40 tr https://www.mastersportal.com/studies/97603/cl... https://www.mastersportal.com/search/ Istanbul Istanbul Turkey NaN NaN
2 1280x800 Chrome 65.0.3325 Tablet Samsung SM-T580 Android 7 Mozilla/5.0 (Linux Android 7.0 SM-T580 Build/N... unknown 2 ... False 2018-03-22T08:47:42 es https://www.mastersportal.com/ https://www.mastersportal.com/?utm_source=goog... Barcelona Catalonia Spain NaN NaN
3 393x786 Chrome Mobile 65.0.3325 Mobile BND-AL10 Android 7 Mozilla/5.0 (Linux Android 7.0 BND-AL10 Build/... unknown 1 ... True 2018-03-22T08:47:41 en-gb https://www.mastersportal.com/universities/181... https://www.mastersportal.com/ranking-country/... Noida Uttar Pradesh India NaN NaN
4 1366x768 Vivaldi 1.95.1077 Desktop Other Windows 10 NaN Mozilla/5.0 (Windows NT 10.0 WOW64) AppleWebKi... search_traffic 1 ... False 2018-03-22T08:47:42 en-us https://www.mastersportal.com/articles/2097/mo... https://www.google.com/ Hanoi An Giang Vietnam NaN NaN

5 rows × 22 columns

Assignment 1

In [5]:
data.columns
Index(['Screen Resolution', 'Browser', 'Browser Version', 'Device Type', 'Device', 'OS', 'OS Version', 'User Agent', 'Traffic Source', 'Combination Id', 'Converted', 'Conversion Time', 'Returning Visitor', 'Hit Time', 'User Language', 'URL', 'Referring URL', 'City', 'Region', 'Country', 'Goal 1 Converted', 'Goal 1 Converted Time'], dtype='object')
In [6]:
data[['Combination Id', 'Converted']].head()
Combination Id Converted
0 1 No
1 1 No
2 2 No
3 1 No
4 1 No
In [7]:
# convert hit time to day of the wee import datetime data['Hit Time'] = pd.to_datetime(data['Hit Time']) data['Hit Time'] = data['Hit Time'].dt.day_name()
In [8]:
data = data.drop(columns = ['Region','Screen Resolution', 'OS Version', 'Device', 'Browser Version', 'Goal 1 Converted', 'Goal 1 Converted Time', 'User Agent', 'URL', 'Referring URL', 'City', 'Conversion Time'])
In [9]:
data.head()
Browser Device Type OS Traffic Source Combination Id Converted Returning Visitor Hit Time User Language Country
0 Chrome Mobile Mobile Android search_traffic 1 No False Thursday en-gb India
1 Chrome Desktop Windows 10 unknown 1 No True Thursday tr Turkey
2 Chrome Tablet Android unknown 2 No False Thursday es Spain
3 Chrome Mobile Mobile Android unknown 1 No True Thursday en-gb India
4 Vivaldi Desktop Windows 10 search_traffic 1 No False Thursday en-us Vietnam

Assignment 2

In [10]:
class Queue: """Create a normal queue""" def __init__(self) -> None: self.queue = [] def isempty(self) -> bool: return len(self.queue) == 0 def len(self): return len(self.queue) def enqueue(self, element: dict) -> None: self.queue.append(element) def dequeue(self) -> dict: return self.queue.pop()
In [11]:
class PriorityQueue: """Create a priority queue (Ordered list with limited spaces)""" def __init__(self, max_size: int): self.max_size = max_size self.queue = [] def __repr__(self) -> str: return str(self.queue) def isempty(self) -> bool: return len(self.queue) == 0 def insert_with_priority(self, quality: float, description: dict) -> None: added = False for i in range(len(self.queue)): if self.queue[i][0] < quality: self.queue = self.queue[:i] + [(quality, description)] + self.queue[i:] added = True break if not added: self.queue = self.queue + [(quality, description)] self.queue = self.queue[:self.max_size] def get_front_element(self) -> dict: """Returns and removes an element""" #self.queue.reverse() It's not required to return the best one first, we can also start with the worst one front_element = self.queue.pop() #self.queue.reverse() # Only return the description, not the associated quality return front_element[1]
In [12]:
def create_subset(description, new_subset): """ Create a subset of the dataset using the description """ # Create a subset using all descriptors for attribute in description: # For numerical attributes, check if the value is bigger or smaller than splitpoint if 'groot_' in attribute: new_subset = new_subset[new_subset[attribute[6:]] >= description[attribute]] elif 'klein_' in attribute: new_subset = new_subset[new_subset[attribute[6:]] <= description[attribute]] # For nominal attributes, check if the value is present or not elif 'goed_' in attribute: new_subset = new_subset[new_subset[attribute[5:]] == description[attribute]] elif 'fout_' in attribute: new_subset = new_subset[new_subset[attribute[5:]] != description[attribute]] # For binary attributes, check if the value is present else: new_subset = new_subset[new_subset[attribute] == description[attribute]] # Return the subset return new_subset
In [13]:
def quality_measure(description: dict, dataset: pd.core.frame.DataFrame, targets: list, evaluation_method: callable, weights: dict = None) -> float: """ Create a subset of the dataset using the description Evaluate this subset and return the quality measure. """ subset = create_subset(description, dataset) # Evaluate the subset if not weights is None: return evaluation_method(subset, targets, description, weights) return evaluation_method(subset, targets)
In [14]:
import math class Evaluation: """ Different evaluation methods """ def basic_Yuri_evaluation(subset: pd.core.frame.DataFrame, targets: list) -> float: """Evaluation based on total conversions Requires a 'Converted' and 'Combination Id' column in the dataset""" conversions = 0 for row_num, info in subset.iterrows(): if info['Converted'] == 'Yes' and info['Combination Id'] == 1: conversions += 1 elif info['Converted'] == 'Yes' and info['Combination Id'] == 2: conversions += 1 # Current quality = Amount of conversions / Amount of test subjects in subset NOTE: For full dataset: 4470 / 344785 return conversions/(subset.size / subset.columns.size) def Yules_Q(subset: pd.core.frame.DataFrame, targets: list) -> float: subset = subset[targets] combinations = data.groupby(targets).count() n1 = subset[(subset[targets[0]] == combinations.index[0][0]) & (subset[targets[1]] == combinations.index[0][1])].size n2 = subset[(subset[targets[0]] == combinations.index[2][0]) & (subset[targets[1]] == combinations.index[2][1])].size n3 = subset[(subset[targets[0]] == combinations.index[1][0]) & (subset[targets[1]] == combinations.index[1][1])].size n4 = subset[(subset[targets[0]] == combinations.index[3][0]) & (subset[targets[1]] == combinations.index[3][1])].size Q = (n1 * n4 - n2 * n3) / (n1 * n4 + n2 * n3) subgroup_n = subset.shape[0] N = data.shape[0] complement_n = N - subgroup_n if subgroup_n == 0 or complement_n == 0: entropy = 0 else: entropy = - subgroup_n / N * math.log(subgroup_n / N) - complement_n / N * math.log(complement_n / N) return Q * entropy def weighted_yules_q(subset: pd.core.frame.DataFrame, targets: list, description: dict, weights: dict) -> float: """Takes into account the weights given to the attributes on which the data is subsetted""" total_weight = 1 # product of attribute weights for attribute in weights.keys(): for attribute_description in description.keys(): if attribute in attribute_description: total_weight *= weights[attribute] return Evaluation.Yules_Q(subset, targets) * total_weight # call to original Yules_Q(), multiplied with the weight
In [15]:
def refinement(description: dict, dataset: pd.core.frame.DataFrame, targets: list, bins: int) -> list: """ Refine the description by adding a condition to it Output: Set of descriptions """ # Create a subset so that only possible subsets are considered for refinement subset = create_subset(description, dataset) refined_set_of_descriptions = [] for attribute in subset: if attribute in description or attribute in targets: continue # Types of data: Numerical, Nominal, Binary. BTW everything is a string in the pd dataset (except NaN) # For continues datatypes, create bins with equal amounts of test subjects in between cut-offs if False: # type(subset[attribute][0]) == 'numerical': # Do we even have numerical data? list_of_values = [value for value in subset[attribute]] sorted_values = sorted(list_of_values) n = len(sorted_values) # Create bins from 1 to bins-1 (Python already stops at bins-1) for i in range(bins): if i != 0: # Determine the place to split and find the value, add that as constraint to the description splitposition = int(i*(n/bins)) splitpoint = sorted_values[splitposition] refined_set_of_descriptions.append({**description, **{'klein_'+attribute: splitpoint}}) refined_set_of_descriptions.append({**description, **{'groot_'+attribute: splitpoint}}) elif len(subset[attribute].unique()) == 2: # For binary datatype, refine by adding every possible value as constraint to the description for binary_value in subset[attribute].unique(): refined_set_of_descriptions.append({**description, **{attribute: binary_value}}) else: # For nominal datatypes, refine by adding every possible value twice, once when it holds, and once when it doens't for nominal_value in subset[attribute].unique(): refined_set_of_descriptions.append({**description, **{'goed_'+attribute: nominal_value}}) refined_set_of_descriptions.append({**description, **{'fout_'+attribute: nominal_value}}) return refined_set_of_descriptions
In [16]:
def SATISFIESALL(description: dict, constraints: dict) -> bool: """Tests if the descriptions satisfies the set constraints, but do we have constraints?""" return True
In [17]:
def beamsearch(dataset: pd.core.frame.DataFrame, target_columns: list, result_size: int, beam_width: int, beam_depth: int, nr_bins: int, evaluation_method: callable, constraints: dict, weights: dict = None) -> PriorityQueue: """The Beam Search Function""" candidateQueue = Queue() # What do we still need to explore this level? candidateQueue.enqueue({}) resultSet = PriorityQueue(result_size) # What are the final best subsets/descriptions? # At every level, create a new beam to store the 'best' subsets for level in range(1, beam_depth): beam = PriorityQueue(beam_width) # What are the best subsets/descriptions found in this level currently? # If there are THINGS in the candidate queue, refine these to create a new set of descriptions while not candidateQueue.isempty(): print("Candidates still in queue this level: {}".format(candidateQueue.len())) seed = candidateQueue.dequeue() refined_set = refinement(seed, dataset, target_columns, nr_bins) # Find the associated quality for every refinement made for description in refined_set: quality = quality_measure(description, dataset, target_columns, evaluation_method, weights) # If the refinement is valid, add it to the beam and resultset if SATISFIESALL(description, constraints): # If the quality of the refinement is too low and the lists are full, it still won't be inserted resultSet.insert_with_priority(quality, description) beam.insert_with_priority(quality, description) # At the end of the level, put all beam (top-quality) descriptions in for next level while not beam.isempty(): candidateQueue.enqueue(beam.get_front_element()) print('End of level {}'.format(level)) # After all levels are done, return the q (result_size) best descriptions found return resultSet

Run beamsearch below

In [ ]:
beamsearch(dataset=data, target_columns=['Converted', 'Combination Id'], result_size=10, beam_width=20, beam_depth=4, nr_bins=20, evaluation_method=Evaluation.Yules_Q, constraints={})
In [17]:
weights = dict() weights['PS'] = 1.5 weights['Hit Time'] = 2 weights['Country'] = 0.5 beamsearch(dataset=data[:50], target_columns=['Converted', 'Combination Id'], result_size=10, beam_width=10, beam_depth=4, nr_bins=10, evaluation_method=Evaluation.weighted_yules_q, constraints = {}, weights = weights)
Candidates still in queue this level: 1
/usr/local/lib/python3.6/dist-packages/ipykernel/__main__.py:25: RuntimeWarning: invalid value encountered in long_scalars
End of level 1 Candidates still in queue this level: 10 Candidates still in queue this level: 9 Candidates still in queue this level: 8 Candidates still in queue this level: 7 Candidates still in queue this level: 6 Candidates still in queue this level: 5 Candidates still in queue this level: 4 Candidates still in queue this level: 3 Candidates still in queue this level: 2 Candidates still in queue this level: 1 End of level 2 Candidates still in queue this level: 10 Candidates still in queue this level: 9 Candidates still in queue this level: 8 Candidates still in queue this level: 7 Candidates still in queue this level: 6 Candidates still in queue this level: 5 Candidates still in queue this level: 4 Candidates still in queue this level: 3 Candidates still in queue this level: 2 Candidates still in queue this level: 1 End of level 3 Candidates still in queue this level: 10 Candidates still in queue this level: 9 Candidates still in queue this level: 8 Candidates still in queue this level: 7 Candidates still in queue this level: 6 Candidates still in queue this level: 5 Candidates still in queue this level: 4 Candidates still in queue this level: 3 Candidates still in queue this level: 2 Candidates still in queue this level: 1 End of level 4
[(nan, {'goed_Browser': 'Chrome Mobile'}), (nan, {'fout_Browser': 'Chrome Mobile'}), (nan, {'goed_Browser': 'Chrome'}), (nan, {'fout_Browser': 'Chrome'}), (nan, {'goed_Browser': 'Vivaldi'}), (nan, {'fout_Browser': 'Vivaldi'}), (nan, {'goed_Browser': 'Chrome Mobile iOS'}), (nan, {'fout_Browser': 'Chrome Mobile iOS'}), (nan, {'goed_Browser': 'Mobile Safari'}), (nan, {'fout_Browser': 'Mobile Safari'})]
In [ ]:
In [ ]:
In [ ]: