CoCalc -- HW3.ipynb

Project: hw3

Views: ⁴⁴⁴

Kernel: Python 3 (system-wide)

In [0]:

In [1]:

for x in range(1, 5):
    print(x)

In [2]:

import pandas as pd

Import data

In [3]:

data = pd.read_csv('Web_Anlaytics_Nov_2018_final.csv', sep=None)

/usr/local/lib/python3.6/dist-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support sep=None with delim_whitespace=False; you can avoid this warning by specifying engine='python'.
  if __name__ == '__main__':

In [4]:

data.head()

	Screen Resolution	Browser	Browser Version	Device Type	Device	OS	OS Version	User Agent	Traffic Source	Combination Id	...	Returning Visitor	Hit Time	User Language	URL	Referring URL	City	Region	Country	Goal 1 Converted	Goal 1 Converted Time
0	360x640	Chrome Mobile	64.0.3282	Mobile	Samsung SM-G935F	Android	7	Mozilla/5.0 (Linux Android 7.0 SM-G935F Build/...	search_traffic	1	...	False	2018-03-22T08:47:39	en-gb	https://www.mastersportal.eu/studies/56292/pha...	http://www.google.com/	Bangalore	Karnataka	India	NaN	NaN
1	1366x768	Chrome	64.0.3282	Desktop	Other	Windows 10	NaN	Mozilla/5.0 (Windows NT 10.0 Win64 x64) AppleW...	unknown	1	...	True	2018-03-22T08:47:40	tr	https://www.mastersportal.com/studies/97603/cl...	https://www.mastersportal.com/search/	Istanbul	Istanbul	Turkey	NaN	NaN
2	1280x800	Chrome	65.0.3325	Tablet	Samsung SM-T580	Android	7	Mozilla/5.0 (Linux Android 7.0 SM-T580 Build/N...	unknown	2	...	False	2018-03-22T08:47:42	es	https://www.mastersportal.com/	https://www.mastersportal.com/?utm_source=goog...	Barcelona	Catalonia	Spain	NaN	NaN
3	393x786	Chrome Mobile	65.0.3325	Mobile	BND-AL10	Android	7	Mozilla/5.0 (Linux Android 7.0 BND-AL10 Build/...	unknown	1	...	True	2018-03-22T08:47:41	en-gb	https://www.mastersportal.com/universities/181...	https://www.mastersportal.com/ranking-country/...	Noida	Uttar Pradesh	India	NaN	NaN
4	1366x768	Vivaldi	1.95.1077	Desktop	Other	Windows 10	NaN	Mozilla/5.0 (Windows NT 10.0 WOW64) AppleWebKi...	search_traffic	1	...	False	2018-03-22T08:47:42	en-us	https://www.mastersportal.com/articles/2097/mo...	https://www.google.com/	Hanoi	An Giang	Vietnam	NaN	NaN

5 rows × 22 columns

Assignment 1

In [5]:

data.columns

Index(['Screen Resolution', 'Browser', 'Browser Version', 'Device Type',
       'Device', 'OS', 'OS Version', 'User Agent', 'Traffic Source',
       'Combination Id', 'Converted', 'Conversion Time', 'Returning Visitor',
       'Hit Time', 'User Language', 'URL', 'Referring URL', 'City', 'Region',
       'Country', 'Goal 1 Converted', 'Goal 1 Converted Time'],
      dtype='object')

In [6]:

data[['Combination Id', 'Converted']].head()

	Combination Id	Converted
0	1	No
1	1	No
2	2	No
3	1	No
4	1	No

In [7]:

# convert hit time to day of the wee
import datetime
data['Hit Time'] = pd.to_datetime(data['Hit Time'])
data['Hit Time'] = data['Hit Time'].dt.day_name()

In [8]:

data = data.drop(columns = ['Region','Screen Resolution', 'OS Version', 'Device', 'Browser Version', 'Goal 1 Converted', 'Goal 1 Converted Time', 'User Agent', 'URL', 'Referring URL', 'City', 'Conversion Time'])

In [9]:

data.head()

	Browser	Device Type	OS	Traffic Source	Combination Id	Converted	Returning Visitor	Hit Time	User Language	Country
0	Chrome Mobile	Mobile	Android	search_traffic	1	No	False	Thursday	en-gb	India
1	Chrome	Desktop	Windows 10	unknown	1	No	True	Thursday	tr	Turkey
2	Chrome	Tablet	Android	unknown	2	No	False	Thursday	es	Spain
3	Chrome Mobile	Mobile	Android	unknown	1	No	True	Thursday	en-gb	India
4	Vivaldi	Desktop	Windows 10	search_traffic	1	No	False	Thursday	en-us	Vietnam

Assignment 2

In [10]:

class Queue:
    """Create a normal queue"""

    def __init__(self) -> None:
        self.queue = []

    def isempty(self) -> bool:
        return len(self.queue) == 0
    
    def len(self):
        return len(self.queue)

    def enqueue(self, element: dict) -> None:
        self.queue.append(element)

    def dequeue(self) -> dict:
        return self.queue.pop()

In [11]:

class PriorityQueue:
    """Create a priority queue (Ordered list with limited spaces)"""

    def __init__(self, max_size: int):
        self.max_size = max_size
        self.queue = []

    def __repr__(self) -> str:
        return str(self.queue)

    def isempty(self) -> bool:
        return len(self.queue) == 0

    def insert_with_priority(self, quality: float, description: dict) -> None:
        added = False
        for i in range(len(self.queue)):
            if self.queue[i][0] < quality:
                self.queue = self.queue[:i] + [(quality, description)] + self.queue[i:]
                added = True
                break
        if not added:
            self.queue = self.queue + [(quality, description)]
        self.queue = self.queue[:self.max_size]

    def get_front_element(self) -> dict:
        """Returns and removes an element"""
        #self.queue.reverse()  It's not required to return the best one first, we can also start with the worst one
        front_element = self.queue.pop()
        #self.queue.reverse()
        # Only return the description, not the associated quality
        return front_element[1]

In [12]:

def create_subset(description, new_subset):
    """
    Create a subset of the dataset using the description
    """
    # Create a subset using all descriptors
    for attribute in description:
        # For numerical attributes, check if the value is bigger or smaller than splitpoint
        if 'groot_' in attribute:
            new_subset = new_subset[new_subset[attribute[6:]] >= description[attribute]]
        elif 'klein_' in attribute:
            new_subset = new_subset[new_subset[attribute[6:]] <= description[attribute]]
        # For nominal attributes, check if the value is present or not
        elif 'goed_' in attribute:
            new_subset = new_subset[new_subset[attribute[5:]] == description[attribute]]
        elif 'fout_' in attribute:
            new_subset = new_subset[new_subset[attribute[5:]] != description[attribute]]
        # For binary attributes, check if the value is present
        else:
            new_subset = new_subset[new_subset[attribute] == description[attribute]]
    # Return the subset
    return new_subset

In [13]:

def quality_measure(description: dict, dataset: pd.core.frame.DataFrame,
                    targets: list, evaluation_method: callable, weights: dict = None) -> float:
    """
    Create a subset of the dataset using the description
    Evaluate this subset and return the quality measure.
    """
    subset = create_subset(description, dataset)

    #  Evaluate the subset
    if not weights is None:
        return evaluation_method(subset, targets, description, weights)
    return evaluation_method(subset, targets)

In [14]:

import math
class Evaluation:
    """ Different evaluation methods """

    def basic_Yuri_evaluation(subset: pd.core.frame.DataFrame, targets: list) -> float:
        """Evaluation based on total conversions
        Requires a 'Converted' and 'Combination Id' column in the dataset"""
        conversions = 0
        for row_num, info in subset.iterrows():
            if info['Converted'] == 'Yes' and info['Combination Id'] == 1:
                conversions += 1
            elif info['Converted'] == 'Yes' and info['Combination Id'] == 2:
                conversions += 1
        # Current quality = Amount of conversions / Amount of test subjects in subset NOTE: For full dataset: 4470 / 344785
        return conversions/(subset.size / subset.columns.size)

    def Yules_Q(subset: pd.core.frame.DataFrame, targets: list) -> float:
        subset = subset[targets]
        combinations = data.groupby(targets).count()
        n1 = subset[(subset[targets[0]] == combinations.index[0][0]) & (subset[targets[1]] == combinations.index[0][1])].size
        n2 = subset[(subset[targets[0]] == combinations.index[2][0]) & (subset[targets[1]] == combinations.index[2][1])].size
        n3 = subset[(subset[targets[0]] == combinations.index[1][0]) & (subset[targets[1]] == combinations.index[1][1])].size
        n4 = subset[(subset[targets[0]] == combinations.index[3][0]) & (subset[targets[1]] == combinations.index[3][1])].size

        Q = (n1 * n4 - n2 * n3) / (n1 * n4 + n2 * n3)

        subgroup_n = subset.shape[0]
        N = data.shape[0]
        complement_n = N - subgroup_n

        if subgroup_n == 0 or complement_n == 0:
            entropy = 0
        else:
            entropy =  - subgroup_n / N * math.log(subgroup_n / N) - complement_n / N * math.log(complement_n / N)

        return Q * entropy

    def weighted_yules_q(subset: pd.core.frame.DataFrame, targets: list, description: dict, weights: dict) -> float:
        """Takes into account the weights given to the attributes on which the data is subsetted"""
        total_weight = 1  # product of attribute weights
        for attribute in weights.keys():
            for attribute_description in description.keys():
                if attribute in attribute_description:
                    total_weight *= weights[attribute]

        return Evaluation.Yules_Q(subset, targets) * total_weight  # call to original Yules_Q(), multiplied with the weight

In [15]:

def refinement(description: dict, dataset: pd.core.frame.DataFrame, targets: list, bins: int) -> list:
    """
    Refine the description by adding a condition to it
    Output: Set of descriptions
    """
    # Create a subset so that only possible subsets are considered for refinement
    subset = create_subset(description, dataset)

    refined_set_of_descriptions = []
    for attribute in subset:
        if attribute in description or attribute in targets:
            continue

        # Types of data: Numerical, Nominal, Binary. BTW everything is a string in the pd dataset (except NaN)
        # For continues datatypes, create bins with equal amounts of test subjects in between cut-offs
        if False:  # type(subset[attribute][0]) == 'numerical':  # Do we even have numerical data?
            list_of_values = [value for value in subset[attribute]]
            sorted_values = sorted(list_of_values)
            n = len(sorted_values)
            # Create bins from 1 to bins-1 (Python already stops at bins-1)
            for i in range(bins):
                if i != 0:
                    # Determine the place to split and find the value, add that as constraint to the description
                    splitposition = int(i*(n/bins))
                    splitpoint = sorted_values[splitposition]
                    refined_set_of_descriptions.append({**description, **{'klein_'+attribute: splitpoint}})
                    refined_set_of_descriptions.append({**description, **{'groot_'+attribute: splitpoint}})
        elif len(subset[attribute].unique()) == 2:
            # For binary datatype, refine by adding every possible value as constraint to the description
            for binary_value in subset[attribute].unique():
                refined_set_of_descriptions.append({**description, **{attribute: binary_value}})
        else:
            # For nominal datatypes, refine by adding every possible value twice, once when it holds, and once when it doens't
            for nominal_value in subset[attribute].unique():
                refined_set_of_descriptions.append({**description, **{'goed_'+attribute: nominal_value}})
                refined_set_of_descriptions.append({**description, **{'fout_'+attribute: nominal_value}})
    return refined_set_of_descriptions

In [16]:

def SATISFIESALL(description: dict, constraints: dict) -> bool:
    """Tests if the descriptions satisfies the set constraints, but do we have constraints?"""
    return True

In [17]:

def beamsearch(dataset: pd.core.frame.DataFrame,
               target_columns: list,
               result_size: int,
               beam_width: int,
               beam_depth: int,
               nr_bins: int,
               evaluation_method: callable,
               constraints: dict,
               weights: dict = None) -> PriorityQueue:
    """The Beam Search Function"""
    candidateQueue = Queue()  # What do we still need to explore this level?
    candidateQueue.enqueue({})
    resultSet = PriorityQueue(result_size)  # What are the final best subsets/descriptions?

    # At every level, create a new beam to store the 'best' subsets
    for level in range(1, beam_depth):
        beam = PriorityQueue(beam_width)   # What are the best subsets/descriptions found in this level currently?

        # If there are THINGS in the candidate queue, refine these to create a new set of descriptions
        while not candidateQueue.isempty():
            print("Candidates still in queue this level: {}".format(candidateQueue.len()))
            seed = candidateQueue.dequeue()
            refined_set = refinement(seed, dataset, target_columns, nr_bins)

            # Find the associated quality for every refinement made
            for description in refined_set:
                quality = quality_measure(description, dataset, target_columns, evaluation_method, weights)
                # If the refinement is valid, add it to the beam and resultset
                if SATISFIESALL(description, constraints):
                    # If the quality of the refinement is too low and the lists are full, it still won't be inserted
                    resultSet.insert_with_priority(quality, description)
                    beam.insert_with_priority(quality, description)

        # At the end of the level, put all beam (top-quality) descriptions in for next level
        while not beam.isempty():
            candidateQueue.enqueue(beam.get_front_element())
        print('End of level {}'.format(level))

    # After all levels are done, return the q (result_size) best descriptions found
    return resultSet

Run beamsearch below

In [0]:

beamsearch(dataset=data, 
           target_columns=['Converted', 'Combination Id'], 
           result_size=10, 
           beam_width=20, 
           beam_depth=4, 
           nr_bins=20, 
           evaluation_method=Evaluation.Yules_Q, 
           constraints={})

In [17]:

weights = dict()
weights['PS'] = 1.5
weights['Hit Time'] = 2
weights['Country'] = 0.5

beamsearch(dataset=data[:50],
           target_columns=['Converted', 'Combination Id'],
           result_size=10,
           beam_width=10,
           beam_depth=4,
           nr_bins=10,
           evaluation_method=Evaluation.weighted_yules_q, 
           constraints = {},
           weights = weights)

Candidates still in queue this level: 1

/usr/local/lib/python3.6/dist-packages/ipykernel/__main__.py:25: RuntimeWarning: invalid value encountered in long_scalars

End of level 1
Candidates still in queue this level: 10
Candidates still in queue this level: 9
Candidates still in queue this level: 8
Candidates still in queue this level: 7
Candidates still in queue this level: 6
Candidates still in queue this level: 5
Candidates still in queue this level: 4
Candidates still in queue this level: 3
Candidates still in queue this level: 2
Candidates still in queue this level: 1
End of level 2
Candidates still in queue this level: 10
Candidates still in queue this level: 9
Candidates still in queue this level: 8
Candidates still in queue this level: 7
Candidates still in queue this level: 6
Candidates still in queue this level: 5
Candidates still in queue this level: 4
Candidates still in queue this level: 3
Candidates still in queue this level: 2
Candidates still in queue this level: 1
End of level 3
Candidates still in queue this level: 10
Candidates still in queue this level: 9
Candidates still in queue this level: 8
Candidates still in queue this level: 7
Candidates still in queue this level: 6
Candidates still in queue this level: 5
Candidates still in queue this level: 4
Candidates still in queue this level: 3
Candidates still in queue this level: 2
Candidates still in queue this level: 1
End of level 4

[(nan, {'goed_Browser': 'Chrome Mobile'}), (nan, {'fout_Browser': 'Chrome Mobile'}), (nan, {'goed_Browser': 'Chrome'}), (nan, {'fout_Browser': 'Chrome'}), (nan, {'goed_Browser': 'Vivaldi'}), (nan, {'fout_Browser': 'Vivaldi'}), (nan, {'goed_Browser': 'Chrome Mobile iOS'}), (nan, {'fout_Browser': 'Chrome Mobile iOS'}), (nan, {'goed_Browser': 'Mobile Safari'}), (nan, {'fout_Browser': 'Mobile Safari'})]

In [0]:

In [0]:

In [0]: